diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19313 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 11608, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017229496898690558, + "grad_norm": 19.902466664856263, + "learning_rate": 4.306632213608957e-11, + "logits/chosen": -3.5275135040283203, + "logits/rejected": -3.49973726272583, + "logps/chosen": -1.2767510414123535, + "logps/rejected": -1.5448579788208008, + "loss": 1.4992, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.553502082824707, + "rewards/margins": 0.5362138748168945, + "rewards/rejected": -3.0897159576416016, + "step": 1 + }, + { + "epoch": 0.0017229496898690559, + "grad_norm": 24.28980899079471, + "learning_rate": 4.306632213608958e-10, + "logits/chosen": -3.5642738342285156, + "logits/rejected": -3.5475456714630127, + "logps/chosen": -1.3870140314102173, + "logps/rejected": -1.520027995109558, + "loss": 1.6805, + "rewards/accuracies": 0.6041666865348816, + "rewards/chosen": -2.7740280628204346, + "rewards/margins": 0.2660275399684906, + "rewards/rejected": -3.040055990219116, + "step": 10 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 19.464775985259337, + "learning_rate": 8.613264427217916e-10, + "logits/chosen": -3.6084282398223877, + "logits/rejected": -3.6022846698760986, + "logps/chosen": -1.4675112962722778, + "logps/rejected": -1.5436460971832275, + "loss": 1.8022, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.9350225925445557, + "rewards/margins": 0.15227018296718597, + "rewards/rejected": -3.087292194366455, + "step": 20 + }, + { + "epoch": 0.005168849069607168, + "grad_norm": 21.86938726909113, + "learning_rate": 1.2919896640826872e-09, + "logits/chosen": -3.5949459075927734, + "logits/rejected": -3.580786943435669, + "logps/chosen": -1.481797456741333, + "logps/rejected": -1.6944835186004639, + "loss": 1.5957, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.963594913482666, + "rewards/margins": 0.4253724217414856, + "rewards/rejected": -3.3889670372009277, + "step": 30 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 21.507861074322218, + "learning_rate": 1.7226528854435832e-09, + "logits/chosen": -3.5910987854003906, + "logits/rejected": -3.5736076831817627, + "logps/chosen": -1.4902746677398682, + "logps/rejected": -1.5528455972671509, + "loss": 1.8058, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.9805493354797363, + "rewards/margins": 0.125141903758049, + "rewards/rejected": -3.1056911945343018, + "step": 40 + }, + { + "epoch": 0.00861474844934528, + "grad_norm": 23.369618250964074, + "learning_rate": 2.153316106804479e-09, + "logits/chosen": -3.5898914337158203, + "logits/rejected": -3.57939076423645, + "logps/chosen": -1.4452978372573853, + "logps/rejected": -1.5600334405899048, + "loss": 1.7424, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.8905956745147705, + "rewards/margins": 0.22947096824645996, + "rewards/rejected": -3.1200668811798096, + "step": 50 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 19.515795496869245, + "learning_rate": 2.5839793281653743e-09, + "logits/chosen": -3.6230597496032715, + "logits/rejected": -3.6052374839782715, + "logps/chosen": -1.4597759246826172, + "logps/rejected": -1.5308177471160889, + "loss": 1.7742, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.9195518493652344, + "rewards/margins": 0.14208386838436127, + "rewards/rejected": -3.0616354942321777, + "step": 60 + }, + { + "epoch": 0.012060647829083391, + "grad_norm": 22.947892307409642, + "learning_rate": 3.01464254952627e-09, + "logits/chosen": -3.5613467693328857, + "logits/rejected": -3.5486927032470703, + "logps/chosen": -1.4373958110809326, + "logps/rejected": -1.5736496448516846, + "loss": 1.6834, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.8747916221618652, + "rewards/margins": 0.27250775694847107, + "rewards/rejected": -3.147299289703369, + "step": 70 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 18.514944136973586, + "learning_rate": 3.4453057708871665e-09, + "logits/chosen": -3.638662815093994, + "logits/rejected": -3.6183383464813232, + "logps/chosen": -1.4784338474273682, + "logps/rejected": -1.5225541591644287, + "loss": 1.8109, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.9568676948547363, + "rewards/margins": 0.08824063837528229, + "rewards/rejected": -3.0451083183288574, + "step": 80 + }, + { + "epoch": 0.015506547208821502, + "grad_norm": 22.266223550902822, + "learning_rate": 3.8759689922480615e-09, + "logits/chosen": -3.5194664001464844, + "logits/rejected": -3.5089030265808105, + "logps/chosen": -1.4499566555023193, + "logps/rejected": -1.566184401512146, + "loss": 1.7439, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.8999133110046387, + "rewards/margins": 0.23245570063591003, + "rewards/rejected": -3.132368803024292, + "step": 90 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 24.674037714283227, + "learning_rate": 4.306632213608958e-09, + "logits/chosen": -3.641566514968872, + "logits/rejected": -3.6062450408935547, + "logps/chosen": -1.4260755777359009, + "logps/rejected": -1.6064560413360596, + "loss": 1.6606, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8521511554718018, + "rewards/margins": 0.3607611656188965, + "rewards/rejected": -3.212912082672119, + "step": 100 + }, + { + "epoch": 0.01722949689869056, + "eval_logits/chosen": -3.638718843460083, + "eval_logits/rejected": -3.634615421295166, + "eval_logps/chosen": -1.4809681177139282, + "eval_logps/rejected": -1.6082144975662231, + "eval_loss": 1.6732447147369385, + "eval_rewards/accuracies": 0.5959572196006775, + "eval_rewards/chosen": -2.9619362354278564, + "eval_rewards/margins": 0.2544928193092346, + "eval_rewards/rejected": -3.2164289951324463, + "eval_runtime": 156.3898, + "eval_samples_per_second": 27.521, + "eval_steps_per_second": 3.44, + "step": 100 + }, + { + "epoch": 0.018952446588559616, + "grad_norm": 22.2275856632053, + "learning_rate": 4.737295434969853e-09, + "logits/chosen": -3.610905408859253, + "logits/rejected": -3.5971286296844482, + "logps/chosen": -1.4648408889770508, + "logps/rejected": -1.5376741886138916, + "loss": 1.7657, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9296817779541016, + "rewards/margins": 0.14566686749458313, + "rewards/rejected": -3.075348377227783, + "step": 110 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 19.934905722189338, + "learning_rate": 5.167958656330749e-09, + "logits/chosen": -3.5760700702667236, + "logits/rejected": -3.5672030448913574, + "logps/chosen": -1.4174858331680298, + "logps/rejected": -1.6266546249389648, + "loss": 1.5974, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.8349716663360596, + "rewards/margins": 0.418337345123291, + "rewards/rejected": -3.2533092498779297, + "step": 120 + }, + { + "epoch": 0.022398345968297727, + "grad_norm": 17.79258705952585, + "learning_rate": 5.598621877691645e-09, + "logits/chosen": -3.5977466106414795, + "logits/rejected": -3.5887436866760254, + "logps/chosen": -1.4579859972000122, + "logps/rejected": -1.5258009433746338, + "loss": 1.7748, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.9159719944000244, + "rewards/margins": 0.13562998175621033, + "rewards/rejected": -3.0516018867492676, + "step": 130 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 24.15219126577953, + "learning_rate": 6.02928509905254e-09, + "logits/chosen": -3.6053478717803955, + "logits/rejected": -3.592648983001709, + "logps/chosen": -1.4699456691741943, + "logps/rejected": -1.6089420318603516, + "loss": 1.6962, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.9398913383483887, + "rewards/margins": 0.27799224853515625, + "rewards/rejected": -3.217884063720703, + "step": 140 + }, + { + "epoch": 0.025844245348035838, + "grad_norm": 18.76375994673908, + "learning_rate": 6.459948320413436e-09, + "logits/chosen": -3.541599988937378, + "logits/rejected": -3.5312843322753906, + "logps/chosen": -1.4623388051986694, + "logps/rejected": -1.5741360187530518, + "loss": 1.7066, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.924677610397339, + "rewards/margins": 0.22359471023082733, + "rewards/rejected": -3.1482720375061035, + "step": 150 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 19.80323117723848, + "learning_rate": 6.890611541774333e-09, + "logits/chosen": -3.566392183303833, + "logits/rejected": -3.5544562339782715, + "logps/chosen": -1.4512722492218018, + "logps/rejected": -1.5856128931045532, + "loss": 1.6742, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9025444984436035, + "rewards/margins": 0.26868146657943726, + "rewards/rejected": -3.1712257862091064, + "step": 160 + }, + { + "epoch": 0.02929014472777395, + "grad_norm": 21.32288296686614, + "learning_rate": 7.321274763135228e-09, + "logits/chosen": -3.582791566848755, + "logits/rejected": -3.5718207359313965, + "logps/chosen": -1.4859055280685425, + "logps/rejected": -1.518599271774292, + "loss": 1.8526, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.971811056137085, + "rewards/margins": 0.06538741290569305, + "rewards/rejected": -3.037198543548584, + "step": 170 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 25.430375469558072, + "learning_rate": 7.751937984496123e-09, + "logits/chosen": -3.5702807903289795, + "logits/rejected": -3.5591273307800293, + "logps/chosen": -1.6040918827056885, + "logps/rejected": -1.749718427658081, + "loss": 1.7854, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.208183765411377, + "rewards/margins": 0.2912536859512329, + "rewards/rejected": -3.499436855316162, + "step": 180 + }, + { + "epoch": 0.03273604410751206, + "grad_norm": 21.43011983339046, + "learning_rate": 8.18260120585702e-09, + "logits/chosen": -3.605297803878784, + "logits/rejected": -3.5790534019470215, + "logps/chosen": -1.530076026916504, + "logps/rejected": -1.6488306522369385, + "loss": 1.7395, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.060152053833008, + "rewards/margins": 0.23750880360603333, + "rewards/rejected": -3.297661304473877, + "step": 190 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 18.14354253183522, + "learning_rate": 8.613264427217916e-09, + "logits/chosen": -3.573099136352539, + "logits/rejected": -3.563877820968628, + "logps/chosen": -1.5070440769195557, + "logps/rejected": -1.663789987564087, + "loss": 1.6959, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.0140881538391113, + "rewards/margins": 0.3134918212890625, + "rewards/rejected": -3.327579975128174, + "step": 200 + }, + { + "epoch": 0.03445899379738112, + "eval_logits/chosen": -3.6416237354278564, + "eval_logits/rejected": -3.637542963027954, + "eval_logps/chosen": -1.480735182762146, + "eval_logps/rejected": -1.6081981658935547, + "eval_loss": 1.6729189157485962, + "eval_rewards/accuracies": 0.5959572196006775, + "eval_rewards/chosen": -2.961470365524292, + "eval_rewards/margins": 0.2549257278442383, + "eval_rewards/rejected": -3.2163963317871094, + "eval_runtime": 156.089, + "eval_samples_per_second": 27.574, + "eval_steps_per_second": 3.447, + "step": 200 + }, + { + "epoch": 0.03618194348725017, + "grad_norm": 19.695930347722154, + "learning_rate": 9.043927648578812e-09, + "logits/chosen": -3.546250820159912, + "logits/rejected": -3.5405032634735107, + "logps/chosen": -1.4897280931472778, + "logps/rejected": -1.6909301280975342, + "loss": 1.5894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.9794561862945557, + "rewards/margins": 0.40240415930747986, + "rewards/rejected": -3.3818602561950684, + "step": 210 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 22.256011485080002, + "learning_rate": 9.474590869939706e-09, + "logits/chosen": -3.5668411254882812, + "logits/rejected": -3.5473320484161377, + "logps/chosen": -1.414710283279419, + "logps/rejected": -1.553809404373169, + "loss": 1.6774, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.829420566558838, + "rewards/margins": 0.27819839119911194, + "rewards/rejected": -3.107618808746338, + "step": 220 + }, + { + "epoch": 0.03962784286698828, + "grad_norm": 17.700685519776975, + "learning_rate": 9.905254091300603e-09, + "logits/chosen": -3.571800708770752, + "logits/rejected": -3.5605690479278564, + "logps/chosen": -1.402372121810913, + "logps/rejected": -1.5592352151870728, + "loss": 1.6623, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.804744243621826, + "rewards/margins": 0.3137262463569641, + "rewards/rejected": -3.1184704303741455, + "step": 230 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 16.452428439777606, + "learning_rate": 1.0335917312661497e-08, + "logits/chosen": -3.560429811477661, + "logits/rejected": -3.535571336746216, + "logps/chosen": -1.4708964824676514, + "logps/rejected": -1.5993845462799072, + "loss": 1.7096, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.9417929649353027, + "rewards/margins": 0.25697603821754456, + "rewards/rejected": -3.1987690925598145, + "step": 240 + }, + { + "epoch": 0.043073742246726394, + "grad_norm": 22.100161080743394, + "learning_rate": 1.0766580534022395e-08, + "logits/chosen": -3.5984153747558594, + "logits/rejected": -3.5881507396698, + "logps/chosen": -1.4451910257339478, + "logps/rejected": -1.5042827129364014, + "loss": 1.7797, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.8903820514678955, + "rewards/margins": 0.11818329244852066, + "rewards/rejected": -3.0085654258728027, + "step": 250 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 20.901630508535277, + "learning_rate": 1.119724375538329e-08, + "logits/chosen": -3.5904266834259033, + "logits/rejected": -3.5830464363098145, + "logps/chosen": -1.472085952758789, + "logps/rejected": -1.6018329858779907, + "loss": 1.6851, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.944171905517578, + "rewards/margins": 0.25949400663375854, + "rewards/rejected": -3.2036659717559814, + "step": 260 + }, + { + "epoch": 0.046519641626464506, + "grad_norm": 23.52425166518535, + "learning_rate": 1.1627906976744186e-08, + "logits/chosen": -3.550851345062256, + "logits/rejected": -3.539297580718994, + "logps/chosen": -1.5365689992904663, + "logps/rejected": -1.6548383235931396, + "loss": 1.7157, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.0731379985809326, + "rewards/margins": 0.23653873801231384, + "rewards/rejected": -3.3096766471862793, + "step": 270 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 21.11575038456059, + "learning_rate": 1.205857019810508e-08, + "logits/chosen": -3.6116580963134766, + "logits/rejected": -3.589844226837158, + "logps/chosen": -1.47896409034729, + "logps/rejected": -1.5783021450042725, + "loss": 1.7607, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.95792818069458, + "rewards/margins": 0.19867625832557678, + "rewards/rejected": -3.156604290008545, + "step": 280 + }, + { + "epoch": 0.04996554100620262, + "grad_norm": 18.80108999704411, + "learning_rate": 1.2489233419465977e-08, + "logits/chosen": -3.5587875843048096, + "logits/rejected": -3.5508079528808594, + "logps/chosen": -1.4925225973129272, + "logps/rejected": -1.5811045169830322, + "loss": 1.7511, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.9850451946258545, + "rewards/margins": 0.17716369032859802, + "rewards/rejected": -3.1622090339660645, + "step": 290 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 19.179389185274957, + "learning_rate": 1.2919896640826872e-08, + "logits/chosen": -3.5400185585021973, + "logits/rejected": -3.532416820526123, + "logps/chosen": -1.4048042297363281, + "logps/rejected": -1.594025731086731, + "loss": 1.6134, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8096084594726562, + "rewards/margins": 0.3784428536891937, + "rewards/rejected": -3.188051462173462, + "step": 300 + }, + { + "epoch": 0.051688490696071676, + "eval_logits/chosen": -3.6530067920684814, + "eval_logits/rejected": -3.648977518081665, + "eval_logps/chosen": -1.4808309078216553, + "eval_logps/rejected": -1.6080501079559326, + "eval_loss": 1.673154592514038, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -2.9616618156433105, + "eval_rewards/margins": 0.25443872809410095, + "eval_rewards/rejected": -3.2161002159118652, + "eval_runtime": 156.3689, + "eval_samples_per_second": 27.525, + "eval_steps_per_second": 3.441, + "step": 300 + }, + { + "epoch": 0.05341144038594073, + "grad_norm": 26.641819894422046, + "learning_rate": 1.3350559862187768e-08, + "logits/chosen": -3.5809414386749268, + "logits/rejected": -3.5763046741485596, + "logps/chosen": -1.5297424793243408, + "logps/rejected": -1.599830150604248, + "loss": 1.7908, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.0594849586486816, + "rewards/margins": 0.14017537236213684, + "rewards/rejected": -3.199660301208496, + "step": 310 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 37.21186764927901, + "learning_rate": 1.3781223083548666e-08, + "logits/chosen": -3.5418643951416016, + "logits/rejected": -3.5262856483459473, + "logps/chosen": -1.475815773010254, + "logps/rejected": -1.4967548847198486, + "loss": 1.8551, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.951631546020508, + "rewards/margins": 0.04187833145260811, + "rewards/rejected": -2.9935097694396973, + "step": 320 + }, + { + "epoch": 0.05685733976567884, + "grad_norm": 21.47345325457244, + "learning_rate": 1.4211886304909559e-08, + "logits/chosen": -3.5780186653137207, + "logits/rejected": -3.564645767211914, + "logps/chosen": -1.4802000522613525, + "logps/rejected": -1.549534559249878, + "loss": 1.791, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.960400104522705, + "rewards/margins": 0.13866901397705078, + "rewards/rejected": -3.099069118499756, + "step": 330 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 20.28472644506346, + "learning_rate": 1.4642549526270457e-08, + "logits/chosen": -3.5315933227539062, + "logits/rejected": -3.5183043479919434, + "logps/chosen": -1.4519234895706177, + "logps/rejected": -1.5170637369155884, + "loss": 1.7867, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.9038469791412354, + "rewards/margins": 0.13028070330619812, + "rewards/rejected": -3.0341274738311768, + "step": 340 + }, + { + "epoch": 0.06030323914541695, + "grad_norm": 20.02165037054324, + "learning_rate": 1.507321274763135e-08, + "logits/chosen": -3.5075950622558594, + "logits/rejected": -3.4847922325134277, + "logps/chosen": -1.5001376867294312, + "logps/rejected": -1.67621648311615, + "loss": 1.6726, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0002753734588623, + "rewards/margins": 0.3521580696105957, + "rewards/rejected": -3.3524329662323, + "step": 350 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 22.084795534704817, + "learning_rate": 1.5503875968992246e-08, + "logits/chosen": -3.61497163772583, + "logits/rejected": -3.6020846366882324, + "logps/chosen": -1.4818767309188843, + "logps/rejected": -1.5625332593917847, + "loss": 1.7552, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.9637534618377686, + "rewards/margins": 0.1613130271434784, + "rewards/rejected": -3.1250665187835693, + "step": 360 + }, + { + "epoch": 0.06374913852515507, + "grad_norm": 21.74901962519419, + "learning_rate": 1.5934539190353144e-08, + "logits/chosen": -3.596970796585083, + "logits/rejected": -3.5805602073669434, + "logps/chosen": -1.4199258089065552, + "logps/rejected": -1.5549100637435913, + "loss": 1.6796, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.8398516178131104, + "rewards/margins": 0.2699684500694275, + "rewards/rejected": -3.1098201274871826, + "step": 370 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 18.241660602535585, + "learning_rate": 1.636520241171404e-08, + "logits/chosen": -3.6651523113250732, + "logits/rejected": -3.6509056091308594, + "logps/chosen": -1.455418348312378, + "logps/rejected": -1.566058874130249, + "loss": 1.7219, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.910836696624756, + "rewards/margins": 0.22128085792064667, + "rewards/rejected": -3.132117748260498, + "step": 380 + }, + { + "epoch": 0.06719503790489317, + "grad_norm": 20.219729675289923, + "learning_rate": 1.6795865633074936e-08, + "logits/chosen": -3.602299928665161, + "logits/rejected": -3.587104082107544, + "logps/chosen": -1.461916208267212, + "logps/rejected": -1.6185343265533447, + "loss": 1.6465, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.923832416534424, + "rewards/margins": 0.3132365345954895, + "rewards/rejected": -3.2370686531066895, + "step": 390 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 24.030958902850795, + "learning_rate": 1.722652885443583e-08, + "logits/chosen": -3.569058656692505, + "logits/rejected": -3.558835983276367, + "logps/chosen": -1.4776469469070435, + "logps/rejected": -1.6162402629852295, + "loss": 1.6596, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.955293893814087, + "rewards/margins": 0.2771868407726288, + "rewards/rejected": -3.232480525970459, + "step": 400 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.639417886734009, + "eval_logits/rejected": -3.6353211402893066, + "eval_logps/chosen": -1.481080412864685, + "eval_logps/rejected": -1.6083613634109497, + "eval_loss": 1.6733450889587402, + "eval_rewards/accuracies": 0.5945631861686707, + "eval_rewards/chosen": -2.96216082572937, + "eval_rewards/margins": 0.2545616924762726, + "eval_rewards/rejected": -3.2167227268218994, + "eval_runtime": 156.4715, + "eval_samples_per_second": 27.507, + "eval_steps_per_second": 3.438, + "step": 400 + }, + { + "epoch": 0.07064093728463129, + "grad_norm": 19.819531117974044, + "learning_rate": 1.7657192075796726e-08, + "logits/chosen": -3.5810413360595703, + "logits/rejected": -3.578838348388672, + "logps/chosen": -1.4071383476257324, + "logps/rejected": -1.5566108226776123, + "loss": 1.6668, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.814276695251465, + "rewards/margins": 0.2989448010921478, + "rewards/rejected": -3.1132216453552246, + "step": 410 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 18.724521038786648, + "learning_rate": 1.8087855297157624e-08, + "logits/chosen": -3.5790200233459473, + "logits/rejected": -3.574338436126709, + "logps/chosen": -1.4752023220062256, + "logps/rejected": -1.5333774089813232, + "loss": 1.801, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.950404644012451, + "rewards/margins": 0.11635031551122665, + "rewards/rejected": -3.0667548179626465, + "step": 420 + }, + { + "epoch": 0.0740868366643694, + "grad_norm": 21.919992371036447, + "learning_rate": 1.8518518518518518e-08, + "logits/chosen": -3.5955398082733154, + "logits/rejected": -3.586885452270508, + "logps/chosen": -1.4136337041854858, + "logps/rejected": -1.556467890739441, + "loss": 1.6773, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.8272674083709717, + "rewards/margins": 0.28566834330558777, + "rewards/rejected": -3.112935781478882, + "step": 430 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 17.289166112872724, + "learning_rate": 1.8949181739879413e-08, + "logits/chosen": -3.6099190711975098, + "logits/rejected": -3.588343381881714, + "logps/chosen": -1.426127314567566, + "logps/rejected": -1.6452903747558594, + "loss": 1.5766, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.852254629135132, + "rewards/margins": 0.4383259415626526, + "rewards/rejected": -3.2905807495117188, + "step": 440 + }, + { + "epoch": 0.07753273604410751, + "grad_norm": 19.49798453661815, + "learning_rate": 1.937984496124031e-08, + "logits/chosen": -3.561777114868164, + "logits/rejected": -3.5422873497009277, + "logps/chosen": -1.4547079801559448, + "logps/rejected": -1.6311235427856445, + "loss": 1.6482, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9094159603118896, + "rewards/margins": 0.3528306484222412, + "rewards/rejected": -3.262247085571289, + "step": 450 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 19.5184266082414, + "learning_rate": 1.9810508182601205e-08, + "logits/chosen": -3.553361177444458, + "logits/rejected": -3.541141986846924, + "logps/chosen": -1.4897562265396118, + "logps/rejected": -1.5273168087005615, + "loss": 1.871, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.9795124530792236, + "rewards/margins": 0.07512147724628448, + "rewards/rejected": -3.054633617401123, + "step": 460 + }, + { + "epoch": 0.08097863542384562, + "grad_norm": 23.235822242220173, + "learning_rate": 2.02411714039621e-08, + "logits/chosen": -3.5564093589782715, + "logits/rejected": -3.5369067192077637, + "logps/chosen": -1.4298266172409058, + "logps/rejected": -1.5652967691421509, + "loss": 1.6604, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.8596532344818115, + "rewards/margins": 0.27094024419784546, + "rewards/rejected": -3.1305935382843018, + "step": 470 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 21.919629105851154, + "learning_rate": 2.0671834625322995e-08, + "logits/chosen": -3.551980495452881, + "logits/rejected": -3.5509140491485596, + "logps/chosen": -1.4457902908325195, + "logps/rejected": -1.631638765335083, + "loss": 1.6451, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.891580581665039, + "rewards/margins": 0.3716966509819031, + "rewards/rejected": -3.263277530670166, + "step": 480 + }, + { + "epoch": 0.08442453480358374, + "grad_norm": 21.599435847062058, + "learning_rate": 2.1102497846683892e-08, + "logits/chosen": -3.498958110809326, + "logits/rejected": -3.4680278301239014, + "logps/chosen": -1.5218480825424194, + "logps/rejected": -1.6237432956695557, + "loss": 1.747, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.043696165084839, + "rewards/margins": 0.20378997921943665, + "rewards/rejected": -3.2474865913391113, + "step": 490 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 18.89656891058801, + "learning_rate": 2.153316106804479e-08, + "logits/chosen": -3.543241500854492, + "logits/rejected": -3.524479627609253, + "logps/chosen": -1.4444148540496826, + "logps/rejected": -1.5426428318023682, + "loss": 1.7383, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.8888297080993652, + "rewards/margins": 0.19645579159259796, + "rewards/rejected": -3.0852856636047363, + "step": 500 + }, + { + "epoch": 0.08614748449345279, + "eval_logits/chosen": -3.645364999771118, + "eval_logits/rejected": -3.641301155090332, + "eval_logps/chosen": -1.4811058044433594, + "eval_logps/rejected": -1.60848867893219, + "eval_loss": 1.673081398010254, + "eval_rewards/accuracies": 0.5966542959213257, + "eval_rewards/chosen": -2.9622116088867188, + "eval_rewards/margins": 0.2547660171985626, + "eval_rewards/rejected": -3.21697735786438, + "eval_runtime": 156.5165, + "eval_samples_per_second": 27.499, + "eval_steps_per_second": 3.437, + "step": 500 + }, + { + "epoch": 0.08787043418332184, + "grad_norm": 19.6402287410714, + "learning_rate": 2.1963824289405682e-08, + "logits/chosen": -3.5477848052978516, + "logits/rejected": -3.5334854125976562, + "logps/chosen": -1.4841115474700928, + "logps/rejected": -1.5018410682678223, + "loss": 1.8526, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.9682230949401855, + "rewards/margins": 0.0354592502117157, + "rewards/rejected": -3.0036821365356445, + "step": 510 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 18.637066407974324, + "learning_rate": 2.239448751076658e-08, + "logits/chosen": -3.573958158493042, + "logits/rejected": -3.5582401752471924, + "logps/chosen": -1.5139377117156982, + "logps/rejected": -1.5434337854385376, + "loss": 1.8592, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -3.0278754234313965, + "rewards/margins": 0.058991968631744385, + "rewards/rejected": -3.086867570877075, + "step": 520 + }, + { + "epoch": 0.09131633356305996, + "grad_norm": 21.351471815598437, + "learning_rate": 2.2825150732127478e-08, + "logits/chosen": -3.564196825027466, + "logits/rejected": -3.5399632453918457, + "logps/chosen": -1.4041894674301147, + "logps/rejected": -1.5421769618988037, + "loss": 1.7015, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.8083789348602295, + "rewards/margins": 0.27597492933273315, + "rewards/rejected": -3.0843539237976074, + "step": 530 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 20.846167042560563, + "learning_rate": 2.3255813953488372e-08, + "logits/chosen": -3.5540452003479004, + "logits/rejected": -3.545433521270752, + "logps/chosen": -1.484818696975708, + "logps/rejected": -1.6298444271087646, + "loss": 1.6762, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.969637393951416, + "rewards/margins": 0.2900512218475342, + "rewards/rejected": -3.2596888542175293, + "step": 540 + }, + { + "epoch": 0.09476223294279806, + "grad_norm": 20.651081120613064, + "learning_rate": 2.3686477174849267e-08, + "logits/chosen": -3.589341640472412, + "logits/rejected": -3.579744815826416, + "logps/chosen": -1.5013021230697632, + "logps/rejected": -1.5594637393951416, + "loss": 1.8213, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.0026042461395264, + "rewards/margins": 0.11632323265075684, + "rewards/rejected": -3.118927478790283, + "step": 550 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 20.277426880735014, + "learning_rate": 2.411714039621016e-08, + "logits/chosen": -3.580414295196533, + "logits/rejected": -3.5781867504119873, + "logps/chosen": -1.4798322916030884, + "logps/rejected": -1.6554832458496094, + "loss": 1.6673, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.9596645832061768, + "rewards/margins": 0.35130202770233154, + "rewards/rejected": -3.3109664916992188, + "step": 560 + }, + { + "epoch": 0.09820813232253618, + "grad_norm": 22.197250461757175, + "learning_rate": 2.454780361757106e-08, + "logits/chosen": -3.5670745372772217, + "logits/rejected": -3.5619444847106934, + "logps/chosen": -1.4677410125732422, + "logps/rejected": -1.6027021408081055, + "loss": 1.6647, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.9354820251464844, + "rewards/margins": 0.2699225842952728, + "rewards/rejected": -3.205404281616211, + "step": 570 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 19.835432127541818, + "learning_rate": 2.4978466838931954e-08, + "logits/chosen": -3.564160108566284, + "logits/rejected": -3.561082124710083, + "logps/chosen": -1.408730387687683, + "logps/rejected": -1.5311354398727417, + "loss": 1.6892, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.817460775375366, + "rewards/margins": 0.2448098212480545, + "rewards/rejected": -3.0622708797454834, + "step": 580 + }, + { + "epoch": 0.1016540317022743, + "grad_norm": 26.244999845711387, + "learning_rate": 2.540913006029285e-08, + "logits/chosen": -3.571488618850708, + "logits/rejected": -3.5578505992889404, + "logps/chosen": -1.4785385131835938, + "logps/rejected": -1.5544307231903076, + "loss": 1.7912, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.9570770263671875, + "rewards/margins": 0.15178418159484863, + "rewards/rejected": -3.1088614463806152, + "step": 590 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 17.125293824468617, + "learning_rate": 2.5839793281653743e-08, + "logits/chosen": -3.5470309257507324, + "logits/rejected": -3.5336461067199707, + "logps/chosen": -1.398126244544983, + "logps/rejected": -1.6320680379867554, + "loss": 1.5371, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.796252489089966, + "rewards/margins": 0.4678835868835449, + "rewards/rejected": -3.2641360759735107, + "step": 600 + }, + { + "epoch": 0.10337698139214335, + "eval_logits/chosen": -3.6296470165252686, + "eval_logits/rejected": -3.625506639480591, + "eval_logps/chosen": -1.480892300605774, + "eval_logps/rejected": -1.6086041927337646, + "eval_loss": 1.6726182699203491, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -2.961784601211548, + "eval_rewards/margins": 0.2554238438606262, + "eval_rewards/rejected": -3.2172083854675293, + "eval_runtime": 156.2913, + "eval_samples_per_second": 27.538, + "eval_steps_per_second": 3.442, + "step": 600 + }, + { + "epoch": 0.1050999310820124, + "grad_norm": 18.58552928798443, + "learning_rate": 2.6270456503014644e-08, + "logits/chosen": -3.536872148513794, + "logits/rejected": -3.5338969230651855, + "logps/chosen": -1.4550403356552124, + "logps/rejected": -1.5466817617416382, + "loss": 1.7471, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.910080671310425, + "rewards/margins": 0.183282732963562, + "rewards/rejected": -3.0933635234832764, + "step": 610 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 21.394238931720228, + "learning_rate": 2.6701119724375536e-08, + "logits/chosen": -3.626967191696167, + "logits/rejected": -3.611609935760498, + "logps/chosen": -1.5008622407913208, + "logps/rejected": -1.594049096107483, + "loss": 1.7354, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.0017244815826416, + "rewards/margins": 0.1863737851381302, + "rewards/rejected": -3.188098192214966, + "step": 620 + }, + { + "epoch": 0.10854583046175052, + "grad_norm": 20.630734079299348, + "learning_rate": 2.713178294573643e-08, + "logits/chosen": -3.6170506477355957, + "logits/rejected": -3.6032090187072754, + "logps/chosen": -1.5035960674285889, + "logps/rejected": -1.5374523401260376, + "loss": 1.8164, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.0071921348571777, + "rewards/margins": 0.06771223992109299, + "rewards/rejected": -3.074904680252075, + "step": 630 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 20.13556218003904, + "learning_rate": 2.756244616709733e-08, + "logits/chosen": -3.605260133743286, + "logits/rejected": -3.5975863933563232, + "logps/chosen": -1.4863741397857666, + "logps/rejected": -1.567238450050354, + "loss": 1.7733, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.972748279571533, + "rewards/margins": 0.16172856092453003, + "rewards/rejected": -3.134476900100708, + "step": 640 + }, + { + "epoch": 0.11199172984148863, + "grad_norm": 18.811544326124523, + "learning_rate": 2.7993109388458226e-08, + "logits/chosen": -3.6120362281799316, + "logits/rejected": -3.6121413707733154, + "logps/chosen": -1.469596266746521, + "logps/rejected": -1.6095737218856812, + "loss": 1.683, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.939192533493042, + "rewards/margins": 0.2799549400806427, + "rewards/rejected": -3.2191474437713623, + "step": 650 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 20.28609298493491, + "learning_rate": 2.8423772609819118e-08, + "logits/chosen": -3.5339443683624268, + "logits/rejected": -3.5300650596618652, + "logps/chosen": -1.4714010953903198, + "logps/rejected": -1.564021348953247, + "loss": 1.8017, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.9428021907806396, + "rewards/margins": 0.1852409839630127, + "rewards/rejected": -3.128042697906494, + "step": 660 + }, + { + "epoch": 0.11543762922122675, + "grad_norm": 19.739021227854025, + "learning_rate": 2.885443583118002e-08, + "logits/chosen": -3.5562031269073486, + "logits/rejected": -3.5539703369140625, + "logps/chosen": -1.4507935047149658, + "logps/rejected": -1.5839375257492065, + "loss": 1.7222, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.9015870094299316, + "rewards/margins": 0.26628801226615906, + "rewards/rejected": -3.167875051498413, + "step": 670 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 26.12685469756143, + "learning_rate": 2.9285099052540913e-08, + "logits/chosen": -3.523918628692627, + "logits/rejected": -3.511462688446045, + "logps/chosen": -1.5035953521728516, + "logps/rejected": -1.5703895092010498, + "loss": 1.7879, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.007190704345703, + "rewards/margins": 0.13358855247497559, + "rewards/rejected": -3.1407790184020996, + "step": 680 + }, + { + "epoch": 0.11888352860096485, + "grad_norm": 21.362630359085554, + "learning_rate": 2.9715762273901808e-08, + "logits/chosen": -3.6143250465393066, + "logits/rejected": -3.597133159637451, + "logps/chosen": -1.5193963050842285, + "logps/rejected": -1.5184218883514404, + "loss": 1.8977, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -3.038792610168457, + "rewards/margins": -0.0019484668737277389, + "rewards/rejected": -3.036843776702881, + "step": 690 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 22.854103686680425, + "learning_rate": 3.01464254952627e-08, + "logits/chosen": -3.588505506515503, + "logits/rejected": -3.5724167823791504, + "logps/chosen": -1.4619343280792236, + "logps/rejected": -1.5499986410140991, + "loss": 1.8026, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.9238686561584473, + "rewards/margins": 0.17612841725349426, + "rewards/rejected": -3.0999972820281982, + "step": 700 + }, + { + "epoch": 0.1206064782908339, + "eval_logits/chosen": -3.6391210556030273, + "eval_logits/rejected": -3.63502836227417, + "eval_logps/chosen": -1.4810385704040527, + "eval_logps/rejected": -1.6085526943206787, + "eval_loss": 1.6728646755218506, + "eval_rewards/accuracies": 0.5950278639793396, + "eval_rewards/chosen": -2.9620771408081055, + "eval_rewards/margins": 0.25502845644950867, + "eval_rewards/rejected": -3.2171053886413574, + "eval_runtime": 156.8549, + "eval_samples_per_second": 27.439, + "eval_steps_per_second": 3.43, + "step": 700 + }, + { + "epoch": 0.12232942798070297, + "grad_norm": 18.656081057001593, + "learning_rate": 3.05770887166236e-08, + "logits/chosen": -3.570486545562744, + "logits/rejected": -3.552598237991333, + "logps/chosen": -1.4319484233856201, + "logps/rejected": -1.6559686660766602, + "loss": 1.5502, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8638968467712402, + "rewards/margins": 0.4480404853820801, + "rewards/rejected": -3.3119373321533203, + "step": 710 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 23.292631277652074, + "learning_rate": 3.100775193798449e-08, + "logits/chosen": -3.551525592803955, + "logits/rejected": -3.5492522716522217, + "logps/chosen": -1.5225058794021606, + "logps/rejected": -1.6068627834320068, + "loss": 1.7575, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0450117588043213, + "rewards/margins": 0.16871377825737, + "rewards/rejected": -3.2137255668640137, + "step": 720 + }, + { + "epoch": 0.12577532736044109, + "grad_norm": 21.92695943300785, + "learning_rate": 3.143841515934539e-08, + "logits/chosen": -3.6249396800994873, + "logits/rejected": -3.6110148429870605, + "logps/chosen": -1.4908926486968994, + "logps/rejected": -1.5358655452728271, + "loss": 1.8222, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.981785297393799, + "rewards/margins": 0.08994584530591965, + "rewards/rejected": -3.0717310905456543, + "step": 730 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 19.4199277266403, + "learning_rate": 3.186907838070629e-08, + "logits/chosen": -3.5469298362731934, + "logits/rejected": -3.5331854820251465, + "logps/chosen": -1.4376246929168701, + "logps/rejected": -1.5642764568328857, + "loss": 1.7134, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.8752493858337402, + "rewards/margins": 0.2533037066459656, + "rewards/rejected": -3.1285529136657715, + "step": 740 + }, + { + "epoch": 0.1292212267401792, + "grad_norm": 19.02417723474803, + "learning_rate": 3.229974160206718e-08, + "logits/chosen": -3.6596057415008545, + "logits/rejected": -3.6414332389831543, + "logps/chosen": -1.4254577159881592, + "logps/rejected": -1.5591310262680054, + "loss": 1.6597, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8509154319763184, + "rewards/margins": 0.2673465609550476, + "rewards/rejected": -3.1182620525360107, + "step": 750 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 20.62458694394565, + "learning_rate": 3.273040482342808e-08, + "logits/chosen": -3.5639166831970215, + "logits/rejected": -3.543168306350708, + "logps/chosen": -1.4346402883529663, + "logps/rejected": -1.5703129768371582, + "loss": 1.7082, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.8692805767059326, + "rewards/margins": 0.2713455557823181, + "rewards/rejected": -3.1406259536743164, + "step": 760 + }, + { + "epoch": 0.1326671261199173, + "grad_norm": 19.901519929181134, + "learning_rate": 3.3161068044788975e-08, + "logits/chosen": -3.5884757041931152, + "logits/rejected": -3.5742697715759277, + "logps/chosen": -1.4633798599243164, + "logps/rejected": -1.5973695516586304, + "loss": 1.6793, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.926759719848633, + "rewards/margins": 0.26797938346862793, + "rewards/rejected": -3.1947391033172607, + "step": 770 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 21.702157569363273, + "learning_rate": 3.359173126614987e-08, + "logits/chosen": -3.590942859649658, + "logits/rejected": -3.572681427001953, + "logps/chosen": -1.439408779144287, + "logps/rejected": -1.5788915157318115, + "loss": 1.665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.878817558288574, + "rewards/margins": 0.27896538376808167, + "rewards/rejected": -3.157783031463623, + "step": 780 + }, + { + "epoch": 0.1361130254996554, + "grad_norm": 20.23636842667887, + "learning_rate": 3.4022394487510764e-08, + "logits/chosen": -3.59818959236145, + "logits/rejected": -3.5803420543670654, + "logps/chosen": -1.4104173183441162, + "logps/rejected": -1.6195411682128906, + "loss": 1.5999, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.8208346366882324, + "rewards/margins": 0.4182472825050354, + "rewards/rejected": -3.2390823364257812, + "step": 790 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 22.13360956771661, + "learning_rate": 3.445305770887166e-08, + "logits/chosen": -3.525047779083252, + "logits/rejected": -3.5097203254699707, + "logps/chosen": -1.4574487209320068, + "logps/rejected": -1.639593482017517, + "loss": 1.6605, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.9148974418640137, + "rewards/margins": 0.3642897605895996, + "rewards/rejected": -3.279186964035034, + "step": 800 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.645733594894409, + "eval_logits/rejected": -3.6416783332824707, + "eval_logps/chosen": -1.4807242155075073, + "eval_logps/rejected": -1.608466386795044, + "eval_loss": 1.6727149486541748, + "eval_rewards/accuracies": 0.5947955250740051, + "eval_rewards/chosen": -2.9614484310150146, + "eval_rewards/margins": 0.2554841637611389, + "eval_rewards/rejected": -3.216932773590088, + "eval_runtime": 157.0677, + "eval_samples_per_second": 27.402, + "eval_steps_per_second": 3.425, + "step": 800 + }, + { + "epoch": 0.13955892487939353, + "grad_norm": 21.006315450532984, + "learning_rate": 3.488372093023256e-08, + "logits/chosen": -3.5812363624572754, + "logits/rejected": -3.5649590492248535, + "logps/chosen": -1.474457025527954, + "logps/rejected": -1.6316083669662476, + "loss": 1.6835, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.948914051055908, + "rewards/margins": 0.31430238485336304, + "rewards/rejected": -3.263216733932495, + "step": 810 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 18.079511237475508, + "learning_rate": 3.531438415159345e-08, + "logits/chosen": -3.6075165271759033, + "logits/rejected": -3.5917410850524902, + "logps/chosen": -1.4660320281982422, + "logps/rejected": -1.6277542114257812, + "loss": 1.7023, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.9320640563964844, + "rewards/margins": 0.3234441876411438, + "rewards/rejected": -3.2555084228515625, + "step": 820 + }, + { + "epoch": 0.14300482425913164, + "grad_norm": 21.4007452552807, + "learning_rate": 3.574504737295434e-08, + "logits/chosen": -3.5567336082458496, + "logits/rejected": -3.5496459007263184, + "logps/chosen": -1.5539747476577759, + "logps/rejected": -1.6063144207000732, + "loss": 1.8433, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.1079494953155518, + "rewards/margins": 0.10467959940433502, + "rewards/rejected": -3.2126288414001465, + "step": 830 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 23.22345461402823, + "learning_rate": 3.617571059431525e-08, + "logits/chosen": -3.626051664352417, + "logits/rejected": -3.6100640296936035, + "logps/chosen": -1.429190993309021, + "logps/rejected": -1.4924553632736206, + "loss": 1.7973, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.858381986618042, + "rewards/margins": 0.12652866542339325, + "rewards/rejected": -2.984910726547241, + "step": 840 + }, + { + "epoch": 0.14645072363886974, + "grad_norm": 25.045733873882515, + "learning_rate": 3.660637381567614e-08, + "logits/chosen": -3.539808750152588, + "logits/rejected": -3.5359091758728027, + "logps/chosen": -1.436225175857544, + "logps/rejected": -1.5675268173217773, + "loss": 1.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.872450351715088, + "rewards/margins": 0.2626029849052429, + "rewards/rejected": -3.1350536346435547, + "step": 850 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 25.220289397911028, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -3.5514674186706543, + "logits/rejected": -3.5397815704345703, + "logps/chosen": -1.5084354877471924, + "logps/rejected": -1.628011703491211, + "loss": 1.7265, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.0168709754943848, + "rewards/margins": 0.23915258049964905, + "rewards/rejected": -3.256023406982422, + "step": 860 + }, + { + "epoch": 0.14989662301860784, + "grad_norm": 18.916231292729453, + "learning_rate": 3.7467700258397934e-08, + "logits/chosen": -3.5999484062194824, + "logits/rejected": -3.5961127281188965, + "logps/chosen": -1.456427812576294, + "logps/rejected": -1.5721004009246826, + "loss": 1.7414, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.912855625152588, + "rewards/margins": 0.23134560883045197, + "rewards/rejected": -3.1442008018493652, + "step": 870 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 18.3387720490591, + "learning_rate": 3.7898363479758826e-08, + "logits/chosen": -3.550314426422119, + "logits/rejected": -3.537358522415161, + "logps/chosen": -1.4116275310516357, + "logps/rejected": -1.6233447790145874, + "loss": 1.6498, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8232550621032715, + "rewards/margins": 0.4234341084957123, + "rewards/rejected": -3.246689558029175, + "step": 880 + }, + { + "epoch": 0.15334252239834598, + "grad_norm": 22.17106908167341, + "learning_rate": 3.8329026701119724e-08, + "logits/chosen": -3.5667636394500732, + "logits/rejected": -3.5453014373779297, + "logps/chosen": -1.4615423679351807, + "logps/rejected": -1.6251497268676758, + "loss": 1.6636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9230847358703613, + "rewards/margins": 0.32721468806266785, + "rewards/rejected": -3.2502994537353516, + "step": 890 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 20.197536984868997, + "learning_rate": 3.875968992248062e-08, + "logits/chosen": -3.580812931060791, + "logits/rejected": -3.574019193649292, + "logps/chosen": -1.5141441822052002, + "logps/rejected": -1.5647560358047485, + "loss": 1.8262, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0282883644104004, + "rewards/margins": 0.10122356563806534, + "rewards/rejected": -3.129512071609497, + "step": 900 + }, + { + "epoch": 0.15506547208821503, + "eval_logits/chosen": -3.6524665355682373, + "eval_logits/rejected": -3.648446559906006, + "eval_logps/chosen": -1.4808032512664795, + "eval_logps/rejected": -1.608715534210205, + "eval_loss": 1.6722424030303955, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -2.961606502532959, + "eval_rewards/margins": 0.2558245062828064, + "eval_rewards/rejected": -3.21743106842041, + "eval_runtime": 156.5742, + "eval_samples_per_second": 27.489, + "eval_steps_per_second": 3.436, + "step": 900 + }, + { + "epoch": 0.15678842177808408, + "grad_norm": 17.593774749813324, + "learning_rate": 3.919035314384151e-08, + "logits/chosen": -3.5718040466308594, + "logits/rejected": -3.564917802810669, + "logps/chosen": -1.3974502086639404, + "logps/rejected": -1.5122140645980835, + "loss": 1.7322, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.794900417327881, + "rewards/margins": 0.22952762246131897, + "rewards/rejected": -3.024428129196167, + "step": 910 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 21.57888557517378, + "learning_rate": 3.962101636520241e-08, + "logits/chosen": -3.5921006202697754, + "logits/rejected": -3.5670273303985596, + "logps/chosen": -1.4628071784973145, + "logps/rejected": -1.583765983581543, + "loss": 1.7025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.925614356994629, + "rewards/margins": 0.24191761016845703, + "rewards/rejected": -3.167531967163086, + "step": 920 + }, + { + "epoch": 0.16023432115782218, + "grad_norm": 20.71464830674528, + "learning_rate": 4.005167958656331e-08, + "logits/chosen": -3.6280581951141357, + "logits/rejected": -3.6173291206359863, + "logps/chosen": -1.4552582502365112, + "logps/rejected": -1.6299874782562256, + "loss": 1.6432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9105165004730225, + "rewards/margins": 0.34945863485336304, + "rewards/rejected": -3.259974956512451, + "step": 930 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 17.758079417529395, + "learning_rate": 4.04823428079242e-08, + "logits/chosen": -3.616046905517578, + "logits/rejected": -3.59240984916687, + "logps/chosen": -1.504128336906433, + "logps/rejected": -1.616919755935669, + "loss": 1.7329, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.008256673812866, + "rewards/margins": 0.22558251023292542, + "rewards/rejected": -3.233839511871338, + "step": 940 + }, + { + "epoch": 0.16368022053756032, + "grad_norm": 19.511086089262573, + "learning_rate": 4.09130060292851e-08, + "logits/chosen": -3.4845454692840576, + "logits/rejected": -3.4763901233673096, + "logps/chosen": -1.4652578830718994, + "logps/rejected": -1.6250728368759155, + "loss": 1.6685, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.930515766143799, + "rewards/margins": 0.3196297287940979, + "rewards/rejected": -3.250145673751831, + "step": 950 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 25.859353337983073, + "learning_rate": 4.134366925064599e-08, + "logits/chosen": -3.458118438720703, + "logits/rejected": -3.4601783752441406, + "logps/chosen": -1.4490973949432373, + "logps/rejected": -1.5698236227035522, + "loss": 1.7277, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.8981947898864746, + "rewards/margins": 0.24145250022411346, + "rewards/rejected": -3.1396472454071045, + "step": 960 + }, + { + "epoch": 0.16712611991729842, + "grad_norm": 22.477604648216655, + "learning_rate": 4.177433247200689e-08, + "logits/chosen": -3.5714828968048096, + "logits/rejected": -3.5525429248809814, + "logps/chosen": -1.5524942874908447, + "logps/rejected": -1.5612355470657349, + "loss": 1.881, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.1049885749816895, + "rewards/margins": 0.017482534050941467, + "rewards/rejected": -3.1224710941314697, + "step": 970 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 23.961463301691463, + "learning_rate": 4.2204995693367785e-08, + "logits/chosen": -3.637877941131592, + "logits/rejected": -3.6282832622528076, + "logps/chosen": -1.4827979803085327, + "logps/rejected": -1.5704785585403442, + "loss": 1.7518, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.9655959606170654, + "rewards/margins": 0.1753612458705902, + "rewards/rejected": -3.1409571170806885, + "step": 980 + }, + { + "epoch": 0.17057201929703653, + "grad_norm": 21.277401788158823, + "learning_rate": 4.2635658914728676e-08, + "logits/chosen": -3.5512595176696777, + "logits/rejected": -3.5372185707092285, + "logps/chosen": -1.4302070140838623, + "logps/rejected": -1.558959722518921, + "loss": 1.7238, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.8604140281677246, + "rewards/margins": 0.2575052082538605, + "rewards/rejected": -3.117919445037842, + "step": 990 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 20.18282959941036, + "learning_rate": 4.306632213608958e-08, + "logits/chosen": -3.5313973426818848, + "logits/rejected": -3.5173580646514893, + "logps/chosen": -1.5072028636932373, + "logps/rejected": -1.5416905879974365, + "loss": 1.8332, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.0144057273864746, + "rewards/margins": 0.06897564977407455, + "rewards/rejected": -3.083381175994873, + "step": 1000 + }, + { + "epoch": 0.17229496898690558, + "eval_logits/chosen": -3.6392018795013428, + "eval_logits/rejected": -3.6351237297058105, + "eval_logps/chosen": -1.480643630027771, + "eval_logps/rejected": -1.6087019443511963, + "eval_loss": 1.6719063520431519, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -2.961287260055542, + "eval_rewards/margins": 0.2561165988445282, + "eval_rewards/rejected": -3.2174038887023926, + "eval_runtime": 156.9709, + "eval_samples_per_second": 27.419, + "eval_steps_per_second": 3.427, + "step": 1000 + }, + { + "epoch": 0.17401791867677463, + "grad_norm": 22.549235840927622, + "learning_rate": 4.349698535745047e-08, + "logits/chosen": -3.4812381267547607, + "logits/rejected": -3.488011121749878, + "logps/chosen": -1.535354733467102, + "logps/rejected": -1.600327491760254, + "loss": 1.7968, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.070709466934204, + "rewards/margins": 0.129945307970047, + "rewards/rejected": -3.200654983520508, + "step": 1010 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 17.334819212199488, + "learning_rate": 4.3927648578811363e-08, + "logits/chosen": -3.6039252281188965, + "logits/rejected": -3.5823960304260254, + "logps/chosen": -1.4869974851608276, + "logps/rejected": -1.6197044849395752, + "loss": 1.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9739949703216553, + "rewards/margins": 0.26541396975517273, + "rewards/rejected": -3.2394089698791504, + "step": 1020 + }, + { + "epoch": 0.17746381805651276, + "grad_norm": 21.46757631886516, + "learning_rate": 4.435831180017227e-08, + "logits/chosen": -3.625187397003174, + "logits/rejected": -3.6097915172576904, + "logps/chosen": -1.4331996440887451, + "logps/rejected": -1.5433218479156494, + "loss": 1.7376, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.8663992881774902, + "rewards/margins": 0.22024443745613098, + "rewards/rejected": -3.086643695831299, + "step": 1030 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 29.183155292522436, + "learning_rate": 4.478897502153316e-08, + "logits/chosen": -3.5739834308624268, + "logits/rejected": -3.567633867263794, + "logps/chosen": -1.5305505990982056, + "logps/rejected": -1.6371654272079468, + "loss": 1.7329, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.061101198196411, + "rewards/margins": 0.21322989463806152, + "rewards/rejected": -3.2743308544158936, + "step": 1040 + }, + { + "epoch": 0.18090971743625087, + "grad_norm": 23.277523827679712, + "learning_rate": 4.521963824289405e-08, + "logits/chosen": -3.5321097373962402, + "logits/rejected": -3.5123629570007324, + "logps/chosen": -1.4984990358352661, + "logps/rejected": -1.5224298238754272, + "loss": 1.8517, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.9969980716705322, + "rewards/margins": 0.04786177724599838, + "rewards/rejected": -3.0448596477508545, + "step": 1050 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 25.76960064142977, + "learning_rate": 4.5650301464254955e-08, + "logits/chosen": -3.533273220062256, + "logits/rejected": -3.5235283374786377, + "logps/chosen": -1.5117337703704834, + "logps/rejected": -1.5732696056365967, + "loss": 1.7763, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.023467540740967, + "rewards/margins": 0.12307200580835342, + "rewards/rejected": -3.1465392112731934, + "step": 1060 + }, + { + "epoch": 0.18435561681598897, + "grad_norm": 20.340426671580456, + "learning_rate": 4.6080964685615846e-08, + "logits/chosen": -3.629505157470703, + "logits/rejected": -3.612126111984253, + "logps/chosen": -1.5048744678497314, + "logps/rejected": -1.6760103702545166, + "loss": 1.7207, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.009748935699463, + "rewards/margins": 0.34227117896080017, + "rewards/rejected": -3.352020740509033, + "step": 1070 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 19.263393580508907, + "learning_rate": 4.6511627906976744e-08, + "logits/chosen": -3.6053104400634766, + "logits/rejected": -3.5923900604248047, + "logps/chosen": -1.5000163316726685, + "logps/rejected": -1.547035813331604, + "loss": 1.8491, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -3.000032663345337, + "rewards/margins": 0.09403891861438751, + "rewards/rejected": -3.094071626663208, + "step": 1080 + }, + { + "epoch": 0.18780151619572708, + "grad_norm": 21.641648491048254, + "learning_rate": 4.6942291128337636e-08, + "logits/chosen": -3.538170337677002, + "logits/rejected": -3.5322654247283936, + "logps/chosen": -1.4581468105316162, + "logps/rejected": -1.5835081338882446, + "loss": 1.719, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.9162936210632324, + "rewards/margins": 0.2507225573062897, + "rewards/rejected": -3.1670162677764893, + "step": 1090 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 21.756416553981836, + "learning_rate": 4.7372954349698534e-08, + "logits/chosen": -3.569575786590576, + "logits/rejected": -3.566394329071045, + "logps/chosen": -1.464989423751831, + "logps/rejected": -1.607704520225525, + "loss": 1.6755, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.929978847503662, + "rewards/margins": 0.2854299247264862, + "rewards/rejected": -3.21540904045105, + "step": 1100 + }, + { + "epoch": 0.18952446588559613, + "eval_logits/chosen": -3.631704568862915, + "eval_logits/rejected": -3.6275956630706787, + "eval_logps/chosen": -1.4806867837905884, + "eval_logps/rejected": -1.6084924936294556, + "eval_loss": 1.6723828315734863, + "eval_rewards/accuracies": 0.5943308472633362, + "eval_rewards/chosen": -2.9613735675811768, + "eval_rewards/margins": 0.2556114196777344, + "eval_rewards/rejected": -3.216984987258911, + "eval_runtime": 156.7128, + "eval_samples_per_second": 27.464, + "eval_steps_per_second": 3.433, + "step": 1100 + }, + { + "epoch": 0.1912474155754652, + "grad_norm": 26.322350849718706, + "learning_rate": 4.780361757105943e-08, + "logits/chosen": -3.571938991546631, + "logits/rejected": -3.5797340869903564, + "logps/chosen": -1.5180728435516357, + "logps/rejected": -1.6018564701080322, + "loss": 1.7767, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0361456871032715, + "rewards/margins": 0.16756734251976013, + "rewards/rejected": -3.2037129402160645, + "step": 1110 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 20.06204183779379, + "learning_rate": 4.823428079242032e-08, + "logits/chosen": -3.595245361328125, + "logits/rejected": -3.5821094512939453, + "logps/chosen": -1.4761567115783691, + "logps/rejected": -1.612396001815796, + "loss": 1.7096, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9523134231567383, + "rewards/margins": 0.27247875928878784, + "rewards/rejected": -3.224792003631592, + "step": 1120 + }, + { + "epoch": 0.1946933149552033, + "grad_norm": 24.694844697373632, + "learning_rate": 4.866494401378122e-08, + "logits/chosen": -3.6283652782440186, + "logits/rejected": -3.612753391265869, + "logps/chosen": -1.4537689685821533, + "logps/rejected": -1.6631561517715454, + "loss": 1.5852, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.9075379371643066, + "rewards/margins": 0.4187743067741394, + "rewards/rejected": -3.326312303543091, + "step": 1130 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 19.34606417058384, + "learning_rate": 4.909560723514212e-08, + "logits/chosen": -3.569889783859253, + "logits/rejected": -3.5479419231414795, + "logps/chosen": -1.5082037448883057, + "logps/rejected": -1.642213225364685, + "loss": 1.729, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.0164074897766113, + "rewards/margins": 0.2680189609527588, + "rewards/rejected": -3.28442645072937, + "step": 1140 + }, + { + "epoch": 0.19813921433494142, + "grad_norm": 22.098446964601386, + "learning_rate": 4.952627045650301e-08, + "logits/chosen": -3.5346240997314453, + "logits/rejected": -3.5219757556915283, + "logps/chosen": -1.4543148279190063, + "logps/rejected": -1.6135616302490234, + "loss": 1.6559, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.9086296558380127, + "rewards/margins": 0.31849366426467896, + "rewards/rejected": -3.227123260498047, + "step": 1150 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 20.74690560227147, + "learning_rate": 4.995693367786391e-08, + "logits/chosen": -3.5969436168670654, + "logits/rejected": -3.5925216674804688, + "logps/chosen": -1.4716525077819824, + "logps/rejected": -1.5331724882125854, + "loss": 1.7999, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.943305015563965, + "rewards/margins": 0.12303996086120605, + "rewards/rejected": -3.066344976425171, + "step": 1160 + }, + { + "epoch": 0.20158511371467952, + "grad_norm": 19.72058790414574, + "learning_rate": 4.9999908438832287e-08, + "logits/chosen": -3.511765241622925, + "logits/rejected": -3.5033295154571533, + "logps/chosen": -1.4664690494537354, + "logps/rejected": -1.6581010818481445, + "loss": 1.6252, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.9329380989074707, + "rewards/margins": 0.3832641541957855, + "rewards/rejected": -3.316202163696289, + "step": 1170 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 22.012286159249534, + "learning_rate": 4.999959193195308e-08, + "logits/chosen": -3.4963607788085938, + "logits/rejected": -3.478729248046875, + "logps/chosen": -1.4550493955612183, + "logps/rejected": -1.6278083324432373, + "loss": 1.6389, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.9100987911224365, + "rewards/margins": 0.3455182909965515, + "rewards/rejected": -3.2556166648864746, + "step": 1180 + }, + { + "epoch": 0.20503101309441765, + "grad_norm": 22.766611381590625, + "learning_rate": 4.9999049351839105e-08, + "logits/chosen": -3.622887134552002, + "logits/rejected": -3.6028530597686768, + "logps/chosen": -1.4993988275527954, + "logps/rejected": -1.5292878150939941, + "loss": 1.8696, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.998797655105591, + "rewards/margins": 0.05977789685130119, + "rewards/rejected": -3.0585756301879883, + "step": 1190 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 20.23555239884495, + "learning_rate": 4.9998280703396977e-08, + "logits/chosen": -3.5272789001464844, + "logits/rejected": -3.5173110961914062, + "logps/chosen": -1.4271221160888672, + "logps/rejected": -1.5446093082427979, + "loss": 1.7075, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.8542442321777344, + "rewards/margins": 0.23497410118579865, + "rewards/rejected": -3.0892186164855957, + "step": 1200 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.6370413303375244, + "eval_logits/rejected": -3.6329727172851562, + "eval_logps/chosen": -1.480568528175354, + "eval_logps/rejected": -1.6091132164001465, + "eval_loss": 1.6712294816970825, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -2.961137056350708, + "eval_rewards/margins": 0.2570895552635193, + "eval_rewards/rejected": -3.218226432800293, + "eval_runtime": 156.5218, + "eval_samples_per_second": 27.498, + "eval_steps_per_second": 3.437, + "step": 1200 + }, + { + "epoch": 0.20847691247415576, + "grad_norm": 21.549815088276873, + "learning_rate": 4.9997285993577624e-08, + "logits/chosen": -3.571795701980591, + "logits/rejected": -3.550962448120117, + "logps/chosen": -1.4539108276367188, + "logps/rejected": -1.6660518646240234, + "loss": 1.5983, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.9078216552734375, + "rewards/margins": 0.4242820739746094, + "rewards/rejected": -3.332103729248047, + "step": 1210 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 19.375879102870414, + "learning_rate": 4.999606523137628e-08, + "logits/chosen": -3.5903420448303223, + "logits/rejected": -3.574993133544922, + "logps/chosen": -1.4637928009033203, + "logps/rejected": -1.5739166736602783, + "loss": 1.7342, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.9275856018066406, + "rewards/margins": 0.2202473133802414, + "rewards/rejected": -3.1478333473205566, + "step": 1220 + }, + { + "epoch": 0.21192281185389386, + "grad_norm": 23.524020107144423, + "learning_rate": 4.99946184278324e-08, + "logits/chosen": -3.6223480701446533, + "logits/rejected": -3.59686017036438, + "logps/chosen": -1.47654128074646, + "logps/rejected": -1.6213090419769287, + "loss": 1.6763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.95308256149292, + "rewards/margins": 0.2895355522632599, + "rewards/rejected": -3.2426180839538574, + "step": 1230 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 20.20071375144532, + "learning_rate": 4.9992945596029545e-08, + "logits/chosen": -3.5450539588928223, + "logits/rejected": -3.5339252948760986, + "logps/chosen": -1.449210286140442, + "logps/rejected": -1.565976858139038, + "loss": 1.7266, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.898420572280884, + "rewards/margins": 0.23353299498558044, + "rewards/rejected": -3.131953716278076, + "step": 1240 + }, + { + "epoch": 0.21536871123363197, + "grad_norm": 25.132167732962426, + "learning_rate": 4.999104675109525e-08, + "logits/chosen": -3.612946033477783, + "logits/rejected": -3.593456268310547, + "logps/chosen": -1.4484608173370361, + "logps/rejected": -1.5634796619415283, + "loss": 1.7237, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.8969216346740723, + "rewards/margins": 0.23003725707530975, + "rewards/rejected": -3.1269593238830566, + "step": 1250 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 22.57455253662109, + "learning_rate": 4.998892191020092e-08, + "logits/chosen": -3.526859998703003, + "logits/rejected": -3.506258010864258, + "logps/chosen": -1.4318784475326538, + "logps/rejected": -1.597978949546814, + "loss": 1.6736, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.8637568950653076, + "rewards/margins": 0.33220115303993225, + "rewards/rejected": -3.195957899093628, + "step": 1260 + }, + { + "epoch": 0.2188146106133701, + "grad_norm": 22.034137552561063, + "learning_rate": 4.9986571092561664e-08, + "logits/chosen": -3.5680012702941895, + "logits/rejected": -3.563668727874756, + "logps/chosen": -1.4378749132156372, + "logps/rejected": -1.5994627475738525, + "loss": 1.6862, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.8757498264312744, + "rewards/margins": 0.3231762647628784, + "rewards/rejected": -3.198925495147705, + "step": 1270 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 19.62648607149336, + "learning_rate": 4.9983994319436093e-08, + "logits/chosen": -3.6123290061950684, + "logits/rejected": -3.615023136138916, + "logps/chosen": -1.4364560842514038, + "logps/rejected": -1.6675817966461182, + "loss": 1.5943, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8729121685028076, + "rewards/margins": 0.4622512757778168, + "rewards/rejected": -3.3351635932922363, + "step": 1280 + }, + { + "epoch": 0.2222605099931082, + "grad_norm": 19.287218004338456, + "learning_rate": 4.998119161412618e-08, + "logits/chosen": -3.5515658855438232, + "logits/rejected": -3.5324268341064453, + "logps/chosen": -1.449162483215332, + "logps/rejected": -1.5110379457473755, + "loss": 1.7755, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.898324966430664, + "rewards/margins": 0.12375102937221527, + "rewards/rejected": -3.022075891494751, + "step": 1290 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 20.315390943338596, + "learning_rate": 4.997816300197699e-08, + "logits/chosen": -3.5940921306610107, + "logits/rejected": -3.5856773853302, + "logps/chosen": -1.4592989683151245, + "logps/rejected": -1.628976583480835, + "loss": 1.6355, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.918597936630249, + "rewards/margins": 0.3393554091453552, + "rewards/rejected": -3.25795316696167, + "step": 1300 + }, + { + "epoch": 0.22398345968297725, + "eval_logits/chosen": -3.632081985473633, + "eval_logits/rejected": -3.6280007362365723, + "eval_logps/chosen": -1.4803121089935303, + "eval_logps/rejected": -1.609010100364685, + "eval_loss": 1.6708908081054688, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -2.9606242179870605, + "eval_rewards/margins": 0.25739625096321106, + "eval_rewards/rejected": -3.21802020072937, + "eval_runtime": 156.5313, + "eval_samples_per_second": 27.496, + "eval_steps_per_second": 3.437, + "step": 1300 + }, + { + "epoch": 0.2257064093728463, + "grad_norm": 22.70891033700004, + "learning_rate": 4.99749085103765e-08, + "logits/chosen": -3.5989272594451904, + "logits/rejected": -3.5790035724639893, + "logps/chosen": -1.4585853815078735, + "logps/rejected": -1.5244481563568115, + "loss": 1.7823, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.917170763015747, + "rewards/margins": 0.13172602653503418, + "rewards/rejected": -3.048896312713623, + "step": 1310 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 17.754828623145848, + "learning_rate": 4.9971428168755336e-08, + "logits/chosen": -3.573071002960205, + "logits/rejected": -3.5581250190734863, + "logps/chosen": -1.4547202587127686, + "logps/rejected": -1.5332162380218506, + "loss": 1.7641, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.909440517425537, + "rewards/margins": 0.15699197351932526, + "rewards/rejected": -3.066432476043701, + "step": 1320 + }, + { + "epoch": 0.22915230875258444, + "grad_norm": 23.660467236566603, + "learning_rate": 4.9967722008586484e-08, + "logits/chosen": -3.6219820976257324, + "logits/rejected": -3.6049609184265137, + "logps/chosen": -1.481609582901001, + "logps/rejected": -1.5738756656646729, + "loss": 1.8212, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.963219165802002, + "rewards/margins": 0.1845320761203766, + "rewards/rejected": -3.1477513313293457, + "step": 1330 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 20.158173390090354, + "learning_rate": 4.996379006338504e-08, + "logits/chosen": -3.5347938537597656, + "logits/rejected": -3.522195816040039, + "logps/chosen": -1.473888635635376, + "logps/rejected": -1.549046277999878, + "loss": 1.7666, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.947777271270752, + "rewards/margins": 0.15031549334526062, + "rewards/rejected": -3.098092555999756, + "step": 1340 + }, + { + "epoch": 0.23259820813232254, + "grad_norm": 18.752969157364884, + "learning_rate": 4.995963236870789e-08, + "logits/chosen": -3.5722720623016357, + "logits/rejected": -3.5588626861572266, + "logps/chosen": -1.4716870784759521, + "logps/rejected": -1.5807571411132812, + "loss": 1.7371, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9433741569519043, + "rewards/margins": 0.21814005076885223, + "rewards/rejected": -3.1615142822265625, + "step": 1350 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 26.139924068207062, + "learning_rate": 4.995524896215339e-08, + "logits/chosen": -3.54327130317688, + "logits/rejected": -3.536944627761841, + "logps/chosen": -1.52133309841156, + "logps/rejected": -1.6376701593399048, + "loss": 1.7145, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.04266619682312, + "rewards/margins": 0.23267368972301483, + "rewards/rejected": -3.2753403186798096, + "step": 1360 + }, + { + "epoch": 0.23604410751206065, + "grad_norm": 20.865889425628666, + "learning_rate": 4.9950639883361015e-08, + "logits/chosen": -3.6135001182556152, + "logits/rejected": -3.6013622283935547, + "logps/chosen": -1.4751484394073486, + "logps/rejected": -1.6276206970214844, + "loss": 1.6605, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.9502968788146973, + "rewards/margins": 0.3049442172050476, + "rewards/rejected": -3.2552413940429688, + "step": 1370 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 19.939291550838888, + "learning_rate": 4.9945805174011024e-08, + "logits/chosen": -3.5383689403533936, + "logits/rejected": -3.525432586669922, + "logps/chosen": -1.4426517486572266, + "logps/rejected": -1.5534908771514893, + "loss": 1.7205, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.885303497314453, + "rewards/margins": 0.22167833149433136, + "rewards/rejected": -3.1069817543029785, + "step": 1380 + }, + { + "epoch": 0.23949000689179875, + "grad_norm": 19.380543632144967, + "learning_rate": 4.994074487782406e-08, + "logits/chosen": -3.635735273361206, + "logits/rejected": -3.619929552078247, + "logps/chosen": -1.4989776611328125, + "logps/rejected": -1.588587760925293, + "loss": 1.7458, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.997955322265625, + "rewards/margins": 0.17922045290470123, + "rewards/rejected": -3.177175521850586, + "step": 1390 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 21.091985629467143, + "learning_rate": 4.9935459040560776e-08, + "logits/chosen": -3.525209426879883, + "logits/rejected": -3.509110927581787, + "logps/chosen": -1.5093200206756592, + "logps/rejected": -1.632318139076233, + "loss": 1.7433, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.0186400413513184, + "rewards/margins": 0.2459961175918579, + "rewards/rejected": -3.264636278152466, + "step": 1400 + }, + { + "epoch": 0.2412129565816678, + "eval_logits/chosen": -3.639413356781006, + "eval_logits/rejected": -3.6353795528411865, + "eval_logps/chosen": -1.4806628227233887, + "eval_logps/rejected": -1.6097067594528198, + "eval_loss": 1.670569658279419, + "eval_rewards/accuracies": 0.5961896181106567, + "eval_rewards/chosen": -2.9613256454467773, + "eval_rewards/margins": 0.2580878734588623, + "eval_rewards/rejected": -3.2194135189056396, + "eval_runtime": 156.6608, + "eval_samples_per_second": 27.473, + "eval_steps_per_second": 3.434, + "step": 1400 + }, + { + "epoch": 0.24293590627153688, + "grad_norm": 19.993359118082786, + "learning_rate": 4.9929947710021415e-08, + "logits/chosen": -3.5655922889709473, + "logits/rejected": -3.555891513824463, + "logps/chosen": -1.5101096630096436, + "logps/rejected": -1.588991403579712, + "loss": 1.7572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.020219326019287, + "rewards/margins": 0.15776383876800537, + "rewards/rejected": -3.177982807159424, + "step": 1410 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 18.014580965022613, + "learning_rate": 4.992421093604534e-08, + "logits/chosen": -3.511384963989258, + "logits/rejected": -3.5136208534240723, + "logps/chosen": -1.447616457939148, + "logps/rejected": -1.541853904724121, + "loss": 1.7616, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.895232915878296, + "rewards/margins": 0.1884748488664627, + "rewards/rejected": -3.083707809448242, + "step": 1420 + }, + { + "epoch": 0.246381805651275, + "grad_norm": 22.54572725643448, + "learning_rate": 4.9918248770510664e-08, + "logits/chosen": -3.593111753463745, + "logits/rejected": -3.587662935256958, + "logps/chosen": -1.5199015140533447, + "logps/rejected": -1.6602470874786377, + "loss": 1.6979, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.0398030281066895, + "rewards/margins": 0.2806907892227173, + "rewards/rejected": -3.3204941749572754, + "step": 1430 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 21.857755793669604, + "learning_rate": 4.9912061267333696e-08, + "logits/chosen": -3.5442943572998047, + "logits/rejected": -3.526163101196289, + "logps/chosen": -1.4087787866592407, + "logps/rejected": -1.5425742864608765, + "loss": 1.684, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.8175575733184814, + "rewards/margins": 0.26759085059165955, + "rewards/rejected": -3.085148572921753, + "step": 1440 + }, + { + "epoch": 0.2498277050310131, + "grad_norm": 22.767065587859456, + "learning_rate": 4.99056484824685e-08, + "logits/chosen": -3.560847043991089, + "logits/rejected": -3.5403411388397217, + "logps/chosen": -1.4330015182495117, + "logps/rejected": -1.5971437692642212, + "loss": 1.641, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.8660030364990234, + "rewards/margins": 0.32828429341316223, + "rewards/rejected": -3.1942875385284424, + "step": 1450 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 25.51712840759948, + "learning_rate": 4.98990104739064e-08, + "logits/chosen": -3.547229766845703, + "logits/rejected": -3.5296969413757324, + "logps/chosen": -1.442857027053833, + "logps/rejected": -1.6460767984390259, + "loss": 1.6348, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.885714054107666, + "rewards/margins": 0.40643948316574097, + "rewards/rejected": -3.2921535968780518, + "step": 1460 + }, + { + "epoch": 0.2532736044107512, + "grad_norm": 22.864234147491857, + "learning_rate": 4.98921473016754e-08, + "logits/chosen": -3.651045322418213, + "logits/rejected": -3.6313037872314453, + "logps/chosen": -1.5406333208084106, + "logps/rejected": -1.7411892414093018, + "loss": 1.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0812666416168213, + "rewards/margins": 0.40111178159713745, + "rewards/rejected": -3.4823784828186035, + "step": 1470 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 19.777209985530344, + "learning_rate": 4.9885059027839705e-08, + "logits/chosen": -3.6377053260803223, + "logits/rejected": -3.616368532180786, + "logps/chosen": -1.3906090259552002, + "logps/rejected": -1.574596881866455, + "loss": 1.595, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.7812180519104004, + "rewards/margins": 0.36797574162483215, + "rewards/rejected": -3.14919376373291, + "step": 1480 + }, + { + "epoch": 0.2567195037904893, + "grad_norm": 21.765506854573864, + "learning_rate": 4.987774571649912e-08, + "logits/chosen": -3.5903878211975098, + "logits/rejected": -3.576184034347534, + "logps/chosen": -1.5417073965072632, + "logps/rejected": -1.6099088191986084, + "loss": 1.7804, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.0834147930145264, + "rewards/margins": 0.1364024579524994, + "rewards/rejected": -3.219817638397217, + "step": 1490 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 17.44706201988054, + "learning_rate": 4.987020743378848e-08, + "logits/chosen": -3.546238660812378, + "logits/rejected": -3.5451531410217285, + "logps/chosen": -1.4311730861663818, + "logps/rejected": -1.5324180126190186, + "loss": 1.737, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.8623461723327637, + "rewards/margins": 0.20248980820178986, + "rewards/rejected": -3.064836025238037, + "step": 1500 + }, + { + "epoch": 0.2584424534803584, + "eval_logits/chosen": -3.640321969985962, + "eval_logits/rejected": -3.636312961578369, + "eval_logps/chosen": -1.480668306350708, + "eval_logps/rejected": -1.6101878881454468, + "eval_loss": 1.6698158979415894, + "eval_rewards/accuracies": 0.5943308472633362, + "eval_rewards/chosen": -2.961336612701416, + "eval_rewards/margins": 0.25903916358947754, + "eval_rewards/rejected": -3.2203757762908936, + "eval_runtime": 157.0756, + "eval_samples_per_second": 27.401, + "eval_steps_per_second": 3.425, + "step": 1500 + }, + { + "epoch": 0.2601654031702274, + "grad_norm": 21.113911970070056, + "learning_rate": 4.9862444247877054e-08, + "logits/chosen": -3.481945514678955, + "logits/rejected": -3.4615378379821777, + "logps/chosen": -1.5719285011291504, + "logps/rejected": -1.6417945623397827, + "loss": 1.8146, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -3.143857002258301, + "rewards/margins": 0.139731764793396, + "rewards/rejected": -3.2835891246795654, + "step": 1510 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 21.621683672652942, + "learning_rate": 4.985445622896794e-08, + "logits/chosen": -3.566519260406494, + "logits/rejected": -3.562079906463623, + "logps/chosen": -1.5101782083511353, + "logps/rejected": -1.5621944665908813, + "loss": 1.8242, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -3.0203564167022705, + "rewards/margins": 0.10403241217136383, + "rewards/rejected": -3.1243889331817627, + "step": 1520 + }, + { + "epoch": 0.26361130254996556, + "grad_norm": 23.755741149037412, + "learning_rate": 4.98462434492974e-08, + "logits/chosen": -3.498170852661133, + "logits/rejected": -3.4871859550476074, + "logps/chosen": -1.4181193113327026, + "logps/rejected": -1.5177221298217773, + "loss": 1.7626, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.8362386226654053, + "rewards/margins": 0.19920578598976135, + "rewards/rejected": -3.0354442596435547, + "step": 1530 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 26.764271397064284, + "learning_rate": 4.983780598313423e-08, + "logits/chosen": -3.578181505203247, + "logits/rejected": -3.5608437061309814, + "logps/chosen": -1.5191493034362793, + "logps/rejected": -1.5809532403945923, + "loss": 1.788, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.0382986068725586, + "rewards/margins": 0.12360771000385284, + "rewards/rejected": -3.1619064807891846, + "step": 1540 + }, + { + "epoch": 0.26705720192970367, + "grad_norm": 18.367355994613284, + "learning_rate": 4.982914390677909e-08, + "logits/chosen": -3.5325074195861816, + "logits/rejected": -3.52030873298645, + "logps/chosen": -1.5295774936676025, + "logps/rejected": -1.5987284183502197, + "loss": 1.775, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.059154987335205, + "rewards/margins": 0.13830192387104034, + "rewards/rejected": -3.1974568367004395, + "step": 1550 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 21.528320698660544, + "learning_rate": 4.982025729856381e-08, + "logits/chosen": -3.5326709747314453, + "logits/rejected": -3.5235695838928223, + "logps/chosen": -1.4730504751205444, + "logps/rejected": -1.5706760883331299, + "loss": 1.7402, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.946100950241089, + "rewards/margins": 0.19525082409381866, + "rewards/rejected": -3.1413521766662598, + "step": 1560 + }, + { + "epoch": 0.2705031013094418, + "grad_norm": 18.627612911982187, + "learning_rate": 4.981114623885067e-08, + "logits/chosen": -3.5791103839874268, + "logits/rejected": -3.5795066356658936, + "logps/chosen": -1.4292125701904297, + "logps/rejected": -1.6340259313583374, + "loss": 1.6238, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.8584251403808594, + "rewards/margins": 0.40962713956832886, + "rewards/rejected": -3.268051862716675, + "step": 1570 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 20.31257600674884, + "learning_rate": 4.980181081003167e-08, + "logits/chosen": -3.5357413291931152, + "logits/rejected": -3.530127763748169, + "logps/chosen": -1.505494475364685, + "logps/rejected": -1.6010183095932007, + "loss": 1.7511, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.01098895072937, + "rewards/margins": 0.1910475641489029, + "rewards/rejected": -3.2020366191864014, + "step": 1580 + }, + { + "epoch": 0.2739490006891799, + "grad_norm": 19.767565804493607, + "learning_rate": 4.9792251096527826e-08, + "logits/chosen": -3.5537171363830566, + "logits/rejected": -3.5464870929718018, + "logps/chosen": -1.4480928182601929, + "logps/rejected": -1.5720703601837158, + "loss": 1.7073, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.8961856365203857, + "rewards/margins": 0.2479551136493683, + "rewards/rejected": -3.1441407203674316, + "step": 1590 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 22.86677560091986, + "learning_rate": 4.978246718478836e-08, + "logits/chosen": -3.5610060691833496, + "logits/rejected": -3.537564754486084, + "logps/chosen": -1.4404429197311401, + "logps/rejected": -1.604823350906372, + "loss": 1.66, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.8808858394622803, + "rewards/margins": 0.32876071333885193, + "rewards/rejected": -3.209646701812744, + "step": 1600 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.636516809463501, + "eval_logits/rejected": -3.6325063705444336, + "eval_logps/chosen": -1.4810717105865479, + "eval_logps/rejected": -1.6110072135925293, + "eval_loss": 1.6691291332244873, + "eval_rewards/accuracies": 0.5985130071640015, + "eval_rewards/chosen": -2.9621434211730957, + "eval_rewards/margins": 0.2598709762096405, + "eval_rewards/rejected": -3.2220144271850586, + "eval_runtime": 156.4266, + "eval_samples_per_second": 27.514, + "eval_steps_per_second": 3.439, + "step": 1600 + }, + { + "epoch": 0.277394900068918, + "grad_norm": 20.55962131745306, + "learning_rate": 4.9772459163289934e-08, + "logits/chosen": -3.5818023681640625, + "logits/rejected": -3.567166566848755, + "logps/chosen": -1.5387918949127197, + "logps/rejected": -1.7111526727676392, + "loss": 1.6777, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.0775837898254395, + "rewards/margins": 0.34472090005874634, + "rewards/rejected": -3.4223053455352783, + "step": 1610 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 20.135176970862062, + "learning_rate": 4.976222712253587e-08, + "logits/chosen": -3.5374984741210938, + "logits/rejected": -3.5241198539733887, + "logps/chosen": -1.422234296798706, + "logps/rejected": -1.6188961267471313, + "loss": 1.614, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.844468593597412, + "rewards/margins": 0.39332371950149536, + "rewards/rejected": -3.2377922534942627, + "step": 1620 + }, + { + "epoch": 0.2808407994486561, + "grad_norm": 19.867315634525294, + "learning_rate": 4.9751771155055295e-08, + "logits/chosen": -3.5839645862579346, + "logits/rejected": -3.567082166671753, + "logps/chosen": -1.52110755443573, + "logps/rejected": -1.6467859745025635, + "loss": 1.7458, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.04221510887146, + "rewards/margins": 0.25135689973831177, + "rewards/rejected": -3.293571949005127, + "step": 1630 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 20.50821208487501, + "learning_rate": 4.974109135540232e-08, + "logits/chosen": -3.591435670852661, + "logits/rejected": -3.570263385772705, + "logps/chosen": -1.517647385597229, + "logps/rejected": -1.6701990365982056, + "loss": 1.7439, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.035294771194458, + "rewards/margins": 0.3051033914089203, + "rewards/rejected": -3.340398073196411, + "step": 1640 + }, + { + "epoch": 0.2842866988283942, + "grad_norm": 20.70968127149051, + "learning_rate": 4.97301878201552e-08, + "logits/chosen": -3.57568621635437, + "logits/rejected": -3.559844970703125, + "logps/chosen": -1.5328868627548218, + "logps/rejected": -1.5957329273223877, + "loss": 1.8163, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.0657737255096436, + "rewards/margins": 0.12569186091423035, + "rewards/rejected": -3.1914658546447754, + "step": 1650 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 21.844244058574258, + "learning_rate": 4.971906064791544e-08, + "logits/chosen": -3.6076297760009766, + "logits/rejected": -3.5792839527130127, + "logps/chosen": -1.4589565992355347, + "logps/rejected": -1.6513506174087524, + "loss": 1.6022, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9179131984710693, + "rewards/margins": 0.3847882151603699, + "rewards/rejected": -3.302701234817505, + "step": 1660 + }, + { + "epoch": 0.2877325982081323, + "grad_norm": 20.30890070496309, + "learning_rate": 4.970770993930693e-08, + "logits/chosen": -3.5944244861602783, + "logits/rejected": -3.5824813842773438, + "logps/chosen": -1.4535932540893555, + "logps/rejected": -1.6460365056991577, + "loss": 1.6606, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.907186508178711, + "rewards/margins": 0.38488656282424927, + "rewards/rejected": -3.2920730113983154, + "step": 1670 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 22.217122407799273, + "learning_rate": 4.969613579697499e-08, + "logits/chosen": -3.5730159282684326, + "logits/rejected": -3.5582973957061768, + "logps/chosen": -1.5553570985794067, + "logps/rejected": -1.7010650634765625, + "loss": 1.6981, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.1107141971588135, + "rewards/margins": 0.29141610860824585, + "rewards/rejected": -3.402130126953125, + "step": 1680 + }, + { + "epoch": 0.29117849758787046, + "grad_norm": 17.55063354386694, + "learning_rate": 4.968433832558549e-08, + "logits/chosen": -3.5606560707092285, + "logits/rejected": -3.5503761768341064, + "logps/chosen": -1.5366096496582031, + "logps/rejected": -1.595467448234558, + "loss": 1.8201, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.0732192993164062, + "rewards/margins": 0.11771535873413086, + "rewards/rejected": -3.190934896469116, + "step": 1690 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 24.39925282307972, + "learning_rate": 4.967231763182385e-08, + "logits/chosen": -3.4952495098114014, + "logits/rejected": -3.4998080730438232, + "logps/chosen": -1.4651341438293457, + "logps/rejected": -1.594761848449707, + "loss": 1.6776, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.9302682876586914, + "rewards/margins": 0.25925520062446594, + "rewards/rejected": -3.189523696899414, + "step": 1700 + }, + { + "epoch": 0.2929014472777395, + "eval_logits/chosen": -3.635751485824585, + "eval_logits/rejected": -3.6317477226257324, + "eval_logps/chosen": -1.4815140962600708, + "eval_logps/rejected": -1.6119519472122192, + "eval_loss": 1.6685117483139038, + "eval_rewards/accuracies": 0.5934014916419983, + "eval_rewards/chosen": -2.9630281925201416, + "eval_rewards/margins": 0.2608759105205536, + "eval_rewards/rejected": -3.2239038944244385, + "eval_runtime": 156.7242, + "eval_samples_per_second": 27.462, + "eval_steps_per_second": 3.433, + "step": 1700 + }, + { + "epoch": 0.29462439696760856, + "grad_norm": 19.12468919160641, + "learning_rate": 4.966007382439414e-08, + "logits/chosen": -3.579921245574951, + "logits/rejected": -3.557300090789795, + "logps/chosen": -1.50381338596344, + "logps/rejected": -1.6622949838638306, + "loss": 1.685, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.00762677192688, + "rewards/margins": 0.31696319580078125, + "rewards/rejected": -3.324589967727661, + "step": 1710 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 20.486513720886038, + "learning_rate": 4.964760701401807e-08, + "logits/chosen": -3.596654176712036, + "logits/rejected": -3.5813491344451904, + "logps/chosen": -1.5375216007232666, + "logps/rejected": -1.5570892095565796, + "loss": 1.8763, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.075043201446533, + "rewards/margins": 0.03913510590791702, + "rewards/rejected": -3.114178419113159, + "step": 1720 + }, + { + "epoch": 0.29807029634734666, + "grad_norm": 20.07161389032791, + "learning_rate": 4.963491731343395e-08, + "logits/chosen": -3.5821235179901123, + "logits/rejected": -3.5712788105010986, + "logps/chosen": -1.513156533241272, + "logps/rejected": -1.5725635290145874, + "loss": 1.8172, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.026313066482544, + "rewards/margins": 0.11881394684314728, + "rewards/rejected": -3.145127058029175, + "step": 1730 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 17.052676319343707, + "learning_rate": 4.9622004837395725e-08, + "logits/chosen": -3.5470199584960938, + "logits/rejected": -3.5411536693573, + "logps/chosen": -1.4800400733947754, + "logps/rejected": -1.623025894165039, + "loss": 1.7052, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.960080146789551, + "rewards/margins": 0.2859714925289154, + "rewards/rejected": -3.246051788330078, + "step": 1740 + }, + { + "epoch": 0.30151619572708477, + "grad_norm": 20.802444686676857, + "learning_rate": 4.9608869702671903e-08, + "logits/chosen": -3.5859477519989014, + "logits/rejected": -3.577083110809326, + "logps/chosen": -1.461309552192688, + "logps/rejected": -1.5690044164657593, + "loss": 1.7234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.922619104385376, + "rewards/margins": 0.2153901606798172, + "rewards/rejected": -3.1380088329315186, + "step": 1750 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 19.03826486967412, + "learning_rate": 4.9595512028044526e-08, + "logits/chosen": -3.5674026012420654, + "logits/rejected": -3.545771360397339, + "logps/chosen": -1.4754537343978882, + "logps/rejected": -1.612532615661621, + "loss": 1.6898, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.9509074687957764, + "rewards/margins": 0.2741575837135315, + "rewards/rejected": -3.225065231323242, + "step": 1760 + }, + { + "epoch": 0.3049620951068229, + "grad_norm": 16.71690327058305, + "learning_rate": 4.958193193430807e-08, + "logits/chosen": -3.5850799083709717, + "logits/rejected": -3.5649847984313965, + "logps/chosen": -1.5197560787200928, + "logps/rejected": -1.6090255975723267, + "loss": 1.7482, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.0395121574401855, + "rewards/margins": 0.17853932082653046, + "rewards/rejected": -3.2180511951446533, + "step": 1770 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 20.7959914788665, + "learning_rate": 4.956812954426837e-08, + "logits/chosen": -3.5279533863067627, + "logits/rejected": -3.529479503631592, + "logps/chosen": -1.5172889232635498, + "logps/rejected": -1.6478281021118164, + "loss": 1.6877, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0345778465270996, + "rewards/margins": 0.2610776722431183, + "rewards/rejected": -3.295656204223633, + "step": 1780 + }, + { + "epoch": 0.308407994486561, + "grad_norm": 17.694784762991915, + "learning_rate": 4.9554104982741504e-08, + "logits/chosen": -3.629983425140381, + "logits/rejected": -3.616046190261841, + "logps/chosen": -1.422870397567749, + "logps/rejected": -1.5803414583206177, + "loss": 1.6566, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.845740795135498, + "rewards/margins": 0.31494227051734924, + "rewards/rejected": -3.1606829166412354, + "step": 1790 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 19.706724434754133, + "learning_rate": 4.953985837655266e-08, + "logits/chosen": -3.6049816608428955, + "logits/rejected": -3.592071533203125, + "logps/chosen": -1.4670665264129639, + "logps/rejected": -1.64205002784729, + "loss": 1.6114, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.9341330528259277, + "rewards/margins": 0.34996673464775085, + "rewards/rejected": -3.28410005569458, + "step": 1800 + }, + { + "epoch": 0.31013094417643006, + "eval_logits/chosen": -3.6234824657440186, + "eval_logits/rejected": -3.6194350719451904, + "eval_logps/chosen": -1.481577754020691, + "eval_logps/rejected": -1.6121400594711304, + "eval_loss": 1.6682400703430176, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -2.963155508041382, + "eval_rewards/margins": 0.261124849319458, + "eval_rewards/rejected": -3.2242801189422607, + "eval_runtime": 156.8902, + "eval_samples_per_second": 27.433, + "eval_steps_per_second": 3.429, + "step": 1800 + }, + { + "epoch": 0.3118538938662991, + "grad_norm": 23.907833823955638, + "learning_rate": 4.952538985453499e-08, + "logits/chosen": -3.608504056930542, + "logits/rejected": -3.590529203414917, + "logps/chosen": -1.472581148147583, + "logps/rejected": -1.5263159275054932, + "loss": 1.8291, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.945162296295166, + "rewards/margins": 0.10746964067220688, + "rewards/rejected": -3.0526318550109863, + "step": 1810 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 22.503572140680063, + "learning_rate": 4.9510699547528456e-08, + "logits/chosen": -3.5915908813476562, + "logits/rejected": -3.5694305896759033, + "logps/chosen": -1.4992526769638062, + "logps/rejected": -1.576458215713501, + "loss": 1.7721, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.9985053539276123, + "rewards/margins": 0.15441085398197174, + "rewards/rejected": -3.152916431427002, + "step": 1820 + }, + { + "epoch": 0.31529979324603724, + "grad_norm": 18.75166216201898, + "learning_rate": 4.949578758837864e-08, + "logits/chosen": -3.5199248790740967, + "logits/rejected": -3.510275363922119, + "logps/chosen": -1.5318543910980225, + "logps/rejected": -1.6673822402954102, + "loss": 1.7055, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.063708782196045, + "rewards/margins": 0.27105626463890076, + "rewards/rejected": -3.3347644805908203, + "step": 1830 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 21.44354312814991, + "learning_rate": 4.948065411193554e-08, + "logits/chosen": -3.6593379974365234, + "logits/rejected": -3.6570687294006348, + "logps/chosen": -1.5786736011505127, + "logps/rejected": -1.7057005167007446, + "loss": 1.779, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.1573472023010254, + "rewards/margins": 0.2540538012981415, + "rewards/rejected": -3.4114010334014893, + "step": 1840 + }, + { + "epoch": 0.31874569262577535, + "grad_norm": 22.534472583610636, + "learning_rate": 4.946529925505233e-08, + "logits/chosen": -3.568380355834961, + "logits/rejected": -3.568305492401123, + "logps/chosen": -1.4873733520507812, + "logps/rejected": -1.5835912227630615, + "loss": 1.7567, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.9747467041015625, + "rewards/margins": 0.19243571162223816, + "rewards/rejected": -3.167182445526123, + "step": 1850 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 26.050625318098785, + "learning_rate": 4.9449723156584175e-08, + "logits/chosen": -3.5283291339874268, + "logits/rejected": -3.5063698291778564, + "logps/chosen": -1.4795743227005005, + "logps/rejected": -1.6874549388885498, + "loss": 1.6019, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.959148645401001, + "rewards/margins": 0.4157616198062897, + "rewards/rejected": -3.3749098777770996, + "step": 1860 + }, + { + "epoch": 0.32219159200551345, + "grad_norm": 23.309611487946793, + "learning_rate": 4.943392595738694e-08, + "logits/chosen": -3.5602335929870605, + "logits/rejected": -3.546433687210083, + "logps/chosen": -1.5077905654907227, + "logps/rejected": -1.6730937957763672, + "loss": 1.6664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0155811309814453, + "rewards/margins": 0.33060646057128906, + "rewards/rejected": -3.3461875915527344, + "step": 1870 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 20.989446591214158, + "learning_rate": 4.9417907800315904e-08, + "logits/chosen": -3.579502820968628, + "logits/rejected": -3.5554003715515137, + "logps/chosen": -1.3934292793273926, + "logps/rejected": -1.629468560218811, + "loss": 1.588, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.786858558654785, + "rewards/margins": 0.4720785617828369, + "rewards/rejected": -3.258937120437622, + "step": 1880 + }, + { + "epoch": 0.32563749138525155, + "grad_norm": 28.167874737571054, + "learning_rate": 4.94016688302245e-08, + "logits/chosen": -3.6036102771759033, + "logits/rejected": -3.5991806983947754, + "logps/chosen": -1.4940338134765625, + "logps/rejected": -1.635271430015564, + "loss": 1.7079, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.988067626953125, + "rewards/margins": 0.28247541189193726, + "rewards/rejected": -3.270542860031128, + "step": 1890 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 21.438489773195677, + "learning_rate": 4.9385209193962974e-08, + "logits/chosen": -3.554248809814453, + "logits/rejected": -3.5342438220977783, + "logps/chosen": -1.5177969932556152, + "logps/rejected": -1.5785338878631592, + "loss": 1.8167, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -3.0355939865112305, + "rewards/margins": 0.12147398293018341, + "rewards/rejected": -3.1570677757263184, + "step": 1900 + }, + { + "epoch": 0.32736044107512063, + "eval_logits/chosen": -3.6297430992126465, + "eval_logits/rejected": -3.625743865966797, + "eval_logps/chosen": -1.4827522039413452, + "eval_logps/rejected": -1.6139134168624878, + "eval_loss": 1.6674561500549316, + "eval_rewards/accuracies": 0.5964219570159912, + "eval_rewards/chosen": -2.9655044078826904, + "eval_rewards/margins": 0.26232248544692993, + "eval_rewards/rejected": -3.2278268337249756, + "eval_runtime": 156.9512, + "eval_samples_per_second": 27.423, + "eval_steps_per_second": 3.428, + "step": 1900 + }, + { + "epoch": 0.32908339076498966, + "grad_norm": 17.529297650673623, + "learning_rate": 4.93685290403771e-08, + "logits/chosen": -3.49159574508667, + "logits/rejected": -3.4713501930236816, + "logps/chosen": -1.4680726528167725, + "logps/rejected": -1.6576448678970337, + "loss": 1.5914, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.936145305633545, + "rewards/margins": 0.37914425134658813, + "rewards/rejected": -3.3152897357940674, + "step": 1910 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 21.462116474253715, + "learning_rate": 4.9351628520306774e-08, + "logits/chosen": -3.568329334259033, + "logits/rejected": -3.552870988845825, + "logps/chosen": -1.50933837890625, + "logps/rejected": -1.6663627624511719, + "loss": 1.6941, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0186767578125, + "rewards/margins": 0.3140490651130676, + "rewards/rejected": -3.3327255249023438, + "step": 1920 + }, + { + "epoch": 0.33252929014472776, + "grad_norm": 22.016803141321805, + "learning_rate": 4.933450778658472e-08, + "logits/chosen": -3.53804087638855, + "logits/rejected": -3.515195846557617, + "logps/chosen": -1.440914273262024, + "logps/rejected": -1.7439817190170288, + "loss": 1.551, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.881828546524048, + "rewards/margins": 0.6061349511146545, + "rewards/rejected": -3.4879634380340576, + "step": 1930 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 18.860701161365228, + "learning_rate": 4.9317166994035036e-08, + "logits/chosen": -3.555842638015747, + "logits/rejected": -3.5460057258605957, + "logps/chosen": -1.4793164730072021, + "logps/rejected": -1.6231352090835571, + "loss": 1.6821, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.9586329460144043, + "rewards/margins": 0.28763800859451294, + "rewards/rejected": -3.2462704181671143, + "step": 1940 + }, + { + "epoch": 0.33597518952446587, + "grad_norm": 22.379062209616148, + "learning_rate": 4.929960629947185e-08, + "logits/chosen": -3.5424981117248535, + "logits/rejected": -3.540966510772705, + "logps/chosen": -1.5108827352523804, + "logps/rejected": -1.6178505420684814, + "loss": 1.7336, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0217654705047607, + "rewards/margins": 0.21393528580665588, + "rewards/rejected": -3.235701084136963, + "step": 1950 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 21.636286887190597, + "learning_rate": 4.928182586169787e-08, + "logits/chosen": -3.553539276123047, + "logits/rejected": -3.5433926582336426, + "logps/chosen": -1.52933669090271, + "logps/rejected": -1.7150760889053345, + "loss": 1.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.05867338180542, + "rewards/margins": 0.3714786767959595, + "rewards/rejected": -3.430152177810669, + "step": 1960 + }, + { + "epoch": 0.33942108890420397, + "grad_norm": 21.321219104794903, + "learning_rate": 4.926382584150298e-08, + "logits/chosen": -3.5843303203582764, + "logits/rejected": -3.5655570030212402, + "logps/chosen": -1.5155251026153564, + "logps/rejected": -1.638843297958374, + "loss": 1.7488, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.031050205230713, + "rewards/margins": 0.2466357946395874, + "rewards/rejected": -3.277686595916748, + "step": 1970 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 19.82579998907182, + "learning_rate": 4.924560640166273e-08, + "logits/chosen": -3.5257866382598877, + "logits/rejected": -3.5218474864959717, + "logps/chosen": -1.5414459705352783, + "logps/rejected": -1.593079924583435, + "loss": 1.8009, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.0828919410705566, + "rewards/margins": 0.10326793044805527, + "rewards/rejected": -3.18615984916687, + "step": 1980 + }, + { + "epoch": 0.34286698828394213, + "grad_norm": 18.999955948761414, + "learning_rate": 4.922716770693692e-08, + "logits/chosen": -3.607512950897217, + "logits/rejected": -3.588353395462036, + "logps/chosen": -1.459004282951355, + "logps/rejected": -1.5972133874893188, + "loss": 1.6924, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.91800856590271, + "rewards/margins": 0.2764180898666382, + "rewards/rejected": -3.1944267749786377, + "step": 1990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 19.722772280140248, + "learning_rate": 4.920850992406809e-08, + "logits/chosen": -3.5608277320861816, + "logits/rejected": -3.5640289783477783, + "logps/chosen": -1.5221635103225708, + "logps/rejected": -1.7241309881210327, + "loss": 1.6553, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.0443270206451416, + "rewards/margins": 0.4039350152015686, + "rewards/rejected": -3.4482619762420654, + "step": 2000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.6267518997192383, + "eval_logits/rejected": -3.6227598190307617, + "eval_logps/chosen": -1.4836393594741821, + "eval_logps/rejected": -1.6152477264404297, + "eval_loss": 1.6667242050170898, + "eval_rewards/accuracies": 0.5980483293533325, + "eval_rewards/chosen": -2.9672787189483643, + "eval_rewards/margins": 0.2632165849208832, + "eval_rewards/rejected": -3.2304954528808594, + "eval_runtime": 156.7817, + "eval_samples_per_second": 27.452, + "eval_steps_per_second": 3.432, + "step": 2000 + }, + { + "epoch": 0.34631288766368024, + "grad_norm": 19.493125231378887, + "learning_rate": 4.918963322178001e-08, + "logits/chosen": -3.549489974975586, + "logits/rejected": -3.532921552658081, + "logps/chosen": -1.5227919816970825, + "logps/rejected": -1.631601333618164, + "loss": 1.7317, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.045583963394165, + "rewards/margins": 0.21761877834796906, + "rewards/rejected": -3.263202667236328, + "step": 2010 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 19.963919519136866, + "learning_rate": 4.917053777077616e-08, + "logits/chosen": -3.5543811321258545, + "logits/rejected": -3.5396568775177, + "logps/chosen": -1.4497196674346924, + "logps/rejected": -1.7273123264312744, + "loss": 1.5461, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.8994393348693848, + "rewards/margins": 0.5551851987838745, + "rewards/rejected": -3.454624652862549, + "step": 2020 + }, + { + "epoch": 0.34975878704341834, + "grad_norm": 23.00882359656133, + "learning_rate": 4.915122374373815e-08, + "logits/chosen": -3.5998969078063965, + "logits/rejected": -3.5878195762634277, + "logps/chosen": -1.5443811416625977, + "logps/rejected": -1.6665815114974976, + "loss": 1.7196, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.0887622833251953, + "rewards/margins": 0.2444007843732834, + "rewards/rejected": -3.333163022994995, + "step": 2030 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 18.68026697450616, + "learning_rate": 4.9131691315324224e-08, + "logits/chosen": -3.523455858230591, + "logits/rejected": -3.5135726928710938, + "logps/chosen": -1.4714016914367676, + "logps/rejected": -1.6564124822616577, + "loss": 1.621, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.942803382873535, + "rewards/margins": 0.3700217008590698, + "rewards/rejected": -3.3128249645233154, + "step": 2040 + }, + { + "epoch": 0.35320468642315644, + "grad_norm": 20.36953899190694, + "learning_rate": 4.911194066216765e-08, + "logits/chosen": -3.5980594158172607, + "logits/rejected": -3.584456205368042, + "logps/chosen": -1.4483569860458374, + "logps/rejected": -1.6850553750991821, + "loss": 1.5626, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.896713972091675, + "rewards/margins": 0.4733964800834656, + "rewards/rejected": -3.3701107501983643, + "step": 2050 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 18.70213210986618, + "learning_rate": 4.909197196287509e-08, + "logits/chosen": -3.582314968109131, + "logits/rejected": -3.559079647064209, + "logps/chosen": -1.4241091012954712, + "logps/rejected": -1.5616523027420044, + "loss": 1.6533, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.8482182025909424, + "rewards/margins": 0.27508679032325745, + "rewards/rejected": -3.123304605484009, + "step": 2060 + }, + { + "epoch": 0.35665058580289455, + "grad_norm": 22.992970213984307, + "learning_rate": 4.907178539802503e-08, + "logits/chosen": -3.5887069702148438, + "logits/rejected": -3.5751261711120605, + "logps/chosen": -1.5180531740188599, + "logps/rejected": -1.6554996967315674, + "loss": 1.7201, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.0361063480377197, + "rewards/margins": 0.2748925983905792, + "rewards/rejected": -3.3109993934631348, + "step": 2070 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 17.720754033991902, + "learning_rate": 4.9051381150166136e-08, + "logits/chosen": -3.568765640258789, + "logits/rejected": -3.5518951416015625, + "logps/chosen": -1.5020513534545898, + "logps/rejected": -1.628472089767456, + "loss": 1.7118, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.0041027069091797, + "rewards/margins": 0.2528417408466339, + "rewards/rejected": -3.256944179534912, + "step": 2080 + }, + { + "epoch": 0.36009648518263265, + "grad_norm": 21.483102691437697, + "learning_rate": 4.903075940381559e-08, + "logits/chosen": -3.578641414642334, + "logits/rejected": -3.5765514373779297, + "logps/chosen": -1.5061986446380615, + "logps/rejected": -1.550728440284729, + "loss": 1.8193, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.012397289276123, + "rewards/margins": 0.08905954658985138, + "rewards/rejected": -3.101456880569458, + "step": 2090 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 23.899673848583756, + "learning_rate": 4.900992034545743e-08, + "logits/chosen": -3.538653612136841, + "logits/rejected": -3.5266621112823486, + "logps/chosen": -1.548145055770874, + "logps/rejected": -1.627002477645874, + "loss": 1.7986, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.096290111541748, + "rewards/margins": 0.15771517157554626, + "rewards/rejected": -3.254004955291748, + "step": 2100 + }, + { + "epoch": 0.36181943487250173, + "eval_logits/chosen": -3.623908519744873, + "eval_logits/rejected": -3.6199214458465576, + "eval_logps/chosen": -1.4846034049987793, + "eval_logps/rejected": -1.6168063879013062, + "eval_loss": 1.6660202741622925, + "eval_rewards/accuracies": 0.6022304892539978, + "eval_rewards/chosen": -2.9692068099975586, + "eval_rewards/margins": 0.2644062340259552, + "eval_rewards/rejected": -3.2336127758026123, + "eval_runtime": 156.7256, + "eval_samples_per_second": 27.462, + "eval_steps_per_second": 3.433, + "step": 2100 + }, + { + "epoch": 0.36354238456237076, + "grad_norm": 19.324578704737192, + "learning_rate": 4.898886416354088e-08, + "logits/chosen": -3.597489833831787, + "logits/rejected": -3.600764036178589, + "logps/chosen": -1.4722825288772583, + "logps/rejected": -1.6591365337371826, + "loss": 1.6319, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9445650577545166, + "rewards/margins": 0.3737080693244934, + "rewards/rejected": -3.3182730674743652, + "step": 2110 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 18.32372875947991, + "learning_rate": 4.896759104847859e-08, + "logits/chosen": -3.491229295730591, + "logits/rejected": -3.4740347862243652, + "logps/chosen": -1.427669882774353, + "logps/rejected": -1.597784161567688, + "loss": 1.6693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.855339765548706, + "rewards/margins": 0.3402283489704132, + "rewards/rejected": -3.195568323135376, + "step": 2120 + }, + { + "epoch": 0.3669882839421089, + "grad_norm": 23.18604396026762, + "learning_rate": 4.8946101192644994e-08, + "logits/chosen": -3.5139031410217285, + "logits/rejected": -3.504164934158325, + "logps/chosen": -1.5479475259780884, + "logps/rejected": -1.6522204875946045, + "loss": 1.7624, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.0958950519561768, + "rewards/margins": 0.20854637026786804, + "rewards/rejected": -3.304440975189209, + "step": 2130 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 20.5477588650669, + "learning_rate": 4.8924394790374505e-08, + "logits/chosen": -3.5106472969055176, + "logits/rejected": -3.50592041015625, + "logps/chosen": -1.525874376296997, + "logps/rejected": -1.777875304222107, + "loss": 1.5863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.051748752593994, + "rewards/margins": 0.5040016174316406, + "rewards/rejected": -3.555750608444214, + "step": 2140 + }, + { + "epoch": 0.370434183321847, + "grad_norm": 21.123775473580462, + "learning_rate": 4.8902472037959796e-08, + "logits/chosen": -3.5329418182373047, + "logits/rejected": -3.5062851905822754, + "logps/chosen": -1.423621416091919, + "logps/rejected": -1.5364747047424316, + "loss": 1.7046, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.847242832183838, + "rewards/margins": 0.22570693492889404, + "rewards/rejected": -3.0729494094848633, + "step": 2150 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 22.78827563382902, + "learning_rate": 4.888033313365001e-08, + "logits/chosen": -3.516775608062744, + "logits/rejected": -3.5070183277130127, + "logps/chosen": -1.5784735679626465, + "logps/rejected": -1.5640720129013062, + "loss": 1.947, + "rewards/accuracies": 0.46875, + "rewards/chosen": -3.156947135925293, + "rewards/margins": -0.028803348541259766, + "rewards/rejected": -3.1281440258026123, + "step": 2160 + }, + { + "epoch": 0.3738800827015851, + "grad_norm": 23.531649426564748, + "learning_rate": 4.885797827764895e-08, + "logits/chosen": -3.5801033973693848, + "logits/rejected": -3.570537567138672, + "logps/chosen": -1.562754511833191, + "logps/rejected": -1.620516061782837, + "loss": 1.8131, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.125509023666382, + "rewards/margins": 0.11552339792251587, + "rewards/rejected": -3.241032123565674, + "step": 2170 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 21.238711093103408, + "learning_rate": 4.88354076721133e-08, + "logits/chosen": -3.622354507446289, + "logits/rejected": -3.599475145339966, + "logps/chosen": -1.580690622329712, + "logps/rejected": -1.7170826196670532, + "loss": 1.7492, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.161381244659424, + "rewards/margins": 0.2727835476398468, + "rewards/rejected": -3.4341652393341064, + "step": 2180 + }, + { + "epoch": 0.37732598208132323, + "grad_norm": 22.39784789674002, + "learning_rate": 4.88126215211508e-08, + "logits/chosen": -3.6658267974853516, + "logits/rejected": -3.6622262001037598, + "logps/chosen": -1.5159324407577515, + "logps/rejected": -1.6402177810668945, + "loss": 1.7161, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -3.031864881515503, + "rewards/margins": 0.24857065081596375, + "rewards/rejected": -3.280435562133789, + "step": 2190 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 19.33167272025798, + "learning_rate": 4.878962003081835e-08, + "logits/chosen": -3.5528404712677, + "logits/rejected": -3.5377895832061768, + "logps/chosen": -1.439858317375183, + "logps/rejected": -1.6454274654388428, + "loss": 1.5912, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.879716634750366, + "rewards/margins": 0.41113847494125366, + "rewards/rejected": -3.2908549308776855, + "step": 2200 + }, + { + "epoch": 0.37904893177119225, + "eval_logits/chosen": -3.6114118099212646, + "eval_logits/rejected": -3.6073875427246094, + "eval_logps/chosen": -1.4853826761245728, + "eval_logps/rejected": -1.618082046508789, + "eval_loss": 1.6650714874267578, + "eval_rewards/accuracies": 0.6010687947273254, + "eval_rewards/chosen": -2.9707653522491455, + "eval_rewards/margins": 0.2653983235359192, + "eval_rewards/rejected": -3.236164093017578, + "eval_runtime": 156.9168, + "eval_samples_per_second": 27.429, + "eval_steps_per_second": 3.429, + "step": 2200 + }, + { + "epoch": 0.38077188146106133, + "grad_norm": 20.514902741444597, + "learning_rate": 4.87664034091202e-08, + "logits/chosen": -3.586317539215088, + "logits/rejected": -3.575448513031006, + "logps/chosen": -1.4753954410552979, + "logps/rejected": -1.6028869152069092, + "loss": 1.6972, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.9507908821105957, + "rewards/margins": 0.2549833655357361, + "rewards/rejected": -3.2057738304138184, + "step": 2210 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 24.43901065189289, + "learning_rate": 4.8742971866006064e-08, + "logits/chosen": -3.483224391937256, + "logits/rejected": -3.4792912006378174, + "logps/chosen": -1.5009597539901733, + "logps/rejected": -1.6676511764526367, + "loss": 1.6722, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.0019195079803467, + "rewards/margins": 0.3333826959133148, + "rewards/rejected": -3.3353023529052734, + "step": 2220 + }, + { + "epoch": 0.38421778084079944, + "grad_norm": 21.082067492206512, + "learning_rate": 4.8719325613369177e-08, + "logits/chosen": -3.5746986865997314, + "logits/rejected": -3.5565693378448486, + "logps/chosen": -1.5061171054840088, + "logps/rejected": -1.6679279804229736, + "loss": 1.6448, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0122342109680176, + "rewards/margins": 0.32362186908721924, + "rewards/rejected": -3.3358559608459473, + "step": 2230 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 19.693407840804248, + "learning_rate": 4.869546486504443e-08, + "logits/chosen": -3.543463945388794, + "logits/rejected": -3.5245018005371094, + "logps/chosen": -1.5463885068893433, + "logps/rejected": -1.6940767765045166, + "loss": 1.673, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.0927770137786865, + "rewards/margins": 0.295376718044281, + "rewards/rejected": -3.388153553009033, + "step": 2240 + }, + { + "epoch": 0.38766368022053754, + "grad_norm": 23.115209598980552, + "learning_rate": 4.8671389836806395e-08, + "logits/chosen": -3.52524995803833, + "logits/rejected": -3.506822109222412, + "logps/chosen": -1.548965334892273, + "logps/rejected": -1.7001352310180664, + "loss": 1.6557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.097930669784546, + "rewards/margins": 0.30233970284461975, + "rewards/rejected": -3.400270462036133, + "step": 2250 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 21.749498299580363, + "learning_rate": 4.864710074636742e-08, + "logits/chosen": -3.4859185218811035, + "logits/rejected": -3.4742584228515625, + "logps/chosen": -1.5967607498168945, + "logps/rejected": -1.6525167226791382, + "loss": 1.8156, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.193521499633789, + "rewards/margins": 0.11151192337274551, + "rewards/rejected": -3.3050334453582764, + "step": 2260 + }, + { + "epoch": 0.39110957960027565, + "grad_norm": 21.541245340353164, + "learning_rate": 4.862259781337561e-08, + "logits/chosen": -3.478783130645752, + "logits/rejected": -3.4609713554382324, + "logps/chosen": -1.5577605962753296, + "logps/rejected": -1.7488930225372314, + "loss": 1.6575, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.115521192550659, + "rewards/margins": 0.3822646737098694, + "rewards/rejected": -3.497786045074463, + "step": 2270 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 24.449144313801263, + "learning_rate": 4.8597881259412874e-08, + "logits/chosen": -3.540938138961792, + "logits/rejected": -3.5345215797424316, + "logps/chosen": -1.5198066234588623, + "logps/rejected": -1.6579618453979492, + "loss": 1.7133, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0396132469177246, + "rewards/margins": 0.27631083130836487, + "rewards/rejected": -3.3159236907958984, + "step": 2280 + }, + { + "epoch": 0.3945554789800138, + "grad_norm": 20.08669960137653, + "learning_rate": 4.857295130799293e-08, + "logits/chosen": -3.467047929763794, + "logits/rejected": -3.4501757621765137, + "logps/chosen": -1.4869188070297241, + "logps/rejected": -1.7262147665023804, + "loss": 1.5696, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.9738376140594482, + "rewards/margins": 0.4785922169685364, + "rewards/rejected": -3.4524295330047607, + "step": 2290 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 17.41427369424689, + "learning_rate": 4.8547808184559225e-08, + "logits/chosen": -3.583796262741089, + "logits/rejected": -3.565356731414795, + "logps/chosen": -1.4936672449111938, + "logps/rejected": -1.660553216934204, + "loss": 1.6622, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.9873344898223877, + "rewards/margins": 0.33377179503440857, + "rewards/rejected": -3.321106433868408, + "step": 2300 + }, + { + "epoch": 0.39627842866988283, + "eval_logits/chosen": -3.602208137512207, + "eval_logits/rejected": -3.5981619358062744, + "eval_logps/chosen": -1.4872812032699585, + "eval_logps/rejected": -1.621127963066101, + "eval_loss": 1.6637080907821655, + "eval_rewards/accuracies": 0.6043215394020081, + "eval_rewards/chosen": -2.974562406539917, + "eval_rewards/margins": 0.2676936686038971, + "eval_rewards/rejected": -3.242255926132202, + "eval_runtime": 156.6727, + "eval_samples_per_second": 27.471, + "eval_steps_per_second": 3.434, + "step": 2300 + }, + { + "epoch": 0.3980013783597519, + "grad_norm": 23.751481660679964, + "learning_rate": 4.852245211648297e-08, + "logits/chosen": -3.523594617843628, + "logits/rejected": -3.5147643089294434, + "logps/chosen": -1.504022240638733, + "logps/rejected": -1.6421245336532593, + "loss": 1.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.008044481277466, + "rewards/margins": 0.2762050926685333, + "rewards/rejected": -3.2842490673065186, + "step": 2310 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 18.936682348372276, + "learning_rate": 4.8496883333061044e-08, + "logits/chosen": -3.58124041557312, + "logits/rejected": -3.556415557861328, + "logps/chosen": -1.5412237644195557, + "logps/rejected": -1.6455538272857666, + "loss": 1.7429, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0824475288391113, + "rewards/margins": 0.20865985751152039, + "rewards/rejected": -3.291107654571533, + "step": 2320 + }, + { + "epoch": 0.40144727773949, + "grad_norm": 20.544293696229232, + "learning_rate": 4.8471102065513926e-08, + "logits/chosen": -3.5343194007873535, + "logits/rejected": -3.5193076133728027, + "logps/chosen": -1.5306497812271118, + "logps/rejected": -1.6750688552856445, + "loss": 1.7229, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0612995624542236, + "rewards/margins": 0.2888379395008087, + "rewards/rejected": -3.350137710571289, + "step": 2330 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 17.777756883688532, + "learning_rate": 4.844510854698359e-08, + "logits/chosen": -3.5592551231384277, + "logits/rejected": -3.5520949363708496, + "logps/chosen": -1.509926199913025, + "logps/rejected": -1.7177064418792725, + "loss": 1.6068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.01985239982605, + "rewards/margins": 0.4155608117580414, + "rewards/rejected": -3.435412883758545, + "step": 2340 + }, + { + "epoch": 0.4048931771192281, + "grad_norm": 26.201180439918627, + "learning_rate": 4.841890301253144e-08, + "logits/chosen": -3.5495967864990234, + "logits/rejected": -3.533440113067627, + "logps/chosen": -1.5851614475250244, + "logps/rejected": -1.7423877716064453, + "loss": 1.7301, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.170322895050049, + "rewards/margins": 0.31445300579071045, + "rewards/rejected": -3.4847755432128906, + "step": 2350 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 22.728159157170264, + "learning_rate": 4.8392485699136144e-08, + "logits/chosen": -3.547205686569214, + "logits/rejected": -3.5392508506774902, + "logps/chosen": -1.487065076828003, + "logps/rejected": -1.6705167293548584, + "loss": 1.629, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.974130153656006, + "rewards/margins": 0.36690327525138855, + "rewards/rejected": -3.341033458709717, + "step": 2360 + }, + { + "epoch": 0.4083390764989662, + "grad_norm": 22.762704389456022, + "learning_rate": 4.836585684569148e-08, + "logits/chosen": -3.5608458518981934, + "logits/rejected": -3.558295488357544, + "logps/chosen": -1.6040557622909546, + "logps/rejected": -1.685120940208435, + "loss": 1.781, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -3.208111524581909, + "rewards/margins": 0.16213031113147736, + "rewards/rejected": -3.37024188041687, + "step": 2370 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 19.246997421419525, + "learning_rate": 4.833901669300424e-08, + "logits/chosen": -3.5093021392822266, + "logits/rejected": -3.4930291175842285, + "logps/chosen": -1.5158263444900513, + "logps/rejected": -1.6325546503067017, + "loss": 1.7286, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.0316526889801025, + "rewards/margins": 0.23345641791820526, + "rewards/rejected": -3.2651093006134033, + "step": 2380 + }, + { + "epoch": 0.41178497587870433, + "grad_norm": 18.65486778196909, + "learning_rate": 4.831196548379198e-08, + "logits/chosen": -3.5684688091278076, + "logits/rejected": -3.5524325370788574, + "logps/chosen": -1.5612127780914307, + "logps/rejected": -1.6892235279083252, + "loss": 1.7209, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.1224255561828613, + "rewards/margins": 0.25602132081985474, + "rewards/rejected": -3.3784470558166504, + "step": 2390 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 18.66142034582711, + "learning_rate": 4.828470346268089e-08, + "logits/chosen": -3.5882461071014404, + "logits/rejected": -3.573695421218872, + "logps/chosen": -1.4475327730178833, + "logps/rejected": -1.6890789270401, + "loss": 1.519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8950655460357666, + "rewards/margins": 0.4830924868583679, + "rewards/rejected": -3.3781578540802, + "step": 2400 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -3.614332437515259, + "eval_logits/rejected": -3.6103715896606445, + "eval_logps/chosen": -1.489486813545227, + "eval_logps/rejected": -1.6236634254455566, + "eval_loss": 1.6631313562393188, + "eval_rewards/accuracies": 0.6052509546279907, + "eval_rewards/chosen": -2.978973627090454, + "eval_rewards/margins": 0.26835328340530396, + "eval_rewards/rejected": -3.2473268508911133, + "eval_runtime": 156.9176, + "eval_samples_per_second": 27.428, + "eval_steps_per_second": 3.429, + "step": 2400 + }, + { + "epoch": 0.41523087525844243, + "grad_norm": 21.012094406529148, + "learning_rate": 4.825723087620349e-08, + "logits/chosen": -3.582886219024658, + "logits/rejected": -3.558941602706909, + "logps/chosen": -1.488560676574707, + "logps/rejected": -1.6701157093048096, + "loss": 1.6477, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.977121353149414, + "rewards/margins": 0.3631100356578827, + "rewards/rejected": -3.340231418609619, + "step": 2410 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 21.183343957827056, + "learning_rate": 4.822954797279652e-08, + "logits/chosen": -3.5725414752960205, + "logits/rejected": -3.562328815460205, + "logps/chosen": -1.531442403793335, + "logps/rejected": -1.699089765548706, + "loss": 1.6534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.06288480758667, + "rewards/margins": 0.3352944850921631, + "rewards/rejected": -3.398179531097412, + "step": 2420 + }, + { + "epoch": 0.41867677463818054, + "grad_norm": 18.373282773387455, + "learning_rate": 4.82016550027986e-08, + "logits/chosen": -3.5588676929473877, + "logits/rejected": -3.5446724891662598, + "logps/chosen": -1.5013198852539062, + "logps/rejected": -1.6942297220230103, + "loss": 1.6808, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0026397705078125, + "rewards/margins": 0.38581979274749756, + "rewards/rejected": -3.3884594440460205, + "step": 2430 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 22.194506787369036, + "learning_rate": 4.817355221844801e-08, + "logits/chosen": -3.5529041290283203, + "logits/rejected": -3.5453720092773438, + "logps/chosen": -1.5691745281219482, + "logps/rejected": -1.6774365901947021, + "loss": 1.757, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.1383490562438965, + "rewards/margins": 0.2165241241455078, + "rewards/rejected": -3.3548731803894043, + "step": 2440 + }, + { + "epoch": 0.4221226740179187, + "grad_norm": 18.293996458800844, + "learning_rate": 4.814523987388038e-08, + "logits/chosen": -3.527029037475586, + "logits/rejected": -3.5138282775878906, + "logps/chosen": -1.571065902709961, + "logps/rejected": -1.741070032119751, + "loss": 1.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.142131805419922, + "rewards/margins": 0.3400086760520935, + "rewards/rejected": -3.482140064239502, + "step": 2450 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 20.312427568594313, + "learning_rate": 4.811671822512644e-08, + "logits/chosen": -3.503660202026367, + "logits/rejected": -3.48608136177063, + "logps/chosen": -1.6028276681900024, + "logps/rejected": -1.69357168674469, + "loss": 1.7442, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.205655336380005, + "rewards/margins": 0.18148799240589142, + "rewards/rejected": -3.38714337348938, + "step": 2460 + }, + { + "epoch": 0.4255685733976568, + "grad_norm": 21.177121551013244, + "learning_rate": 4.808798753010965e-08, + "logits/chosen": -3.615723133087158, + "logits/rejected": -3.6031582355499268, + "logps/chosen": -1.4958235025405884, + "logps/rejected": -1.6012442111968994, + "loss": 1.7514, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.9916470050811768, + "rewards/margins": 0.2108411341905594, + "rewards/rejected": -3.202488422393799, + "step": 2470 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 20.55201296966641, + "learning_rate": 4.805904804864389e-08, + "logits/chosen": -3.5501275062561035, + "logits/rejected": -3.532611131668091, + "logps/chosen": -1.5525211095809937, + "logps/rejected": -1.656925916671753, + "loss": 1.7442, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1050422191619873, + "rewards/margins": 0.2088095247745514, + "rewards/rejected": -3.313851833343506, + "step": 2480 + }, + { + "epoch": 0.4290144727773949, + "grad_norm": 21.21176978470135, + "learning_rate": 4.802990004243112e-08, + "logits/chosen": -3.5374274253845215, + "logits/rejected": -3.5326461791992188, + "logps/chosen": -1.5243167877197266, + "logps/rejected": -1.6463820934295654, + "loss": 1.7164, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.048633575439453, + "rewards/margins": 0.24413049221038818, + "rewards/rejected": -3.292764186859131, + "step": 2490 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 23.873078617077965, + "learning_rate": 4.800054377505901e-08, + "logits/chosen": -3.5865676403045654, + "logits/rejected": -3.5725231170654297, + "logps/chosen": -1.5730979442596436, + "logps/rejected": -1.6938705444335938, + "loss": 1.7422, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.146195888519287, + "rewards/margins": 0.2415451556444168, + "rewards/rejected": -3.3877410888671875, + "step": 2500 + }, + { + "epoch": 0.43073742246726393, + "eval_logits/chosen": -3.6007297039031982, + "eval_logits/rejected": -3.596728563308716, + "eval_logps/chosen": -1.4907609224319458, + "eval_logps/rejected": -1.625766634941101, + "eval_loss": 1.6620914936065674, + "eval_rewards/accuracies": 0.6050186157226562, + "eval_rewards/chosen": -2.9815218448638916, + "eval_rewards/margins": 0.2700112462043762, + "eval_rewards/rejected": -3.251533269882202, + "eval_runtime": 156.6329, + "eval_samples_per_second": 27.478, + "eval_steps_per_second": 3.435, + "step": 2500 + }, + { + "epoch": 0.432460372157133, + "grad_norm": 20.906971566392755, + "learning_rate": 4.797097951199854e-08, + "logits/chosen": -3.5002143383026123, + "logits/rejected": -3.4941468238830566, + "logps/chosen": -1.5948537588119507, + "logps/rejected": -1.6707570552825928, + "loss": 1.8091, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.1897075176239014, + "rewards/margins": 0.1518067568540573, + "rewards/rejected": -3.3415141105651855, + "step": 2510 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 20.89430618211393, + "learning_rate": 4.7941207520601625e-08, + "logits/chosen": -3.4865036010742188, + "logits/rejected": -3.4702820777893066, + "logps/chosen": -1.4487919807434082, + "logps/rejected": -1.5819661617279053, + "loss": 1.6887, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.8975839614868164, + "rewards/margins": 0.2663484215736389, + "rewards/rejected": -3.1639323234558105, + "step": 2520 + }, + { + "epoch": 0.4359062715368711, + "grad_norm": 22.041075061923358, + "learning_rate": 4.791122807009867e-08, + "logits/chosen": -3.53883695602417, + "logits/rejected": -3.5361385345458984, + "logps/chosen": -1.5153820514678955, + "logps/rejected": -1.6578140258789062, + "loss": 1.6796, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.030764102935791, + "rewards/margins": 0.2848639488220215, + "rewards/rejected": -3.3156280517578125, + "step": 2530 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 24.660251945442205, + "learning_rate": 4.7881041431596156e-08, + "logits/chosen": -3.569469451904297, + "logits/rejected": -3.5647659301757812, + "logps/chosen": -1.6155948638916016, + "logps/rejected": -1.7081197500228882, + "loss": 1.7615, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.231189727783203, + "rewards/margins": 0.18504957854747772, + "rewards/rejected": -3.4162395000457764, + "step": 2540 + }, + { + "epoch": 0.4393521709166092, + "grad_norm": 21.267485430416926, + "learning_rate": 4.7850647878074176e-08, + "logits/chosen": -3.5281989574432373, + "logits/rejected": -3.5106468200683594, + "logps/chosen": -1.5290539264678955, + "logps/rejected": -1.6520280838012695, + "loss": 1.7091, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.058107852935791, + "rewards/margins": 0.24594846367835999, + "rewards/rejected": -3.304056167602539, + "step": 2550 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 19.874037916582047, + "learning_rate": 4.782004768438399e-08, + "logits/chosen": -3.5824317932128906, + "logits/rejected": -3.569504499435425, + "logps/chosen": -1.5643948316574097, + "logps/rejected": -1.7051670551300049, + "loss": 1.7183, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.1287896633148193, + "rewards/margins": 0.2815442681312561, + "rewards/rejected": -3.4103341102600098, + "step": 2560 + }, + { + "epoch": 0.4427980702963473, + "grad_norm": 19.17351153079332, + "learning_rate": 4.7789241127245484e-08, + "logits/chosen": -3.54736065864563, + "logits/rejected": -3.5352063179016113, + "logps/chosen": -1.565794587135315, + "logps/rejected": -1.7177879810333252, + "loss": 1.7395, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.13158917427063, + "rewards/margins": 0.3039868474006653, + "rewards/rejected": -3.4355759620666504, + "step": 2570 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 22.530671921916554, + "learning_rate": 4.775822848524473e-08, + "logits/chosen": -3.561293840408325, + "logits/rejected": -3.5500340461730957, + "logps/chosen": -1.5855414867401123, + "logps/rejected": -1.7455179691314697, + "loss": 1.7282, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.1710829734802246, + "rewards/margins": 0.31995266675949097, + "rewards/rejected": -3.4910359382629395, + "step": 2580 + }, + { + "epoch": 0.4462439696760855, + "grad_norm": 24.893982618759377, + "learning_rate": 4.7727010038831456e-08, + "logits/chosen": -3.555220365524292, + "logits/rejected": -3.5336151123046875, + "logps/chosen": -1.5849583148956299, + "logps/rejected": -1.7369537353515625, + "loss": 1.7196, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.1699166297912598, + "rewards/margins": 0.3039911091327667, + "rewards/rejected": -3.473907470703125, + "step": 2590 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 26.369368405424176, + "learning_rate": 4.769558607031646e-08, + "logits/chosen": -3.52154803276062, + "logits/rejected": -3.4922738075256348, + "logps/chosen": -1.5694255828857422, + "logps/rejected": -1.7204395532608032, + "loss": 1.7132, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1388511657714844, + "rewards/margins": 0.3020276427268982, + "rewards/rejected": -3.4408791065216064, + "step": 2600 + }, + { + "epoch": 0.4479669193659545, + "eval_logits/chosen": -3.595148801803589, + "eval_logits/rejected": -3.591151237487793, + "eval_logps/chosen": -1.493095874786377, + "eval_logps/rejected": -1.6284310817718506, + "eval_loss": 1.6615219116210938, + "eval_rewards/accuracies": 0.6078066825866699, + "eval_rewards/chosen": -2.986191749572754, + "eval_rewards/margins": 0.27067071199417114, + "eval_rewards/rejected": -3.256862163543701, + "eval_runtime": 156.7248, + "eval_samples_per_second": 27.462, + "eval_steps_per_second": 3.433, + "step": 2600 + }, + { + "epoch": 0.4496898690558236, + "grad_norm": 20.677041938428193, + "learning_rate": 4.766395686386911e-08, + "logits/chosen": -3.51015043258667, + "logits/rejected": -3.486515760421753, + "logps/chosen": -1.5552761554718018, + "logps/rejected": -1.7138694524765015, + "loss": 1.7239, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.1105523109436035, + "rewards/margins": 0.3171863555908203, + "rewards/rejected": -3.427738904953003, + "step": 2610 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 17.365741091766054, + "learning_rate": 4.7632122705514764e-08, + "logits/chosen": -3.5786216259002686, + "logits/rejected": -3.560622453689575, + "logps/chosen": -1.5478794574737549, + "logps/rejected": -1.7387754917144775, + "loss": 1.6611, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.0957589149475098, + "rewards/margins": 0.3817915916442871, + "rewards/rejected": -3.477550983428955, + "step": 2620 + }, + { + "epoch": 0.4531357684355617, + "grad_norm": 24.87506235878461, + "learning_rate": 4.760008388313216e-08, + "logits/chosen": -3.501713991165161, + "logits/rejected": -3.4820847511291504, + "logps/chosen": -1.5628116130828857, + "logps/rejected": -1.7581707239151, + "loss": 1.6362, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1256232261657715, + "rewards/margins": 0.39071792364120483, + "rewards/rejected": -3.5163414478302, + "step": 2630 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 23.012175339515903, + "learning_rate": 4.7567840686450835e-08, + "logits/chosen": -3.523447036743164, + "logits/rejected": -3.509652614593506, + "logps/chosen": -1.4769433736801147, + "logps/rejected": -1.6250473260879517, + "loss": 1.6781, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9538867473602295, + "rewards/margins": 0.2962080240249634, + "rewards/rejected": -3.2500946521759033, + "step": 2640 + }, + { + "epoch": 0.4565816678152998, + "grad_norm": 18.849676765957174, + "learning_rate": 4.7535393407048503e-08, + "logits/chosen": -3.5010228157043457, + "logits/rejected": -3.489445924758911, + "logps/chosen": -1.4864994287490845, + "logps/rejected": -1.6713321208953857, + "loss": 1.6511, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.972998857498169, + "rewards/margins": 0.3696654438972473, + "rewards/rejected": -3.3426642417907715, + "step": 2650 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 20.338054685030137, + "learning_rate": 4.7502742338348405e-08, + "logits/chosen": -3.5091450214385986, + "logits/rejected": -3.4845690727233887, + "logps/chosen": -1.595173716545105, + "logps/rejected": -1.8385064601898193, + "loss": 1.5992, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.19034743309021, + "rewards/margins": 0.48666566610336304, + "rewards/rejected": -3.6770129203796387, + "step": 2660 + }, + { + "epoch": 0.4600275671950379, + "grad_norm": 21.44776874283989, + "learning_rate": 4.746988777561668e-08, + "logits/chosen": -3.489619016647339, + "logits/rejected": -3.4728455543518066, + "logps/chosen": -1.515324592590332, + "logps/rejected": -1.6942918300628662, + "loss": 1.6823, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.030649185180664, + "rewards/margins": 0.3579341769218445, + "rewards/rejected": -3.3885836601257324, + "step": 2670 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 24.96311640250297, + "learning_rate": 4.743683001595965e-08, + "logits/chosen": -3.5090701580047607, + "logits/rejected": -3.4941611289978027, + "logps/chosen": -1.6230363845825195, + "logps/rejected": -1.7698160409927368, + "loss": 1.7483, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.246072769165039, + "rewards/margins": 0.293559193611145, + "rewards/rejected": -3.5396320819854736, + "step": 2680 + }, + { + "epoch": 0.463473466574776, + "grad_norm": 15.894487269324646, + "learning_rate": 4.7403569358321205e-08, + "logits/chosen": -3.5090789794921875, + "logits/rejected": -3.4939322471618652, + "logps/chosen": -1.512623906135559, + "logps/rejected": -1.7935421466827393, + "loss": 1.5223, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.025247812271118, + "rewards/margins": 0.5618361830711365, + "rewards/rejected": -3.5870842933654785, + "step": 2690 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 21.152825962774116, + "learning_rate": 4.737010610348001e-08, + "logits/chosen": -3.5270488262176514, + "logits/rejected": -3.512974500656128, + "logps/chosen": -1.4774478673934937, + "logps/rejected": -1.6769657135009766, + "loss": 1.5902, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.9548957347869873, + "rewards/margins": 0.39903539419174194, + "rewards/rejected": -3.353931427001953, + "step": 2700 + }, + { + "epoch": 0.4651964162646451, + "eval_logits/chosen": -3.5952651500701904, + "eval_logits/rejected": -3.5912773609161377, + "eval_logps/chosen": -1.4962078332901, + "eval_logps/rejected": -1.6325432062149048, + "eval_loss": 1.6600419282913208, + "eval_rewards/accuracies": 0.6103624701499939, + "eval_rewards/chosen": -2.9924156665802, + "eval_rewards/margins": 0.27267059683799744, + "eval_rewards/rejected": -3.2650864124298096, + "eval_runtime": 156.8739, + "eval_samples_per_second": 27.436, + "eval_steps_per_second": 3.43, + "step": 2700 + }, + { + "epoch": 0.4669193659545141, + "grad_norm": 23.818693249814505, + "learning_rate": 4.733644055404687e-08, + "logits/chosen": -3.5344905853271484, + "logits/rejected": -3.5250649452209473, + "logps/chosen": -1.5737957954406738, + "logps/rejected": -1.625176191329956, + "loss": 1.8328, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.1475915908813477, + "rewards/margins": 0.10276087373495102, + "rewards/rejected": -3.250352382659912, + "step": 2710 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 19.315113448552225, + "learning_rate": 4.730257301446193e-08, + "logits/chosen": -3.538361072540283, + "logits/rejected": -3.5386557579040527, + "logps/chosen": -1.5454622507095337, + "logps/rejected": -1.6588369607925415, + "loss": 1.7386, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.0909245014190674, + "rewards/margins": 0.226748988032341, + "rewards/rejected": -3.317673921585083, + "step": 2720 + }, + { + "epoch": 0.4703652653342522, + "grad_norm": 17.456384027930188, + "learning_rate": 4.726850379099198e-08, + "logits/chosen": -3.5382239818573, + "logits/rejected": -3.526935577392578, + "logps/chosen": -1.566082239151001, + "logps/rejected": -1.7395288944244385, + "loss": 1.6818, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.132164478302002, + "rewards/margins": 0.3468930125236511, + "rewards/rejected": -3.479057788848877, + "step": 2730 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 25.75366109140946, + "learning_rate": 4.7234233191727604e-08, + "logits/chosen": -3.537454605102539, + "logits/rejected": -3.523944139480591, + "logps/chosen": -1.5369473695755005, + "logps/rejected": -1.5752270221710205, + "loss": 1.8519, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.073894739151001, + "rewards/margins": 0.07655888050794601, + "rewards/rejected": -3.150454044342041, + "step": 2740 + }, + { + "epoch": 0.4738111647139904, + "grad_norm": 18.19229534674573, + "learning_rate": 4.7199761526580484e-08, + "logits/chosen": -3.4892055988311768, + "logits/rejected": -3.4843814373016357, + "logps/chosen": -1.5363198518753052, + "logps/rejected": -1.703956961631775, + "loss": 1.6718, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.0726397037506104, + "rewards/margins": 0.3352746367454529, + "rewards/rejected": -3.40791392326355, + "step": 2750 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 24.328272232403172, + "learning_rate": 4.716508910728054e-08, + "logits/chosen": -3.514357328414917, + "logits/rejected": -3.5018057823181152, + "logps/chosen": -1.5704712867736816, + "logps/rejected": -1.7397445440292358, + "loss": 1.6713, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1409425735473633, + "rewards/margins": 0.3385472893714905, + "rewards/rejected": -3.4794890880584717, + "step": 2760 + }, + { + "epoch": 0.4772570640937285, + "grad_norm": 21.273293187178403, + "learning_rate": 4.713021624737312e-08, + "logits/chosen": -3.597074508666992, + "logits/rejected": -3.5819308757781982, + "logps/chosen": -1.5182253122329712, + "logps/rejected": -1.63119375705719, + "loss": 1.7164, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.0364506244659424, + "rewards/margins": 0.22593703866004944, + "rewards/rejected": -3.26238751411438, + "step": 2770 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 25.202790716498647, + "learning_rate": 4.70951432622162e-08, + "logits/chosen": -3.4886536598205566, + "logits/rejected": -3.4704430103302, + "logps/chosen": -1.5664305686950684, + "logps/rejected": -1.7604808807373047, + "loss": 1.6346, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.1328611373901367, + "rewards/margins": 0.3881005346775055, + "rewards/rejected": -3.5209617614746094, + "step": 2780 + }, + { + "epoch": 0.4807029634734666, + "grad_norm": 19.358149849011784, + "learning_rate": 4.7059870468977484e-08, + "logits/chosen": -3.5623669624328613, + "logits/rejected": -3.542065382003784, + "logps/chosen": -1.4591697454452515, + "logps/rejected": -1.6379585266113281, + "loss": 1.6356, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.918339490890503, + "rewards/margins": 0.3575778901576996, + "rewards/rejected": -3.2759170532226562, + "step": 2790 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 20.83877943539323, + "learning_rate": 4.702439818663153e-08, + "logits/chosen": -3.5364794731140137, + "logits/rejected": -3.5230765342712402, + "logps/chosen": -1.6508305072784424, + "logps/rejected": -1.735446572303772, + "loss": 1.7921, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.3016610145568848, + "rewards/margins": 0.16923236846923828, + "rewards/rejected": -3.470893144607544, + "step": 2800 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -3.591460704803467, + "eval_logits/rejected": -3.587481737136841, + "eval_logps/chosen": -1.4990216493606567, + "eval_logps/rejected": -1.6359308958053589, + "eval_loss": 1.659471869468689, + "eval_rewards/accuracies": 0.607342004776001, + "eval_rewards/chosen": -2.9980432987213135, + "eval_rewards/margins": 0.2738187909126282, + "eval_rewards/rejected": -3.2718617916107178, + "eval_runtime": 156.6298, + "eval_samples_per_second": 27.479, + "eval_steps_per_second": 3.435, + "step": 2800 + }, + { + "epoch": 0.4841488628532047, + "grad_norm": 21.75163485316632, + "learning_rate": 4.6988726735956954e-08, + "logits/chosen": -3.474433183670044, + "logits/rejected": -3.460675001144409, + "logps/chosen": -1.5531630516052246, + "logps/rejected": -1.7034194469451904, + "loss": 1.6531, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.106326103210449, + "rewards/margins": 0.3005131781101227, + "rewards/rejected": -3.406838893890381, + "step": 2810 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 20.67346197530482, + "learning_rate": 4.69528564395334e-08, + "logits/chosen": -3.574214220046997, + "logits/rejected": -3.5675208568573, + "logps/chosen": -1.5892540216445923, + "logps/rejected": -1.6991474628448486, + "loss": 1.7856, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.1785080432891846, + "rewards/margins": 0.21978645026683807, + "rewards/rejected": -3.3982949256896973, + "step": 2820 + }, + { + "epoch": 0.4875947622329428, + "grad_norm": 17.557793212321087, + "learning_rate": 4.691678762173874e-08, + "logits/chosen": -3.448230266571045, + "logits/rejected": -3.4391448497772217, + "logps/chosen": -1.576761245727539, + "logps/rejected": -1.6903159618377686, + "loss": 1.7308, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.153522491455078, + "rewards/margins": 0.22710946202278137, + "rewards/rejected": -3.380631923675537, + "step": 2830 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 23.83017148831422, + "learning_rate": 4.688052060874606e-08, + "logits/chosen": -3.5590946674346924, + "logits/rejected": -3.5524983406066895, + "logps/chosen": -1.6167621612548828, + "logps/rejected": -1.6693241596221924, + "loss": 1.8297, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -3.2335243225097656, + "rewards/margins": 0.10512399673461914, + "rewards/rejected": -3.3386483192443848, + "step": 2840 + }, + { + "epoch": 0.4910406616126809, + "grad_norm": 19.222901486578564, + "learning_rate": 4.684405572852077e-08, + "logits/chosen": -3.5320706367492676, + "logits/rejected": -3.520273208618164, + "logps/chosen": -1.528803825378418, + "logps/rejected": -1.816014289855957, + "loss": 1.5162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.057607650756836, + "rewards/margins": 0.5744206309318542, + "rewards/rejected": -3.632028579711914, + "step": 2850 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 21.715640438635077, + "learning_rate": 4.6807393310817575e-08, + "logits/chosen": -3.5172202587127686, + "logits/rejected": -3.504070281982422, + "logps/chosen": -1.5411921739578247, + "logps/rejected": -1.651689887046814, + "loss": 1.7329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.0823843479156494, + "rewards/margins": 0.22099581360816956, + "rewards/rejected": -3.303379774093628, + "step": 2860 + }, + { + "epoch": 0.494486560992419, + "grad_norm": 20.66113857498892, + "learning_rate": 4.677053368717754e-08, + "logits/chosen": -3.5403544902801514, + "logits/rejected": -3.5256800651550293, + "logps/chosen": -1.5302436351776123, + "logps/rejected": -1.6945724487304688, + "loss": 1.6495, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.0604872703552246, + "rewards/margins": 0.3286581039428711, + "rewards/rejected": -3.3891448974609375, + "step": 2870 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 27.137584764117378, + "learning_rate": 4.673347719092507e-08, + "logits/chosen": -3.582447052001953, + "logits/rejected": -3.568591594696045, + "logps/chosen": -1.6116552352905273, + "logps/rejected": -1.7022960186004639, + "loss": 1.7332, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.2233104705810547, + "rewards/margins": 0.18128186464309692, + "rewards/rejected": -3.4045920372009277, + "step": 2880 + }, + { + "epoch": 0.49793246037215716, + "grad_norm": 16.96494640318737, + "learning_rate": 4.669622415716494e-08, + "logits/chosen": -3.5985405445098877, + "logits/rejected": -3.5956931114196777, + "logps/chosen": -1.5874855518341064, + "logps/rejected": -1.6971670389175415, + "loss": 1.7567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.174971103668213, + "rewards/margins": 0.2193630188703537, + "rewards/rejected": -3.394334077835083, + "step": 2890 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 24.40242723023024, + "learning_rate": 4.665877492277919e-08, + "logits/chosen": -3.5503909587860107, + "logits/rejected": -3.5451762676239014, + "logps/chosen": -1.5206695795059204, + "logps/rejected": -1.5757780075073242, + "loss": 1.8097, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.041339159011841, + "rewards/margins": 0.11021687090396881, + "rewards/rejected": -3.1515560150146484, + "step": 2900 + }, + { + "epoch": 0.4996554100620262, + "eval_logits/chosen": -3.6017730236053467, + "eval_logits/rejected": -3.5978736877441406, + "eval_logps/chosen": -1.502264142036438, + "eval_logps/rejected": -1.6400846242904663, + "eval_loss": 1.658117413520813, + "eval_rewards/accuracies": 0.609433114528656, + "eval_rewards/chosen": -3.004528284072876, + "eval_rewards/margins": 0.2756408154964447, + "eval_rewards/rejected": -3.2801692485809326, + "eval_runtime": 156.4349, + "eval_samples_per_second": 27.513, + "eval_steps_per_second": 3.439, + "step": 2900 + }, + { + "epoch": 0.5013783597518953, + "grad_norm": 23.67658230688155, + "learning_rate": 4.6621129826424115e-08, + "logits/chosen": -3.5685806274414062, + "logits/rejected": -3.5625579357147217, + "logps/chosen": -1.5707086324691772, + "logps/rejected": -1.6956619024276733, + "loss": 1.6986, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.1414172649383545, + "rewards/margins": 0.24990662932395935, + "rewards/rejected": -3.3913238048553467, + "step": 2910 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 21.533720717233635, + "learning_rate": 4.6583289208527247e-08, + "logits/chosen": -3.5343785285949707, + "logits/rejected": -3.5328216552734375, + "logps/chosen": -1.5680859088897705, + "logps/rejected": -1.7383458614349365, + "loss": 1.6529, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.136171817779541, + "rewards/margins": 0.34051990509033203, + "rewards/rejected": -3.476691722869873, + "step": 2920 + }, + { + "epoch": 0.5048242591316333, + "grad_norm": 20.06588074823616, + "learning_rate": 4.654525341128418e-08, + "logits/chosen": -3.516204833984375, + "logits/rejected": -3.499937057495117, + "logps/chosen": -1.493445634841919, + "logps/rejected": -1.732699990272522, + "loss": 1.6043, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.986891269683838, + "rewards/margins": 0.4785088002681732, + "rewards/rejected": -3.465399980545044, + "step": 2930 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 20.25682452682222, + "learning_rate": 4.650702277865558e-08, + "logits/chosen": -3.525402784347534, + "logits/rejected": -3.50477933883667, + "logps/chosen": -1.4905730485916138, + "logps/rejected": -1.7481991052627563, + "loss": 1.565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9811460971832275, + "rewards/margins": 0.5152522325515747, + "rewards/rejected": -3.4963982105255127, + "step": 2940 + }, + { + "epoch": 0.5082701585113715, + "grad_norm": 19.355519283415006, + "learning_rate": 4.6468597656363996e-08, + "logits/chosen": -3.559162139892578, + "logits/rejected": -3.548393726348877, + "logps/chosen": -1.5758719444274902, + "logps/rejected": -1.7218215465545654, + "loss": 1.7044, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.1517438888549805, + "rewards/margins": 0.2918991446495056, + "rewards/rejected": -3.443643093109131, + "step": 2950 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 22.905628924505262, + "learning_rate": 4.642997839189076e-08, + "logits/chosen": -3.527604341506958, + "logits/rejected": -3.5118815898895264, + "logps/chosen": -1.538595199584961, + "logps/rejected": -1.7134153842926025, + "loss": 1.6702, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.077190399169922, + "rewards/margins": 0.34964001178741455, + "rewards/rejected": -3.426830768585205, + "step": 2960 + }, + { + "epoch": 0.5117160578911096, + "grad_norm": 26.875358944936394, + "learning_rate": 4.639116533447286e-08, + "logits/chosen": -3.480087995529175, + "logits/rejected": -3.4620203971862793, + "logps/chosen": -1.5843675136566162, + "logps/rejected": -1.7241653203964233, + "loss": 1.7065, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.1687350273132324, + "rewards/margins": 0.2795955538749695, + "rewards/rejected": -3.4483306407928467, + "step": 2970 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 18.745957854494844, + "learning_rate": 4.6352158835099756e-08, + "logits/chosen": -3.4964210987091064, + "logits/rejected": -3.480994462966919, + "logps/chosen": -1.5148389339447021, + "logps/rejected": -1.6883690357208252, + "loss": 1.6358, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.0296778678894043, + "rewards/margins": 0.34706050157546997, + "rewards/rejected": -3.3767380714416504, + "step": 2980 + }, + { + "epoch": 0.5151619572708477, + "grad_norm": 18.150285355544217, + "learning_rate": 4.6312959246510234e-08, + "logits/chosen": -3.5180251598358154, + "logits/rejected": -3.5006356239318848, + "logps/chosen": -1.5528838634490967, + "logps/rejected": -1.7472543716430664, + "loss": 1.6102, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.1057677268981934, + "rewards/margins": 0.3887407183647156, + "rewards/rejected": -3.494508743286133, + "step": 2990 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 18.766133147314772, + "learning_rate": 4.627356692318919e-08, + "logits/chosen": -3.523806095123291, + "logits/rejected": -3.5196926593780518, + "logps/chosen": -1.4886195659637451, + "logps/rejected": -1.5956366062164307, + "loss": 1.7279, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.9772391319274902, + "rewards/margins": 0.214033842086792, + "rewards/rejected": -3.1912732124328613, + "step": 3000 + }, + { + "epoch": 0.5168849069607168, + "eval_logits/chosen": -3.60276198387146, + "eval_logits/rejected": -3.5988879203796387, + "eval_logps/chosen": -1.5055873394012451, + "eval_logps/rejected": -1.6443113088607788, + "eval_loss": 1.6569929122924805, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -3.0111746788024902, + "eval_rewards/margins": 0.27744823694229126, + "eval_rewards/rejected": -3.2886226177215576, + "eval_runtime": 156.594, + "eval_samples_per_second": 27.485, + "eval_steps_per_second": 3.436, + "step": 3000 + }, + { + "epoch": 0.5186078566505858, + "grad_norm": 19.050395398111565, + "learning_rate": 4.6233982221364434e-08, + "logits/chosen": -3.54716420173645, + "logits/rejected": -3.5339436531066895, + "logps/chosen": -1.5635385513305664, + "logps/rejected": -1.7567265033721924, + "loss": 1.6233, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.127077102661133, + "rewards/margins": 0.38637575507164, + "rewards/rejected": -3.5134530067443848, + "step": 3010 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 19.260032212613318, + "learning_rate": 4.6194205499003467e-08, + "logits/chosen": -3.6005859375, + "logits/rejected": -3.581347942352295, + "logps/chosen": -1.5460759401321411, + "logps/rejected": -1.7060333490371704, + "loss": 1.6888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0921518802642822, + "rewards/margins": 0.31991493701934814, + "rewards/rejected": -3.412066698074341, + "step": 3020 + }, + { + "epoch": 0.5220537560303239, + "grad_norm": 20.707361642872737, + "learning_rate": 4.6154237115810266e-08, + "logits/chosen": -3.523972988128662, + "logits/rejected": -3.5119528770446777, + "logps/chosen": -1.633847951889038, + "logps/rejected": -1.7418301105499268, + "loss": 1.7225, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.267695903778076, + "rewards/margins": 0.21596410870552063, + "rewards/rejected": -3.4836602210998535, + "step": 3030 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 17.13842870421205, + "learning_rate": 4.6114077433221995e-08, + "logits/chosen": -3.5997676849365234, + "logits/rejected": -3.596501588821411, + "logps/chosen": -1.5556844472885132, + "logps/rejected": -1.793182611465454, + "loss": 1.6319, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.1113688945770264, + "rewards/margins": 0.4749966561794281, + "rewards/rejected": -3.586365222930908, + "step": 3040 + }, + { + "epoch": 0.525499655410062, + "grad_norm": 19.09971736532056, + "learning_rate": 4.6073726814405746e-08, + "logits/chosen": -3.50529408454895, + "logits/rejected": -3.4967079162597656, + "logps/chosen": -1.5145231485366821, + "logps/rejected": -1.743211030960083, + "loss": 1.5676, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.0290462970733643, + "rewards/margins": 0.4573756754398346, + "rewards/rejected": -3.486422061920166, + "step": 3050 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 22.044922015941783, + "learning_rate": 4.603318562425528e-08, + "logits/chosen": -3.53715181350708, + "logits/rejected": -3.5272300243377686, + "logps/chosen": -1.6040160655975342, + "logps/rejected": -1.7148224115371704, + "loss": 1.7607, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.2080321311950684, + "rewards/margins": 0.22161240875720978, + "rewards/rejected": -3.429644823074341, + "step": 3060 + }, + { + "epoch": 0.5289455547898001, + "grad_norm": 20.564774809948467, + "learning_rate": 4.59924542293877e-08, + "logits/chosen": -3.5075111389160156, + "logits/rejected": -3.493360996246338, + "logps/chosen": -1.6072747707366943, + "logps/rejected": -1.7730575799942017, + "loss": 1.7245, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.2145495414733887, + "rewards/margins": 0.3315655589103699, + "rewards/rejected": -3.5461151599884033, + "step": 3070 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 23.645175177603708, + "learning_rate": 4.5951532998140134e-08, + "logits/chosen": -3.4444823265075684, + "logits/rejected": -3.4323654174804688, + "logps/chosen": -1.6246315240859985, + "logps/rejected": -1.886561632156372, + "loss": 1.5355, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.249263048171997, + "rewards/margins": 0.5238602161407471, + "rewards/rejected": -3.773123264312744, + "step": 3080 + }, + { + "epoch": 0.5323914541695383, + "grad_norm": 20.16606709632464, + "learning_rate": 4.591042230056644e-08, + "logits/chosen": -3.5102641582489014, + "logits/rejected": -3.5000598430633545, + "logps/chosen": -1.5432647466659546, + "logps/rejected": -1.8028837442398071, + "loss": 1.5332, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.086529493331909, + "rewards/margins": 0.5192378759384155, + "rewards/rejected": -3.6057674884796143, + "step": 3090 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 22.06332337072983, + "learning_rate": 4.5869122508433834e-08, + "logits/chosen": -3.5367438793182373, + "logits/rejected": -3.5194523334503174, + "logps/chosen": -1.515683889389038, + "logps/rejected": -1.682324767112732, + "loss": 1.6241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.031367778778076, + "rewards/margins": 0.33328235149383545, + "rewards/rejected": -3.364649534225464, + "step": 3100 + }, + { + "epoch": 0.5341144038594073, + "eval_logits/chosen": -3.5880203247070312, + "eval_logits/rejected": -3.584097385406494, + "eval_logps/chosen": -1.5081013441085815, + "eval_logps/rejected": -1.6470235586166382, + "eval_loss": 1.6564841270446777, + "eval_rewards/accuracies": 0.6143122911453247, + "eval_rewards/chosen": -3.016202688217163, + "eval_rewards/margins": 0.27784448862075806, + "eval_rewards/rejected": -3.2940471172332764, + "eval_runtime": 156.5889, + "eval_samples_per_second": 27.486, + "eval_steps_per_second": 3.436, + "step": 3100 + }, + { + "epoch": 0.5358373535492763, + "grad_norm": 22.61657398898043, + "learning_rate": 4.5827633995219486e-08, + "logits/chosen": -3.485015392303467, + "logits/rejected": -3.4906506538391113, + "logps/chosen": -1.6232506036758423, + "logps/rejected": -1.774287462234497, + "loss": 1.7608, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.2465012073516846, + "rewards/margins": 0.30207350850105286, + "rewards/rejected": -3.548574924468994, + "step": 3110 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 21.465823531708455, + "learning_rate": 4.5785957136107236e-08, + "logits/chosen": -3.515562057495117, + "logits/rejected": -3.501282215118408, + "logps/chosen": -1.652313232421875, + "logps/rejected": -1.8524677753448486, + "loss": 1.632, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.30462646484375, + "rewards/margins": 0.4003086984157562, + "rewards/rejected": -3.7049355506896973, + "step": 3120 + }, + { + "epoch": 0.5392832529290145, + "grad_norm": 19.50958558128429, + "learning_rate": 4.574409230798413e-08, + "logits/chosen": -3.473536729812622, + "logits/rejected": -3.4665894508361816, + "logps/chosen": -1.57256281375885, + "logps/rejected": -1.7457729578018188, + "loss": 1.6611, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.1451256275177, + "rewards/margins": 0.34642040729522705, + "rewards/rejected": -3.4915459156036377, + "step": 3130 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 20.29450802521176, + "learning_rate": 4.5702039889437015e-08, + "logits/chosen": -3.5421128273010254, + "logits/rejected": -3.5337085723876953, + "logps/chosen": -1.5624816417694092, + "logps/rejected": -1.768216848373413, + "loss": 1.6375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.1249632835388184, + "rewards/margins": 0.41147032380104065, + "rewards/rejected": -3.536433696746826, + "step": 3140 + }, + { + "epoch": 0.5427291523087526, + "grad_norm": 18.203886475110178, + "learning_rate": 4.565980026074917e-08, + "logits/chosen": -3.529654026031494, + "logits/rejected": -3.5151774883270264, + "logps/chosen": -1.5307319164276123, + "logps/rejected": -1.7041393518447876, + "loss": 1.6552, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.0614638328552246, + "rewards/margins": 0.3468155860900879, + "rewards/rejected": -3.408278703689575, + "step": 3150 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 19.77320049107566, + "learning_rate": 4.5617373803896796e-08, + "logits/chosen": -3.5037784576416016, + "logits/rejected": -3.4868767261505127, + "logps/chosen": -1.4582583904266357, + "logps/rejected": -1.6604583263397217, + "loss": 1.5966, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9165167808532715, + "rewards/margins": 0.4043997824192047, + "rewards/rejected": -3.3209166526794434, + "step": 3160 + }, + { + "epoch": 0.5461750516884907, + "grad_norm": 24.28769758642667, + "learning_rate": 4.557476090254562e-08, + "logits/chosen": -3.5170352458953857, + "logits/rejected": -3.4969589710235596, + "logps/chosen": -1.5380915403366089, + "logps/rejected": -1.6670929193496704, + "loss": 1.7092, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.0761830806732178, + "rewards/margins": 0.2580029368400574, + "rewards/rejected": -3.334185838699341, + "step": 3170 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 25.299694352214047, + "learning_rate": 4.5531961942047384e-08, + "logits/chosen": -3.5451698303222656, + "logits/rejected": -3.5239338874816895, + "logps/chosen": -1.5409469604492188, + "logps/rejected": -1.764762282371521, + "loss": 1.595, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.0818939208984375, + "rewards/margins": 0.44763070344924927, + "rewards/rejected": -3.529524564743042, + "step": 3180 + }, + { + "epoch": 0.5496209510682288, + "grad_norm": 17.38667090280676, + "learning_rate": 4.548897730943638e-08, + "logits/chosen": -3.5432746410369873, + "logits/rejected": -3.5396530628204346, + "logps/chosen": -1.5347793102264404, + "logps/rejected": -1.7214361429214478, + "loss": 1.6229, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.069558620452881, + "rewards/margins": 0.3733134865760803, + "rewards/rejected": -3.4428722858428955, + "step": 3190 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 23.301006937369813, + "learning_rate": 4.544580739342596e-08, + "logits/chosen": -3.5083885192871094, + "logits/rejected": -3.501107692718506, + "logps/chosen": -1.5942661762237549, + "logps/rejected": -1.6617510318756104, + "loss": 1.7948, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.1885323524475098, + "rewards/margins": 0.13496962189674377, + "rewards/rejected": -3.3235020637512207, + "step": 3200 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -3.5796115398406982, + "eval_logits/rejected": -3.575671672821045, + "eval_logps/chosen": -1.511987566947937, + "eval_logps/rejected": -1.6522151231765747, + "eval_loss": 1.6549086570739746, + "eval_rewards/accuracies": 0.6152416467666626, + "eval_rewards/chosen": -3.023975133895874, + "eval_rewards/margins": 0.28045499324798584, + "eval_rewards/rejected": -3.3044302463531494, + "eval_runtime": 156.681, + "eval_samples_per_second": 27.47, + "eval_steps_per_second": 3.434, + "step": 3200 + }, + { + "epoch": 0.5530668504479669, + "grad_norm": 18.09016245589862, + "learning_rate": 4.540245258440499e-08, + "logits/chosen": -3.4462122917175293, + "logits/rejected": -3.4327616691589355, + "logps/chosen": -1.539306640625, + "logps/rejected": -1.7113540172576904, + "loss": 1.6231, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.07861328125, + "rewards/margins": 0.3440949320793152, + "rewards/rejected": -3.422708034515381, + "step": 3210 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 16.870902650896433, + "learning_rate": 4.535891327443435e-08, + "logits/chosen": -3.4663162231445312, + "logits/rejected": -3.4632110595703125, + "logps/chosen": -1.5624347925186157, + "logps/rejected": -1.7508165836334229, + "loss": 1.6039, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.1248695850372314, + "rewards/margins": 0.3767639994621277, + "rewards/rejected": -3.5016331672668457, + "step": 3220 + }, + { + "epoch": 0.556512749827705, + "grad_norm": 21.160078092628815, + "learning_rate": 4.531518985724338e-08, + "logits/chosen": -3.502720594406128, + "logits/rejected": -3.4935011863708496, + "logps/chosen": -1.5427656173706055, + "logps/rejected": -1.7398267984390259, + "loss": 1.6359, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.085531234741211, + "rewards/margins": 0.3941222131252289, + "rewards/rejected": -3.4796535968780518, + "step": 3230 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 21.995930084183374, + "learning_rate": 4.527128272822629e-08, + "logits/chosen": -3.570192337036133, + "logits/rejected": -3.555345058441162, + "logps/chosen": -1.629023790359497, + "logps/rejected": -1.7916666269302368, + "loss": 1.7055, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.258047580718994, + "rewards/margins": 0.32528600096702576, + "rewards/rejected": -3.5833332538604736, + "step": 3240 + }, + { + "epoch": 0.5599586492074431, + "grad_norm": 19.5607094902708, + "learning_rate": 4.5227192284438634e-08, + "logits/chosen": -3.5141613483428955, + "logits/rejected": -3.5007147789001465, + "logps/chosen": -1.5176727771759033, + "logps/rejected": -1.6937882900238037, + "loss": 1.6095, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.0353455543518066, + "rewards/margins": 0.35223108530044556, + "rewards/rejected": -3.3875765800476074, + "step": 3250 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 22.10198370662188, + "learning_rate": 4.5182918924593705e-08, + "logits/chosen": -3.51397442817688, + "logits/rejected": -3.505155086517334, + "logps/chosen": -1.5539159774780273, + "logps/rejected": -1.7412288188934326, + "loss": 1.6195, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1078319549560547, + "rewards/margins": 0.37462592124938965, + "rewards/rejected": -3.4824576377868652, + "step": 3260 + }, + { + "epoch": 0.5634045485871813, + "grad_norm": 20.372770269447795, + "learning_rate": 4.5138463049058887e-08, + "logits/chosen": -3.555504322052002, + "logits/rejected": -3.552196979522705, + "logps/chosen": -1.6757938861846924, + "logps/rejected": -1.8228938579559326, + "loss": 1.7234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.3515877723693848, + "rewards/margins": 0.2942003905773163, + "rewards/rejected": -3.6457877159118652, + "step": 3270 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 22.555524733843377, + "learning_rate": 4.5093825059852096e-08, + "logits/chosen": -3.5553245544433594, + "logits/rejected": -3.550888776779175, + "logps/chosen": -1.5342967510223389, + "logps/rejected": -1.7154210805892944, + "loss": 1.6376, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.0685935020446777, + "rewards/margins": 0.3622487783432007, + "rewards/rejected": -3.430842161178589, + "step": 3280 + }, + { + "epoch": 0.5668504479669194, + "grad_norm": 22.212302775556836, + "learning_rate": 4.50490053606381e-08, + "logits/chosen": -3.582587480545044, + "logits/rejected": -3.5653979778289795, + "logps/chosen": -1.5807745456695557, + "logps/rejected": -1.7688995599746704, + "loss": 1.6242, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.1615490913391113, + "rewards/margins": 0.3762495219707489, + "rewards/rejected": -3.537799119949341, + "step": 3290 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 21.327933756495668, + "learning_rate": 4.5004004356724893e-08, + "logits/chosen": -3.5153117179870605, + "logits/rejected": -3.502204179763794, + "logps/chosen": -1.6156543493270874, + "logps/rejected": -1.7519696950912476, + "loss": 1.7049, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.231308698654175, + "rewards/margins": 0.2726311981678009, + "rewards/rejected": -3.503939390182495, + "step": 3300 + }, + { + "epoch": 0.5685733976567884, + "eval_logits/chosen": -3.5840988159179688, + "eval_logits/rejected": -3.5802087783813477, + "eval_logps/chosen": -1.5169111490249634, + "eval_logps/rejected": -1.6580452919006348, + "eval_loss": 1.6538208723068237, + "eval_rewards/accuracies": 0.6154739856719971, + "eval_rewards/chosen": -3.0338222980499268, + "eval_rewards/margins": 0.2822684347629547, + "eval_rewards/rejected": -3.3160905838012695, + "eval_runtime": 156.705, + "eval_samples_per_second": 27.466, + "eval_steps_per_second": 3.433, + "step": 3300 + }, + { + "epoch": 0.5702963473466575, + "grad_norm": 21.679167328737122, + "learning_rate": 4.495882245506002e-08, + "logits/chosen": -3.4700264930725098, + "logits/rejected": -3.455059051513672, + "logps/chosen": -1.5428388118743896, + "logps/rejected": -1.7376238107681274, + "loss": 1.6195, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0856776237487793, + "rewards/margins": 0.38957005739212036, + "rewards/rejected": -3.475247621536255, + "step": 3310 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 25.603498684548633, + "learning_rate": 4.4913460064226896e-08, + "logits/chosen": -3.518009662628174, + "logits/rejected": -3.4975593090057373, + "logps/chosen": -1.5803298950195312, + "logps/rejected": -1.7262861728668213, + "loss": 1.666, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.1606597900390625, + "rewards/margins": 0.2919124960899353, + "rewards/rejected": -3.4525723457336426, + "step": 3320 + }, + { + "epoch": 0.5737422467263956, + "grad_norm": 28.944264175946945, + "learning_rate": 4.486791759444111e-08, + "logits/chosen": -3.56878399848938, + "logits/rejected": -3.5499520301818848, + "logps/chosen": -1.5471227169036865, + "logps/rejected": -1.7770448923110962, + "loss": 1.5814, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.094245433807373, + "rewards/margins": 0.4598442018032074, + "rewards/rejected": -3.5540897846221924, + "step": 3330 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 21.293102422654563, + "learning_rate": 4.482219545754672e-08, + "logits/chosen": -3.5328147411346436, + "logits/rejected": -3.523144245147705, + "logps/chosen": -1.6290180683135986, + "logps/rejected": -1.7476980686187744, + "loss": 1.7738, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.2580361366271973, + "rewards/margins": 0.23736043274402618, + "rewards/rejected": -3.495396137237549, + "step": 3340 + }, + { + "epoch": 0.5771881461061337, + "grad_norm": 21.646065919154257, + "learning_rate": 4.4776294067012546e-08, + "logits/chosen": -3.470078229904175, + "logits/rejected": -3.461217164993286, + "logps/chosen": -1.6474339962005615, + "logps/rejected": -1.9271421432495117, + "loss": 1.5833, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.294867992401123, + "rewards/margins": 0.5594164133071899, + "rewards/rejected": -3.8542842864990234, + "step": 3350 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 21.080959424054257, + "learning_rate": 4.473021383792838e-08, + "logits/chosen": -3.5456137657165527, + "logits/rejected": -3.5254878997802734, + "logps/chosen": -1.5511671304702759, + "logps/rejected": -1.8334741592407227, + "loss": 1.5623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.1023342609405518, + "rewards/margins": 0.5646138191223145, + "rewards/rejected": -3.6669483184814453, + "step": 3360 + }, + { + "epoch": 0.5806340454858718, + "grad_norm": 21.770103048345664, + "learning_rate": 4.468395518700129e-08, + "logits/chosen": -3.4885201454162598, + "logits/rejected": -3.4819750785827637, + "logps/chosen": -1.570568323135376, + "logps/rejected": -1.7823537588119507, + "loss": 1.6124, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.141136646270752, + "rewards/margins": 0.4235707223415375, + "rewards/rejected": -3.5647075176239014, + "step": 3370 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 22.708915579764643, + "learning_rate": 4.463751853255182e-08, + "logits/chosen": -3.5486316680908203, + "logits/rejected": -3.5280730724334717, + "logps/chosen": -1.5971581935882568, + "logps/rejected": -1.7577226161956787, + "loss": 1.6647, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.1943163871765137, + "rewards/margins": 0.321128785610199, + "rewards/rejected": -3.5154452323913574, + "step": 3380 + }, + { + "epoch": 0.5840799448656099, + "grad_norm": 20.6616118324372, + "learning_rate": 4.45909042945102e-08, + "logits/chosen": -3.515942096710205, + "logits/rejected": -3.500392198562622, + "logps/chosen": -1.6013892889022827, + "logps/rejected": -1.7677446603775024, + "loss": 1.6614, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.2027785778045654, + "rewards/margins": 0.3327101469039917, + "rewards/rejected": -3.535489320755005, + "step": 3390 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 21.108183190199703, + "learning_rate": 4.454411289441259e-08, + "logits/chosen": -3.5433859825134277, + "logits/rejected": -3.5207626819610596, + "logps/chosen": -1.5307109355926514, + "logps/rejected": -1.789168357849121, + "loss": 1.543, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0614218711853027, + "rewards/margins": 0.5169152021408081, + "rewards/rejected": -3.578336715698242, + "step": 3400 + }, + { + "epoch": 0.585802894555479, + "eval_logits/chosen": -3.575551986694336, + "eval_logits/rejected": -3.571643590927124, + "eval_logps/chosen": -1.523762583732605, + "eval_logps/rejected": -1.6658252477645874, + "eval_loss": 1.6526745557785034, + "eval_rewards/accuracies": 0.6150093078613281, + "eval_rewards/chosen": -3.04752516746521, + "eval_rewards/margins": 0.2841256260871887, + "eval_rewards/rejected": -3.331650495529175, + "eval_runtime": 156.4592, + "eval_samples_per_second": 27.509, + "eval_steps_per_second": 3.439, + "step": 3400 + }, + { + "epoch": 0.587525844245348, + "grad_norm": 16.82487683977254, + "learning_rate": 4.4497144755397215e-08, + "logits/chosen": -3.475933790206909, + "logits/rejected": -3.4593138694763184, + "logps/chosen": -1.513566017150879, + "logps/rejected": -1.6990461349487305, + "loss": 1.6397, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.027132034301758, + "rewards/margins": 0.3709601163864136, + "rewards/rejected": -3.398092269897461, + "step": 3410 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 20.643045213933963, + "learning_rate": 4.4450000302200574e-08, + "logits/chosen": -3.494248867034912, + "logits/rejected": -3.478797197341919, + "logps/chosen": -1.4995458126068115, + "logps/rejected": -1.7580335140228271, + "loss": 1.5348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.999091625213623, + "rewards/margins": 0.5169753432273865, + "rewards/rejected": -3.5160670280456543, + "step": 3420 + }, + { + "epoch": 0.5909717436250862, + "grad_norm": 18.504600038550002, + "learning_rate": 4.440267996115359e-08, + "logits/chosen": -3.5350253582000732, + "logits/rejected": -3.5231475830078125, + "logps/chosen": -1.532119631767273, + "logps/rejected": -1.7873508930206299, + "loss": 1.5508, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.064239263534546, + "rewards/margins": 0.5104620456695557, + "rewards/rejected": -3.5747017860412598, + "step": 3430 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 18.7050385437194, + "learning_rate": 4.435518416017774e-08, + "logits/chosen": -3.4847183227539062, + "logits/rejected": -3.4754958152770996, + "logps/chosen": -1.6193721294403076, + "logps/rejected": -1.873342514038086, + "loss": 1.5969, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.2387442588806152, + "rewards/margins": 0.507940411567688, + "rewards/rejected": -3.746685028076172, + "step": 3440 + }, + { + "epoch": 0.5944176430048242, + "grad_norm": 21.39463795979804, + "learning_rate": 4.430751332878122e-08, + "logits/chosen": -3.6356494426727295, + "logits/rejected": -3.6142475605010986, + "logps/chosen": -1.624468207359314, + "logps/rejected": -1.8694798946380615, + "loss": 1.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.248936414718628, + "rewards/margins": 0.490023672580719, + "rewards/rejected": -3.738959789276123, + "step": 3450 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 21.780286574835966, + "learning_rate": 4.425966789805503e-08, + "logits/chosen": -3.5139224529266357, + "logits/rejected": -3.5047507286071777, + "logps/chosen": -1.5473768711090088, + "logps/rejected": -1.7656255960464478, + "loss": 1.6057, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.0947537422180176, + "rewards/margins": 0.4364975392818451, + "rewards/rejected": -3.5312511920928955, + "step": 3460 + }, + { + "epoch": 0.5978635423845624, + "grad_norm": 19.246292701235816, + "learning_rate": 4.4211648300669074e-08, + "logits/chosen": -3.561131238937378, + "logits/rejected": -3.5516982078552246, + "logps/chosen": -1.6029859781265259, + "logps/rejected": -1.7689380645751953, + "loss": 1.6538, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.2059719562530518, + "rewards/margins": 0.33190399408340454, + "rewards/rejected": -3.5378761291503906, + "step": 3470 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 22.9708867674151, + "learning_rate": 4.416345497086827e-08, + "logits/chosen": -3.508579969406128, + "logits/rejected": -3.488091230392456, + "logps/chosen": -1.5746628046035767, + "logps/rejected": -1.7909082174301147, + "loss": 1.6497, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.1493256092071533, + "rewards/margins": 0.43249064683914185, + "rewards/rejected": -3.5818164348602295, + "step": 3480 + }, + { + "epoch": 0.6013094417643005, + "grad_norm": 23.122590243408812, + "learning_rate": 4.411508834446863e-08, + "logits/chosen": -3.513617753982544, + "logits/rejected": -3.4974639415740967, + "logps/chosen": -1.5689325332641602, + "logps/rejected": -1.7848838567733765, + "loss": 1.6383, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.1378650665283203, + "rewards/margins": 0.43190279603004456, + "rewards/rejected": -3.569767713546753, + "step": 3490 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 15.50382914169406, + "learning_rate": 4.406654885885326e-08, + "logits/chosen": -3.478710889816284, + "logits/rejected": -3.4744277000427246, + "logps/chosen": -1.5697340965270996, + "logps/rejected": -1.754762053489685, + "loss": 1.6486, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.139468193054199, + "rewards/margins": 0.3700560927391052, + "rewards/rejected": -3.50952410697937, + "step": 3500 + }, + { + "epoch": 0.6030323914541695, + "eval_logits/chosen": -3.5703976154327393, + "eval_logits/rejected": -3.5664963722229004, + "eval_logps/chosen": -1.5292646884918213, + "eval_logps/rejected": -1.6728681325912476, + "eval_loss": 1.6510655879974365, + "eval_rewards/accuracies": 0.6145446300506592, + "eval_rewards/chosen": -3.0585293769836426, + "eval_rewards/margins": 0.28720709681510925, + "eval_rewards/rejected": -3.345736265182495, + "eval_runtime": 156.3653, + "eval_samples_per_second": 27.525, + "eval_steps_per_second": 3.441, + "step": 3500 + }, + { + "epoch": 0.6047553411440386, + "grad_norm": 22.29886046193453, + "learning_rate": 4.401783695296847e-08, + "logits/chosen": -3.4483230113983154, + "logits/rejected": -3.4364447593688965, + "logps/chosen": -1.721631646156311, + "logps/rejected": -1.8468137979507446, + "loss": 1.7322, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.443263292312622, + "rewards/margins": 0.2503639757633209, + "rewards/rejected": -3.6936275959014893, + "step": 3510 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 18.98119878587945, + "learning_rate": 4.3968953067319766e-08, + "logits/chosen": -3.471553087234497, + "logits/rejected": -3.461832046508789, + "logps/chosen": -1.6318126916885376, + "logps/rejected": -1.7605682611465454, + "loss": 1.7338, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.263625383377075, + "rewards/margins": 0.2575114667415619, + "rewards/rejected": -3.521136522293091, + "step": 3520 + }, + { + "epoch": 0.6082012405237767, + "grad_norm": 20.57754929882489, + "learning_rate": 4.391989764396792e-08, + "logits/chosen": -3.571039915084839, + "logits/rejected": -3.5478732585906982, + "logps/chosen": -1.5793527364730835, + "logps/rejected": -1.7808879613876343, + "loss": 1.6339, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.158705472946167, + "rewards/margins": 0.4030703604221344, + "rewards/rejected": -3.5617759227752686, + "step": 3530 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 20.105181655361378, + "learning_rate": 4.387067112652487e-08, + "logits/chosen": -3.458124876022339, + "logits/rejected": -3.4447808265686035, + "logps/chosen": -1.6232678890228271, + "logps/rejected": -1.8294509649276733, + "loss": 1.6243, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.2465357780456543, + "rewards/margins": 0.41236647963523865, + "rewards/rejected": -3.6589019298553467, + "step": 3540 + }, + { + "epoch": 0.6116471399035148, + "grad_norm": 25.393004927248732, + "learning_rate": 4.382127396014982e-08, + "logits/chosen": -3.5119049549102783, + "logits/rejected": -3.505063533782959, + "logps/chosen": -1.6516516208648682, + "logps/rejected": -1.7571630477905273, + "loss": 1.7394, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.3033032417297363, + "rewards/margins": 0.21102304756641388, + "rewards/rejected": -3.5143260955810547, + "step": 3550 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 20.919342579803196, + "learning_rate": 4.377170659154514e-08, + "logits/chosen": -3.4713821411132812, + "logits/rejected": -3.454129695892334, + "logps/chosen": -1.5624113082885742, + "logps/rejected": -1.8053419589996338, + "loss": 1.5454, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1248226165771484, + "rewards/margins": 0.4858609139919281, + "rewards/rejected": -3.6106839179992676, + "step": 3560 + }, + { + "epoch": 0.6150930392832529, + "grad_norm": 23.017179568357527, + "learning_rate": 4.372196946895238e-08, + "logits/chosen": -3.558825969696045, + "logits/rejected": -3.5400726795196533, + "logps/chosen": -1.5892484188079834, + "logps/rejected": -1.755082368850708, + "loss": 1.6787, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.178496837615967, + "rewards/margins": 0.3316681385040283, + "rewards/rejected": -3.510164737701416, + "step": 3570 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 22.31539552235463, + "learning_rate": 4.367206304214815e-08, + "logits/chosen": -3.5113017559051514, + "logits/rejected": -3.496495485305786, + "logps/chosen": -1.6263859272003174, + "logps/rejected": -1.8575493097305298, + "loss": 1.5525, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2527718544006348, + "rewards/margins": 0.46232661604881287, + "rewards/rejected": -3.7150986194610596, + "step": 3580 + }, + { + "epoch": 0.618538938662991, + "grad_norm": 21.267612715111294, + "learning_rate": 4.3621987762440115e-08, + "logits/chosen": -3.5221686363220215, + "logits/rejected": -3.5131359100341797, + "logps/chosen": -1.6660581827163696, + "logps/rejected": -1.8160755634307861, + "loss": 1.6868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.3321163654327393, + "rewards/margins": 0.30003491044044495, + "rewards/rejected": -3.6321511268615723, + "step": 3590 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 28.304200647673298, + "learning_rate": 4.3571744082662884e-08, + "logits/chosen": -3.4943509101867676, + "logits/rejected": -3.481778383255005, + "logps/chosen": -1.667035460472107, + "logps/rejected": -1.7948644161224365, + "loss": 1.7359, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.334070920944214, + "rewards/margins": 0.2556580603122711, + "rewards/rejected": -3.589728832244873, + "step": 3600 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -3.583289623260498, + "eval_logits/rejected": -3.579462766647339, + "eval_logps/chosen": -1.5357015132904053, + "eval_logps/rejected": -1.6801916360855103, + "eval_loss": 1.6500396728515625, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -3.0714030265808105, + "eval_rewards/margins": 0.28898051381111145, + "eval_rewards/rejected": -3.3603832721710205, + "eval_runtime": 156.6415, + "eval_samples_per_second": 27.477, + "eval_steps_per_second": 3.435, + "step": 3600 + }, + { + "epoch": 0.6219848380427292, + "grad_norm": 18.362961629827968, + "learning_rate": 4.352133245717393e-08, + "logits/chosen": -3.484989881515503, + "logits/rejected": -3.4745235443115234, + "logps/chosen": -1.6583967208862305, + "logps/rejected": -1.741321325302124, + "loss": 1.7713, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.316793441772461, + "rewards/margins": 0.16584941744804382, + "rewards/rejected": -3.482642650604248, + "step": 3610 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 21.323593124309703, + "learning_rate": 4.347075334184946e-08, + "logits/chosen": -3.4707393646240234, + "logits/rejected": -3.4574012756347656, + "logps/chosen": -1.5460138320922852, + "logps/rejected": -1.7408815622329712, + "loss": 1.6498, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.0920276641845703, + "rewards/margins": 0.389735609292984, + "rewards/rejected": -3.4817631244659424, + "step": 3620 + }, + { + "epoch": 0.6254307374224672, + "grad_norm": 18.71569368538105, + "learning_rate": 4.34200071940803e-08, + "logits/chosen": -3.5621440410614014, + "logits/rejected": -3.5573132038116455, + "logps/chosen": -1.600992202758789, + "logps/rejected": -1.7913658618927002, + "loss": 1.6231, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.201984405517578, + "rewards/margins": 0.3807474672794342, + "rewards/rejected": -3.5827317237854004, + "step": 3630 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 19.261910901312337, + "learning_rate": 4.3369094472767786e-08, + "logits/chosen": -3.5214202404022217, + "logits/rejected": -3.512678623199463, + "logps/chosen": -1.601197600364685, + "logps/rejected": -1.7796001434326172, + "loss": 1.648, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.20239520072937, + "rewards/margins": 0.3568049669265747, + "rewards/rejected": -3.5592002868652344, + "step": 3640 + }, + { + "epoch": 0.6288766368022054, + "grad_norm": 19.719993584275006, + "learning_rate": 4.331801563831956e-08, + "logits/chosen": -3.4922096729278564, + "logits/rejected": -3.493436098098755, + "logps/chosen": -1.5860096216201782, + "logps/rejected": -1.7005609273910522, + "loss": 1.7123, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1720192432403564, + "rewards/margins": 0.22910240292549133, + "rewards/rejected": -3.4011218547821045, + "step": 3650 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 16.998889027459683, + "learning_rate": 4.326677115264547e-08, + "logits/chosen": -3.5350887775421143, + "logits/rejected": -3.5195841789245605, + "logps/chosen": -1.6006271839141846, + "logps/rejected": -1.8400976657867432, + "loss": 1.5517, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.201254367828369, + "rewards/margins": 0.47894150018692017, + "rewards/rejected": -3.6801953315734863, + "step": 3660 + }, + { + "epoch": 0.6323225361819435, + "grad_norm": 19.25527910303187, + "learning_rate": 4.321536147915334e-08, + "logits/chosen": -3.507047653198242, + "logits/rejected": -3.4921467304229736, + "logps/chosen": -1.5557204484939575, + "logps/rejected": -1.7485002279281616, + "loss": 1.6086, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.111440896987915, + "rewards/margins": 0.3855598568916321, + "rewards/rejected": -3.4970004558563232, + "step": 3670 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 18.082137968870132, + "learning_rate": 4.3163787082744806e-08, + "logits/chosen": -3.5070457458496094, + "logits/rejected": -3.4832539558410645, + "logps/chosen": -1.5834510326385498, + "logps/rejected": -1.8070939779281616, + "loss": 1.6122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.1669020652770996, + "rewards/margins": 0.44728603959083557, + "rewards/rejected": -3.6141879558563232, + "step": 3680 + }, + { + "epoch": 0.6357684355616816, + "grad_norm": 24.87352331697395, + "learning_rate": 4.31120484298111e-08, + "logits/chosen": -3.470752716064453, + "logits/rejected": -3.4754955768585205, + "logps/chosen": -1.6419143676757812, + "logps/rejected": -1.8033899068832397, + "loss": 1.6887, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2838287353515625, + "rewards/margins": 0.32295089960098267, + "rewards/rejected": -3.6067798137664795, + "step": 3690 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 21.175970402928275, + "learning_rate": 4.306014598822886e-08, + "logits/chosen": -3.489018201828003, + "logits/rejected": -3.4745326042175293, + "logps/chosen": -1.5837953090667725, + "logps/rejected": -1.8907371759414673, + "loss": 1.4877, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.167590618133545, + "rewards/margins": 0.613883376121521, + "rewards/rejected": -3.7814743518829346, + "step": 3700 + }, + { + "epoch": 0.6374913852515507, + "eval_logits/chosen": -3.5780184268951416, + "eval_logits/rejected": -3.5741944313049316, + "eval_logps/chosen": -1.5419650077819824, + "eval_logps/rejected": -1.687447428703308, + "eval_loss": 1.6489049196243286, + "eval_rewards/accuracies": 0.6117565035820007, + "eval_rewards/chosen": -3.083930015563965, + "eval_rewards/margins": 0.29096490144729614, + "eval_rewards/rejected": -3.374894857406616, + "eval_runtime": 156.7107, + "eval_samples_per_second": 27.465, + "eval_steps_per_second": 3.433, + "step": 3700 + }, + { + "epoch": 0.6392143349414197, + "grad_norm": 21.199368564559006, + "learning_rate": 4.300808022735584e-08, + "logits/chosen": -3.518216371536255, + "logits/rejected": -3.4965262413024902, + "logps/chosen": -1.5694875717163086, + "logps/rejected": -1.7815688848495483, + "loss": 1.6212, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.138975143432617, + "rewards/margins": 0.42416271567344666, + "rewards/rejected": -3.5631377696990967, + "step": 3710 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 19.668637884855393, + "learning_rate": 4.295585161802674e-08, + "logits/chosen": -3.5326778888702393, + "logits/rejected": -3.5229575634002686, + "logps/chosen": -1.5495307445526123, + "logps/rejected": -1.8039547204971313, + "loss": 1.5556, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.0990614891052246, + "rewards/margins": 0.5088481903076172, + "rewards/rejected": -3.6079094409942627, + "step": 3720 + }, + { + "epoch": 0.6426602343211578, + "grad_norm": 23.00959694622578, + "learning_rate": 4.290346063254889e-08, + "logits/chosen": -3.5017476081848145, + "logits/rejected": -3.490367889404297, + "logps/chosen": -1.6386464834213257, + "logps/rejected": -1.769214391708374, + "loss": 1.7011, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.2772929668426514, + "rewards/margins": 0.2611355781555176, + "rewards/rejected": -3.538428783416748, + "step": 3730 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 20.90858620284605, + "learning_rate": 4.285090774469802e-08, + "logits/chosen": -3.498084306716919, + "logits/rejected": -3.485083818435669, + "logps/chosen": -1.6079012155532837, + "logps/rejected": -1.85768723487854, + "loss": 1.5716, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.2158024311065674, + "rewards/margins": 0.49957188963890076, + "rewards/rejected": -3.71537446975708, + "step": 3740 + }, + { + "epoch": 0.646106133700896, + "grad_norm": 20.704303295325644, + "learning_rate": 4.279819342971391e-08, + "logits/chosen": -3.5477099418640137, + "logits/rejected": -3.531142473220825, + "logps/chosen": -1.540325403213501, + "logps/rejected": -1.760369062423706, + "loss": 1.5762, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.080650806427002, + "rewards/margins": 0.44008731842041016, + "rewards/rejected": -3.520738124847412, + "step": 3750 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 23.407593061439524, + "learning_rate": 4.27453181642962e-08, + "logits/chosen": -3.531482696533203, + "logits/rejected": -3.5305285453796387, + "logps/chosen": -1.7031500339508057, + "logps/rejected": -1.788808822631836, + "loss": 1.7919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.4063000679016113, + "rewards/margins": 0.17131750285625458, + "rewards/rejected": -3.577617645263672, + "step": 3760 + }, + { + "epoch": 0.649552033080634, + "grad_norm": 19.452589109579158, + "learning_rate": 4.269228242659997e-08, + "logits/chosen": -3.4985294342041016, + "logits/rejected": -3.4864680767059326, + "logps/chosen": -1.6103837490081787, + "logps/rejected": -1.8434162139892578, + "loss": 1.6227, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.2207674980163574, + "rewards/margins": 0.46606501936912537, + "rewards/rejected": -3.6868324279785156, + "step": 3770 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 23.42530377801617, + "learning_rate": 4.2639086696231486e-08, + "logits/chosen": -3.5350756645202637, + "logits/rejected": -3.5143585205078125, + "logps/chosen": -1.6326147317886353, + "logps/rejected": -1.8029708862304688, + "loss": 1.6695, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.2652294635772705, + "rewards/margins": 0.340712308883667, + "rewards/rejected": -3.6059417724609375, + "step": 3780 + }, + { + "epoch": 0.6529979324603722, + "grad_norm": 20.933357744560663, + "learning_rate": 4.2585731454243836e-08, + "logits/chosen": -3.494610548019409, + "logits/rejected": -3.4775424003601074, + "logps/chosen": -1.637798547744751, + "logps/rejected": -1.887102484703064, + "loss": 1.7018, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.275597095489502, + "rewards/margins": 0.4986083507537842, + "rewards/rejected": -3.774204969406128, + "step": 3790 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 27.485690854300742, + "learning_rate": 4.2532217183132566e-08, + "logits/chosen": -3.503495454788208, + "logits/rejected": -3.484673023223877, + "logps/chosen": -1.6580177545547485, + "logps/rejected": -1.7828104496002197, + "loss": 1.7414, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.316035509109497, + "rewards/margins": 0.2495851069688797, + "rewards/rejected": -3.5656208992004395, + "step": 3800 + }, + { + "epoch": 0.6547208821502413, + "eval_logits/chosen": -3.565796375274658, + "eval_logits/rejected": -3.56192946434021, + "eval_logps/chosen": -1.550700306892395, + "eval_logps/rejected": -1.696800708770752, + "eval_loss": 1.6483756303787231, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -3.10140061378479, + "eval_rewards/margins": 0.29220104217529297, + "eval_rewards/rejected": -3.393601417541504, + "eval_runtime": 156.6412, + "eval_samples_per_second": 27.477, + "eval_steps_per_second": 3.435, + "step": 3800 + }, + { + "epoch": 0.6564438318401102, + "grad_norm": 20.133480070529412, + "learning_rate": 4.247854436683137e-08, + "logits/chosen": -3.5265660285949707, + "logits/rejected": -3.500843048095703, + "logps/chosen": -1.604543685913086, + "logps/rejected": -1.788063645362854, + "loss": 1.6292, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.209087371826172, + "rewards/margins": 0.36703991889953613, + "rewards/rejected": -3.576127290725708, + "step": 3810 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 20.920457830727724, + "learning_rate": 4.242471349070765e-08, + "logits/chosen": -3.5083422660827637, + "logits/rejected": -3.4994397163391113, + "logps/chosen": -1.5892269611358643, + "logps/rejected": -1.8270584344863892, + "loss": 1.6352, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.1784539222717285, + "rewards/margins": 0.47566255927085876, + "rewards/rejected": -3.6541168689727783, + "step": 3820 + }, + { + "epoch": 0.6598897312198484, + "grad_norm": 23.42467987703444, + "learning_rate": 4.237072504155817e-08, + "logits/chosen": -3.5431761741638184, + "logits/rejected": -3.5177981853485107, + "logps/chosen": -1.6288204193115234, + "logps/rejected": -1.7654516696929932, + "loss": 1.7012, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.257640838623047, + "rewards/margins": 0.27326300740242004, + "rewards/rejected": -3.5309033393859863, + "step": 3830 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 24.645071242568825, + "learning_rate": 4.231657950760461e-08, + "logits/chosen": -3.4864509105682373, + "logits/rejected": -3.478947401046753, + "logps/chosen": -1.5760220289230347, + "logps/rejected": -1.8253257274627686, + "loss": 1.5384, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.1520440578460693, + "rewards/margins": 0.498607873916626, + "rewards/rejected": -3.650651454925537, + "step": 3840 + }, + { + "epoch": 0.6633356305995864, + "grad_norm": 20.614900632030324, + "learning_rate": 4.2262277378489225e-08, + "logits/chosen": -3.568324565887451, + "logits/rejected": -3.5590262413024902, + "logps/chosen": -1.6439754962921143, + "logps/rejected": -1.7441537380218506, + "loss": 1.7574, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.2879509925842285, + "rewards/margins": 0.2003568708896637, + "rewards/rejected": -3.488307476043701, + "step": 3850 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 19.802028450554975, + "learning_rate": 4.220781914527035e-08, + "logits/chosen": -3.564906597137451, + "logits/rejected": -3.5518977642059326, + "logps/chosen": -1.6339585781097412, + "logps/rejected": -1.7989826202392578, + "loss": 1.6644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.2679171562194824, + "rewards/margins": 0.3300477862358093, + "rewards/rejected": -3.5979652404785156, + "step": 3860 + }, + { + "epoch": 0.6667815299793246, + "grad_norm": 28.590384136874516, + "learning_rate": 4.2153205300417966e-08, + "logits/chosen": -3.5015079975128174, + "logits/rejected": -3.4868526458740234, + "logps/chosen": -1.6366775035858154, + "logps/rejected": -1.7721493244171143, + "loss": 1.6781, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.273355007171631, + "rewards/margins": 0.2709435820579529, + "rewards/rejected": -3.5442986488342285, + "step": 3870 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 22.62904792343159, + "learning_rate": 4.209843633780929e-08, + "logits/chosen": -3.5137856006622314, + "logits/rejected": -3.5165233612060547, + "logps/chosen": -1.6856848001480103, + "logps/rejected": -1.8106880187988281, + "loss": 1.7276, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.3713696002960205, + "rewards/margins": 0.25000637769699097, + "rewards/rejected": -3.6213760375976562, + "step": 3880 + }, + { + "epoch": 0.6702274293590628, + "grad_norm": 21.401002059587398, + "learning_rate": 4.2043512752724265e-08, + "logits/chosen": -3.5426464080810547, + "logits/rejected": -3.5255541801452637, + "logps/chosen": -1.6136757135391235, + "logps/rejected": -1.8320449590682983, + "loss": 1.6157, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.227351427078247, + "rewards/margins": 0.43673810362815857, + "rewards/rejected": -3.6640899181365967, + "step": 3890 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 26.03380949427367, + "learning_rate": 4.19884350418411e-08, + "logits/chosen": -3.5326545238494873, + "logits/rejected": -3.501910448074341, + "logps/chosen": -1.6139007806777954, + "logps/rejected": -1.8865505456924438, + "loss": 1.5137, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.227801561355591, + "rewards/margins": 0.5452998280525208, + "rewards/rejected": -3.7731010913848877, + "step": 3900 + }, + { + "epoch": 0.6719503790489317, + "eval_logits/chosen": -3.560681104660034, + "eval_logits/rejected": -3.5568206310272217, + "eval_logps/chosen": -1.5588419437408447, + "eval_logps/rejected": -1.7063748836517334, + "eval_loss": 1.6468024253845215, + "eval_rewards/accuracies": 0.6089683771133423, + "eval_rewards/chosen": -3.1176838874816895, + "eval_rewards/margins": 0.29506558179855347, + "eval_rewards/rejected": -3.412749767303467, + "eval_runtime": 156.435, + "eval_samples_per_second": 27.513, + "eval_steps_per_second": 3.439, + "step": 3900 + }, + { + "epoch": 0.6736733287388008, + "grad_norm": 25.600458874723326, + "learning_rate": 4.1933203703231764e-08, + "logits/chosen": -3.5126616954803467, + "logits/rejected": -3.5031394958496094, + "logps/chosen": -1.666640281677246, + "logps/rejected": -1.853045105934143, + "loss": 1.6252, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.333280563354492, + "rewards/margins": 0.3728094696998596, + "rewards/rejected": -3.706090211868286, + "step": 3910 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 25.08181950685703, + "learning_rate": 4.187781923635753e-08, + "logits/chosen": -3.5553736686706543, + "logits/rejected": -3.530075788497925, + "logps/chosen": -1.596978783607483, + "logps/rejected": -1.8495222330093384, + "loss": 1.6157, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.193957567214966, + "rewards/margins": 0.50508713722229, + "rewards/rejected": -3.6990444660186768, + "step": 3920 + }, + { + "epoch": 0.677119228118539, + "grad_norm": 22.770304649410065, + "learning_rate": 4.182228214206437e-08, + "logits/chosen": -3.564268112182617, + "logits/rejected": -3.5615744590759277, + "logps/chosen": -1.6015243530273438, + "logps/rejected": -1.779693841934204, + "loss": 1.6527, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.2030487060546875, + "rewards/margins": 0.3563389182090759, + "rewards/rejected": -3.559387683868408, + "step": 3930 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 22.166024867711545, + "learning_rate": 4.176659292257853e-08, + "logits/chosen": -3.4495582580566406, + "logits/rejected": -3.440077304840088, + "logps/chosen": -1.6224391460418701, + "logps/rejected": -1.8546664714813232, + "loss": 1.5647, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.2448782920837402, + "rewards/margins": 0.4644545912742615, + "rewards/rejected": -3.7093329429626465, + "step": 3940 + }, + { + "epoch": 0.680565127498277, + "grad_norm": 24.004216369263208, + "learning_rate": 4.1710752081501877e-08, + "logits/chosen": -3.4501404762268066, + "logits/rejected": -3.423409938812256, + "logps/chosen": -1.6014677286148071, + "logps/rejected": -1.840460181236267, + "loss": 1.5627, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2029354572296143, + "rewards/margins": 0.4779849052429199, + "rewards/rejected": -3.680920362472534, + "step": 3950 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 17.61842965989023, + "learning_rate": 4.1654760123807465e-08, + "logits/chosen": -3.508578062057495, + "logits/rejected": -3.5096092224121094, + "logps/chosen": -1.5765204429626465, + "logps/rejected": -1.8091967105865479, + "loss": 1.6168, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.153040885925293, + "rewards/margins": 0.4653526842594147, + "rewards/rejected": -3.6183934211730957, + "step": 3960 + }, + { + "epoch": 0.6840110268780152, + "grad_norm": 21.127823425012448, + "learning_rate": 4.1598617555834866e-08, + "logits/chosen": -3.536834716796875, + "logits/rejected": -3.5221779346466064, + "logps/chosen": -1.6127170324325562, + "logps/rejected": -1.8586766719818115, + "loss": 1.6015, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.2254340648651123, + "rewards/margins": 0.4919193685054779, + "rewards/rejected": -3.717353343963623, + "step": 3970 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 25.790552310092274, + "learning_rate": 4.1542324885285656e-08, + "logits/chosen": -3.4440293312072754, + "logits/rejected": -3.432039260864258, + "logps/chosen": -1.7090425491333008, + "logps/rejected": -1.9035180807113647, + "loss": 1.6636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.4180850982666016, + "rewards/margins": 0.3889507055282593, + "rewards/rejected": -3.8070361614227295, + "step": 3980 + }, + { + "epoch": 0.6874569262577532, + "grad_norm": 20.63058512955964, + "learning_rate": 4.148588262121877e-08, + "logits/chosen": -3.5169568061828613, + "logits/rejected": -3.5108158588409424, + "logps/chosen": -1.618025779724121, + "logps/rejected": -1.8616539239883423, + "loss": 1.5462, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.236051559448242, + "rewards/margins": 0.4872560501098633, + "rewards/rejected": -3.7233078479766846, + "step": 3990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 25.08588143130398, + "learning_rate": 4.1429291274045966e-08, + "logits/chosen": -3.554903507232666, + "logits/rejected": -3.5359508991241455, + "logps/chosen": -1.6810871362686157, + "logps/rejected": -1.8635432720184326, + "loss": 1.6939, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.3621742725372314, + "rewards/margins": 0.36491283774375916, + "rewards/rejected": -3.7270865440368652, + "step": 4000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -3.5580458641052246, + "eval_logits/rejected": -3.5541951656341553, + "eval_logps/chosen": -1.5700632333755493, + "eval_logps/rejected": -1.7186118364334106, + "eval_loss": 1.6458131074905396, + "eval_rewards/accuracies": 0.6096654534339905, + "eval_rewards/chosen": -3.1401264667510986, + "eval_rewards/margins": 0.2970971167087555, + "eval_rewards/rejected": -3.4372236728668213, + "eval_runtime": 156.6316, + "eval_samples_per_second": 27.478, + "eval_steps_per_second": 3.435, + "step": 4000 + }, + { + "epoch": 0.6909028256374914, + "grad_norm": 19.77571708393977, + "learning_rate": 4.137255135552714e-08, + "logits/chosen": -3.4882278442382812, + "logits/rejected": -3.483491897583008, + "logps/chosen": -1.5649970769882202, + "logps/rejected": -1.8530149459838867, + "loss": 1.5259, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.1299941539764404, + "rewards/margins": 0.5760356187820435, + "rewards/rejected": -3.7060298919677734, + "step": 4010 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 18.105863239714825, + "learning_rate": 4.131566337876575e-08, + "logits/chosen": -3.4880757331848145, + "logits/rejected": -3.478909969329834, + "logps/chosen": -1.6576032638549805, + "logps/rejected": -1.8756574392318726, + "loss": 1.653, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.315206527709961, + "rewards/margins": 0.4361085891723633, + "rewards/rejected": -3.751314878463745, + "step": 4020 + }, + { + "epoch": 0.6943487250172296, + "grad_norm": 19.46335096717498, + "learning_rate": 4.1258627858204156e-08, + "logits/chosen": -3.4773623943328857, + "logits/rejected": -3.463923692703247, + "logps/chosen": -1.6468673944473267, + "logps/rejected": -1.9714231491088867, + "loss": 1.5003, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.2937347888946533, + "rewards/margins": 0.6491113901138306, + "rewards/rejected": -3.9428462982177734, + "step": 4030 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 22.16197965863107, + "learning_rate": 4.1201445309618957e-08, + "logits/chosen": -3.551835536956787, + "logits/rejected": -3.5449154376983643, + "logps/chosen": -1.7117366790771484, + "logps/rejected": -1.7705423831939697, + "loss": 1.819, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -3.423473358154297, + "rewards/margins": 0.1176108866930008, + "rewards/rejected": -3.5410847663879395, + "step": 4040 + }, + { + "epoch": 0.6977946243969676, + "grad_norm": 23.024523984764016, + "learning_rate": 4.114411625011634e-08, + "logits/chosen": -3.4751930236816406, + "logits/rejected": -3.4652562141418457, + "logps/chosen": -1.5987608432769775, + "logps/rejected": -1.8437213897705078, + "loss": 1.5936, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.197521686553955, + "rewards/margins": 0.48992109298706055, + "rewards/rejected": -3.6874427795410156, + "step": 4050 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 22.637125658671426, + "learning_rate": 4.10866411981274e-08, + "logits/chosen": -3.483513355255127, + "logits/rejected": -3.4649059772491455, + "logps/chosen": -1.7063789367675781, + "logps/rejected": -1.910467505455017, + "loss": 1.6342, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.4127578735351562, + "rewards/margins": 0.4081777036190033, + "rewards/rejected": -3.820935010910034, + "step": 4060 + }, + { + "epoch": 0.7012405237767058, + "grad_norm": 23.893748344060583, + "learning_rate": 4.102902067340348e-08, + "logits/chosen": -3.4984142780303955, + "logits/rejected": -3.485945224761963, + "logps/chosen": -1.7215543985366821, + "logps/rejected": -1.9019677639007568, + "loss": 1.6597, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.4431087970733643, + "rewards/margins": 0.3608267307281494, + "rewards/rejected": -3.8039355278015137, + "step": 4070 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 23.380994390067567, + "learning_rate": 4.0971255197011395e-08, + "logits/chosen": -3.472560167312622, + "logits/rejected": -3.468611478805542, + "logps/chosen": -1.5881741046905518, + "logps/rejected": -1.773781180381775, + "loss": 1.6249, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.1763482093811035, + "rewards/margins": 0.3712146580219269, + "rewards/rejected": -3.54756236076355, + "step": 4080 + }, + { + "epoch": 0.7046864231564438, + "grad_norm": 24.066216159605954, + "learning_rate": 4.091334529132881e-08, + "logits/chosen": -3.5234484672546387, + "logits/rejected": -3.5055222511291504, + "logps/chosen": -1.7111469507217407, + "logps/rejected": -1.9372665882110596, + "loss": 1.5782, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.4222939014434814, + "rewards/margins": 0.45223864912986755, + "rewards/rejected": -3.874533176422119, + "step": 4090 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 22.692905851202205, + "learning_rate": 4.085529148003945e-08, + "logits/chosen": -3.48921537399292, + "logits/rejected": -3.4737040996551514, + "logps/chosen": -1.543280005455017, + "logps/rejected": -1.7435003519058228, + "loss": 1.5735, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.086560010910034, + "rewards/margins": 0.4004410207271576, + "rewards/rejected": -3.4870007038116455, + "step": 4100 + }, + { + "epoch": 0.7064093728463129, + "eval_logits/chosen": -3.5662167072296143, + "eval_logits/rejected": -3.5624449253082275, + "eval_logps/chosen": -1.5791372060775757, + "eval_logps/rejected": -1.7292059659957886, + "eval_loss": 1.6444528102874756, + "eval_rewards/accuracies": 0.6101301312446594, + "eval_rewards/chosen": -3.1582744121551514, + "eval_rewards/margins": 0.3001375198364258, + "eval_rewards/rejected": -3.458411931991577, + "eval_runtime": 156.8964, + "eval_samples_per_second": 27.432, + "eval_steps_per_second": 3.429, + "step": 4100 + }, + { + "epoch": 0.708132322536182, + "grad_norm": 25.31298108546094, + "learning_rate": 4.079709428812842e-08, + "logits/chosen": -3.483440399169922, + "logits/rejected": -3.4772610664367676, + "logps/chosen": -1.7696669101715088, + "logps/rejected": -1.8060896396636963, + "loss": 1.8531, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -3.5393338203430176, + "rewards/margins": 0.07284572720527649, + "rewards/rejected": -3.6121792793273926, + "step": 4110 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 19.434279890163513, + "learning_rate": 4.073875424187739e-08, + "logits/chosen": -3.458526611328125, + "logits/rejected": -3.4575905799865723, + "logps/chosen": -1.6551834344863892, + "logps/rejected": -1.8632535934448242, + "loss": 1.6216, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.3103668689727783, + "rewards/margins": 0.4161405563354492, + "rewards/rejected": -3.7265071868896484, + "step": 4120 + }, + { + "epoch": 0.71157822191592, + "grad_norm": 24.678422743200148, + "learning_rate": 4.06802718688599e-08, + "logits/chosen": -3.511986494064331, + "logits/rejected": -3.4969642162323, + "logps/chosen": -1.6833579540252686, + "logps/rejected": -1.8844410181045532, + "loss": 1.6342, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.366715908050537, + "rewards/margins": 0.40216556191444397, + "rewards/rejected": -3.7688820362091064, + "step": 4130 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 22.68349215521244, + "learning_rate": 4.0621647697936555e-08, + "logits/chosen": -3.50946044921875, + "logits/rejected": -3.491915464401245, + "logps/chosen": -1.6703637838363647, + "logps/rejected": -1.7834079265594482, + "loss": 1.7765, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.3407275676727295, + "rewards/margins": 0.22608856856822968, + "rewards/rejected": -3.5668158531188965, + "step": 4140 + }, + { + "epoch": 0.7150241212956582, + "grad_norm": 21.690648627971818, + "learning_rate": 4.056288225925023e-08, + "logits/chosen": -3.518974781036377, + "logits/rejected": -3.502434253692627, + "logps/chosen": -1.740659475326538, + "logps/rejected": -1.9289363622665405, + "loss": 1.6937, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.481318950653076, + "rewards/margins": 0.376553475856781, + "rewards/rejected": -3.857872724533081, + "step": 4150 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 20.65863527345167, + "learning_rate": 4.050397608422132e-08, + "logits/chosen": -3.4524409770965576, + "logits/rejected": -3.434460401535034, + "logps/chosen": -1.6515560150146484, + "logps/rejected": -1.9148203134536743, + "loss": 1.4983, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.303112030029297, + "rewards/margins": 0.5265283584594727, + "rewards/rejected": -3.8296406269073486, + "step": 4160 + }, + { + "epoch": 0.7184700206753962, + "grad_norm": 23.703249988880387, + "learning_rate": 4.044492970554292e-08, + "logits/chosen": -3.4783706665039062, + "logits/rejected": -3.4739551544189453, + "logps/chosen": -1.6811408996582031, + "logps/rejected": -1.7930870056152344, + "loss": 1.7286, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.3622817993164062, + "rewards/margins": 0.2238922119140625, + "rewards/rejected": -3.5861740112304688, + "step": 4170 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 25.990755148311667, + "learning_rate": 4.038574365717594e-08, + "logits/chosen": -3.4913508892059326, + "logits/rejected": -3.4846763610839844, + "logps/chosen": -1.6782547235488892, + "logps/rejected": -1.8710553646087646, + "loss": 1.6432, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.3565094470977783, + "rewards/margins": 0.3856014907360077, + "rewards/rejected": -3.7421107292175293, + "step": 4180 + }, + { + "epoch": 0.7219159200551344, + "grad_norm": 20.339971142696943, + "learning_rate": 4.0326418474344414e-08, + "logits/chosen": -3.476672410964966, + "logits/rejected": -3.4679577350616455, + "logps/chosen": -1.6024541854858398, + "logps/rejected": -1.845259666442871, + "loss": 1.5605, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2049083709716797, + "rewards/margins": 0.48561081290245056, + "rewards/rejected": -3.690519332885742, + "step": 4190 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 25.33302436982735, + "learning_rate": 4.026695469353051e-08, + "logits/chosen": -3.5066425800323486, + "logits/rejected": -3.498121738433838, + "logps/chosen": -1.672947883605957, + "logps/rejected": -1.7910398244857788, + "loss": 1.736, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.345895767211914, + "rewards/margins": 0.23618388175964355, + "rewards/rejected": -3.5820796489715576, + "step": 4200 + }, + { + "epoch": 0.7236388697450035, + "eval_logits/chosen": -3.5506134033203125, + "eval_logits/rejected": -3.5467817783355713, + "eval_logps/chosen": -1.5880303382873535, + "eval_logps/rejected": -1.7393450736999512, + "eval_loss": 1.6429996490478516, + "eval_rewards/accuracies": 0.6122211813926697, + "eval_rewards/chosen": -3.176060676574707, + "eval_rewards/margins": 0.30262914299964905, + "eval_rewards/rejected": -3.4786901473999023, + "eval_runtime": 156.7717, + "eval_samples_per_second": 27.454, + "eval_steps_per_second": 3.432, + "step": 4200 + }, + { + "epoch": 0.7253618194348725, + "grad_norm": 20.151873209146856, + "learning_rate": 4.020735285246979e-08, + "logits/chosen": -3.501326084136963, + "logits/rejected": -3.486607074737549, + "logps/chosen": -1.697442650794983, + "logps/rejected": -1.8985875844955444, + "loss": 1.6689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.394885301589966, + "rewards/margins": 0.40229034423828125, + "rewards/rejected": -3.797175168991089, + "step": 4210 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 18.762083433666167, + "learning_rate": 4.0147613490146285e-08, + "logits/chosen": -3.404310703277588, + "logits/rejected": -3.392958164215088, + "logps/chosen": -1.66287100315094, + "logps/rejected": -1.9267852306365967, + "loss": 1.5097, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.32574200630188, + "rewards/margins": 0.5278291702270508, + "rewards/rejected": -3.8535704612731934, + "step": 4220 + }, + { + "epoch": 0.7288077188146106, + "grad_norm": 21.531535382405927, + "learning_rate": 4.0087737146787653e-08, + "logits/chosen": -3.512150287628174, + "logits/rejected": -3.5014138221740723, + "logps/chosen": -1.7222988605499268, + "logps/rejected": -1.9231723546981812, + "loss": 1.6357, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.4445977210998535, + "rewards/margins": 0.40174731612205505, + "rewards/rejected": -3.8463447093963623, + "step": 4230 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 21.956729011883837, + "learning_rate": 4.002772436386027e-08, + "logits/chosen": -3.4671401977539062, + "logits/rejected": -3.4529216289520264, + "logps/chosen": -1.6873142719268799, + "logps/rejected": -2.0064282417297363, + "loss": 1.4684, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.3746285438537598, + "rewards/margins": 0.638228178024292, + "rewards/rejected": -4.012856483459473, + "step": 4240 + }, + { + "epoch": 0.7322536181943488, + "grad_norm": 27.293037005407832, + "learning_rate": 3.996757568406437e-08, + "logits/chosen": -3.4776413440704346, + "logits/rejected": -3.4696097373962402, + "logps/chosen": -1.7368190288543701, + "logps/rejected": -1.8650646209716797, + "loss": 1.7232, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.4736380577087402, + "rewards/margins": 0.25649121403694153, + "rewards/rejected": -3.7301292419433594, + "step": 4250 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 21.72418970595855, + "learning_rate": 3.990729165132907e-08, + "logits/chosen": -3.443450450897217, + "logits/rejected": -3.433893918991089, + "logps/chosen": -1.6811186075210571, + "logps/rejected": -1.8643074035644531, + "loss": 1.6534, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.3622372150421143, + "rewards/margins": 0.36637765169143677, + "rewards/rejected": -3.7286148071289062, + "step": 4260 + }, + { + "epoch": 0.7356995175740868, + "grad_norm": 22.806990037890206, + "learning_rate": 3.9846872810807543e-08, + "logits/chosen": -3.4632415771484375, + "logits/rejected": -3.4483656883239746, + "logps/chosen": -1.5963927507400513, + "logps/rejected": -1.8413455486297607, + "loss": 1.5972, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.1927855014801025, + "rewards/margins": 0.48990577459335327, + "rewards/rejected": -3.6826910972595215, + "step": 4270 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 17.617571410042377, + "learning_rate": 3.978631970887201e-08, + "logits/chosen": -3.437887191772461, + "logits/rejected": -3.421170473098755, + "logps/chosen": -1.6769145727157593, + "logps/rejected": -1.975531816482544, + "loss": 1.4972, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.3538291454315186, + "rewards/margins": 0.5972347855567932, + "rewards/rejected": -3.951063632965088, + "step": 4280 + }, + { + "epoch": 0.739145416953825, + "grad_norm": 21.83716317145047, + "learning_rate": 3.9725632893108816e-08, + "logits/chosen": -3.4825375080108643, + "logits/rejected": -3.4686477184295654, + "logps/chosen": -1.70248544216156, + "logps/rejected": -1.8623663187026978, + "loss": 1.6965, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.40497088432312, + "rewards/margins": 0.31976184248924255, + "rewards/rejected": -3.7247326374053955, + "step": 4290 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 19.482653599596045, + "learning_rate": 3.9664812912313536e-08, + "logits/chosen": -3.4876956939697266, + "logits/rejected": -3.4820048809051514, + "logps/chosen": -1.7153770923614502, + "logps/rejected": -1.88519287109375, + "loss": 1.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.4307541847229004, + "rewards/margins": 0.339631587266922, + "rewards/rejected": -3.7703857421875, + "step": 4300 + }, + { + "epoch": 0.740868366643694, + "eval_logits/chosen": -3.5524227619171143, + "eval_logits/rejected": -3.5486185550689697, + "eval_logps/chosen": -1.5960999727249146, + "eval_logps/rejected": -1.7483726739883423, + "eval_loss": 1.642235517501831, + "eval_rewards/accuracies": 0.6115241646766663, + "eval_rewards/chosen": -3.192199945449829, + "eval_rewards/margins": 0.3045448660850525, + "eval_rewards/rejected": -3.4967453479766846, + "eval_runtime": 156.7987, + "eval_samples_per_second": 27.449, + "eval_steps_per_second": 3.431, + "step": 4300 + }, + { + "epoch": 0.742591316333563, + "grad_norm": 25.6021348884712, + "learning_rate": 3.960386031648592e-08, + "logits/chosen": -3.463371992111206, + "logits/rejected": -3.4526145458221436, + "logps/chosen": -1.7143220901489258, + "logps/rejected": -1.8105621337890625, + "loss": 1.7479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.4286441802978516, + "rewards/margins": 0.19247978925704956, + "rewards/rejected": -3.621124267578125, + "step": 4310 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 22.513610199517206, + "learning_rate": 3.9542775656825e-08, + "logits/chosen": -3.5040640830993652, + "logits/rejected": -3.4886155128479004, + "logps/chosen": -1.7358089685440063, + "logps/rejected": -1.9500296115875244, + "loss": 1.6016, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.4716179370880127, + "rewards/margins": 0.42844128608703613, + "rewards/rejected": -3.900059223175049, + "step": 4320 + }, + { + "epoch": 0.7460372157133012, + "grad_norm": 22.129690600230035, + "learning_rate": 3.9481559485724046e-08, + "logits/chosen": -3.440296173095703, + "logits/rejected": -3.4225516319274902, + "logps/chosen": -1.7704474925994873, + "logps/rejected": -1.9372055530548096, + "loss": 1.6457, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.5408949851989746, + "rewards/margins": 0.3335161805152893, + "rewards/rejected": -3.874411106109619, + "step": 4330 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 23.186414915562825, + "learning_rate": 3.942021235676561e-08, + "logits/chosen": -3.456206798553467, + "logits/rejected": -3.448406219482422, + "logps/chosen": -1.669115662574768, + "logps/rejected": -1.9304440021514893, + "loss": 1.5761, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.338231325149536, + "rewards/margins": 0.5226563215255737, + "rewards/rejected": -3.8608880043029785, + "step": 4340 + }, + { + "epoch": 0.7494831150930393, + "grad_norm": 23.943534367240648, + "learning_rate": 3.93587348247165e-08, + "logits/chosen": -3.4580702781677246, + "logits/rejected": -3.4480578899383545, + "logps/chosen": -1.571955919265747, + "logps/rejected": -1.8190895318984985, + "loss": 1.5405, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.143911838531494, + "rewards/margins": 0.49426794052124023, + "rewards/rejected": -3.638179063796997, + "step": 4350 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 19.33151743110255, + "learning_rate": 3.929712744552278e-08, + "logits/chosen": -3.4842731952667236, + "logits/rejected": -3.4722793102264404, + "logps/chosen": -1.7027311325073242, + "logps/rejected": -1.8658158779144287, + "loss": 1.6632, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.4054622650146484, + "rewards/margins": 0.3261692225933075, + "rewards/rejected": -3.7316317558288574, + "step": 4360 + }, + { + "epoch": 0.7529290144727774, + "grad_norm": 20.347311039604026, + "learning_rate": 3.923539077630471e-08, + "logits/chosen": -3.48675537109375, + "logits/rejected": -3.4799110889434814, + "logps/chosen": -1.7057020664215088, + "logps/rejected": -1.8181917667388916, + "loss": 1.7377, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.4114041328430176, + "rewards/margins": 0.2249792516231537, + "rewards/rejected": -3.636383533477783, + "step": 4370 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 24.588396459986683, + "learning_rate": 3.917352537535176e-08, + "logits/chosen": -3.4995181560516357, + "logits/rejected": -3.486206531524658, + "logps/chosen": -1.6363849639892578, + "logps/rejected": -1.8033367395401, + "loss": 1.6385, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.2727699279785156, + "rewards/margins": 0.3339036703109741, + "rewards/rejected": -3.6066734790802, + "step": 4380 + }, + { + "epoch": 0.7563749138525155, + "grad_norm": 23.664774807389783, + "learning_rate": 3.91115318021175e-08, + "logits/chosen": -3.462287187576294, + "logits/rejected": -3.4548110961914062, + "logps/chosen": -1.7081372737884521, + "logps/rejected": -1.901919960975647, + "loss": 1.6372, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.4162745475769043, + "rewards/margins": 0.3875652253627777, + "rewards/rejected": -3.803839921951294, + "step": 4390 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 21.80154982147482, + "learning_rate": 3.9049410617214604e-08, + "logits/chosen": -3.4724624156951904, + "logits/rejected": -3.462489604949951, + "logps/chosen": -1.7059593200683594, + "logps/rejected": -1.8600431680679321, + "loss": 1.6779, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.4119186401367188, + "rewards/margins": 0.30816811323165894, + "rewards/rejected": -3.7200863361358643, + "step": 4400 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -3.5511891841888428, + "eval_logits/rejected": -3.5473997592926025, + "eval_logps/chosen": -1.606135606765747, + "eval_logps/rejected": -1.7594319581985474, + "eval_loss": 1.6411340236663818, + "eval_rewards/accuracies": 0.6145446300506592, + "eval_rewards/chosen": -3.212271213531494, + "eval_rewards/margins": 0.30659252405166626, + "eval_rewards/rejected": -3.5188639163970947, + "eval_runtime": 156.61, + "eval_samples_per_second": 27.482, + "eval_steps_per_second": 3.435, + "step": 4400 + }, + { + "epoch": 0.7598208132322536, + "grad_norm": 23.012166909542163, + "learning_rate": 3.898716238240971e-08, + "logits/chosen": -3.4577736854553223, + "logits/rejected": -3.4492886066436768, + "logps/chosen": -1.7938112020492554, + "logps/rejected": -1.9123637676239014, + "loss": 1.7351, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.5876224040985107, + "rewards/margins": 0.23710520565509796, + "rewards/rejected": -3.8247275352478027, + "step": 4410 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 22.587111132654908, + "learning_rate": 3.892478766061841e-08, + "logits/chosen": -3.523894786834717, + "logits/rejected": -3.5004539489746094, + "logps/chosen": -1.6084932088851929, + "logps/rejected": -1.9061130285263062, + "loss": 1.4769, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2169864177703857, + "rewards/margins": 0.595239520072937, + "rewards/rejected": -3.8122260570526123, + "step": 4420 + }, + { + "epoch": 0.7632667126119917, + "grad_norm": 21.455441386327024, + "learning_rate": 3.886228701590011e-08, + "logits/chosen": -3.456550121307373, + "logits/rejected": -3.439866542816162, + "logps/chosen": -1.655550241470337, + "logps/rejected": -1.8683096170425415, + "loss": 1.6243, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.311100482940674, + "rewards/margins": 0.42551904916763306, + "rewards/rejected": -3.736619234085083, + "step": 4430 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 24.70656613060341, + "learning_rate": 3.879966101345296e-08, + "logits/chosen": -3.4990978240966797, + "logits/rejected": -3.480821132659912, + "logps/chosen": -1.7577602863311768, + "logps/rejected": -1.9620885848999023, + "loss": 1.6048, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.5155205726623535, + "rewards/margins": 0.40865617990493774, + "rewards/rejected": -3.9241771697998047, + "step": 4440 + }, + { + "epoch": 0.7667126119917298, + "grad_norm": 21.958313174082562, + "learning_rate": 3.8736910219608706e-08, + "logits/chosen": -3.4214000701904297, + "logits/rejected": -3.414090394973755, + "logps/chosen": -1.7312610149383545, + "logps/rejected": -1.924970269203186, + "loss": 1.6623, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.462522029876709, + "rewards/margins": 0.38741838932037354, + "rewards/rejected": -3.849940538406372, + "step": 4450 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 25.249166670661687, + "learning_rate": 3.867403520182762e-08, + "logits/chosen": -3.4864845275878906, + "logits/rejected": -3.481853485107422, + "logps/chosen": -1.7768481969833374, + "logps/rejected": -1.9148941040039062, + "loss": 1.7423, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.553696393966675, + "rewards/margins": 0.27609124779701233, + "rewards/rejected": -3.8297882080078125, + "step": 4460 + }, + { + "epoch": 0.770158511371468, + "grad_norm": 21.366831710166228, + "learning_rate": 3.861103652869334e-08, + "logits/chosen": -3.513761043548584, + "logits/rejected": -3.4958271980285645, + "logps/chosen": -1.7565181255340576, + "logps/rejected": -1.955985426902771, + "loss": 1.6398, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.5130362510681152, + "rewards/margins": 0.3989346921443939, + "rewards/rejected": -3.911970853805542, + "step": 4470 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 23.090323008916485, + "learning_rate": 3.854791476990771e-08, + "logits/chosen": -3.5304031372070312, + "logits/rejected": -3.5279440879821777, + "logps/chosen": -1.7302099466323853, + "logps/rejected": -1.8838436603546143, + "loss": 1.6637, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.4604198932647705, + "rewards/margins": 0.3072670102119446, + "rewards/rejected": -3.7676873207092285, + "step": 4480 + }, + { + "epoch": 0.7736044107512061, + "grad_norm": 24.01381760330728, + "learning_rate": 3.848467049628564e-08, + "logits/chosen": -3.456562042236328, + "logits/rejected": -3.4418365955352783, + "logps/chosen": -1.6890380382537842, + "logps/rejected": -1.9459129571914673, + "loss": 1.6067, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.3780760765075684, + "rewards/margins": 0.5137500166893005, + "rewards/rejected": -3.8918259143829346, + "step": 4490 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 21.650731402764695, + "learning_rate": 3.842130427974998e-08, + "logits/chosen": -3.4727115631103516, + "logits/rejected": -3.465339183807373, + "logps/chosen": -1.7669137716293335, + "logps/rejected": -1.8431341648101807, + "loss": 1.7728, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.533827543258667, + "rewards/margins": 0.15244051814079285, + "rewards/rejected": -3.6862683296203613, + "step": 4500 + }, + { + "epoch": 0.7753273604410751, + "eval_logits/chosen": -3.5278546810150146, + "eval_logits/rejected": -3.523972749710083, + "eval_logps/chosen": -1.6191825866699219, + "eval_logps/rejected": -1.7737586498260498, + "eval_loss": 1.6398953199386597, + "eval_rewards/accuracies": 0.615938663482666, + "eval_rewards/chosen": -3.2383651733398438, + "eval_rewards/margins": 0.30915191769599915, + "eval_rewards/rejected": -3.5475172996520996, + "eval_runtime": 156.829, + "eval_samples_per_second": 27.444, + "eval_steps_per_second": 3.43, + "step": 4500 + }, + { + "epoch": 0.7770503101309442, + "grad_norm": 25.16932091040138, + "learning_rate": 3.835781669332631e-08, + "logits/chosen": -3.538320541381836, + "logits/rejected": -3.522437334060669, + "logps/chosen": -1.6471236944198608, + "logps/rejected": -1.9564908742904663, + "loss": 1.4858, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2942473888397217, + "rewards/margins": 0.6187340021133423, + "rewards/rejected": -3.9129817485809326, + "step": 4510 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 25.545599008335817, + "learning_rate": 3.829420831113775e-08, + "logits/chosen": -3.486123561859131, + "logits/rejected": -3.474334239959717, + "logps/chosen": -1.786922812461853, + "logps/rejected": -1.9257822036743164, + "loss": 1.698, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.573845624923706, + "rewards/margins": 0.2777189612388611, + "rewards/rejected": -3.851564407348633, + "step": 4520 + }, + { + "epoch": 0.7804962095106823, + "grad_norm": 23.163580595110975, + "learning_rate": 3.823047970839981e-08, + "logits/chosen": -3.4954915046691895, + "logits/rejected": -3.490344285964966, + "logps/chosen": -1.7623231410980225, + "logps/rejected": -1.8654975891113281, + "loss": 1.7477, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -3.524646282196045, + "rewards/margins": 0.20634841918945312, + "rewards/rejected": -3.7309951782226562, + "step": 4530 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 29.25056040680644, + "learning_rate": 3.816663146141514e-08, + "logits/chosen": -3.4388909339904785, + "logits/rejected": -3.431828022003174, + "logps/chosen": -1.7118180990219116, + "logps/rejected": -1.8997730016708374, + "loss": 1.6731, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.4236361980438232, + "rewards/margins": 0.37590983510017395, + "rewards/rejected": -3.799546003341675, + "step": 4540 + }, + { + "epoch": 0.7839421088904204, + "grad_norm": 21.703201304218425, + "learning_rate": 3.810266414756836e-08, + "logits/chosen": -3.4988293647766113, + "logits/rejected": -3.4855308532714844, + "logps/chosen": -1.700516939163208, + "logps/rejected": -1.8831493854522705, + "loss": 1.5984, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.401033878326416, + "rewards/margins": 0.36526480317115784, + "rewards/rejected": -3.766298770904541, + "step": 4550 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 20.68567739019101, + "learning_rate": 3.803857834532081e-08, + "logits/chosen": -3.4284262657165527, + "logits/rejected": -3.4077751636505127, + "logps/chosen": -1.6368770599365234, + "logps/rejected": -1.889385461807251, + "loss": 1.5454, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.273754119873047, + "rewards/margins": 0.5050166845321655, + "rewards/rejected": -3.778770923614502, + "step": 4560 + }, + { + "epoch": 0.7873880082701585, + "grad_norm": 23.24231437490259, + "learning_rate": 3.7974374634205344e-08, + "logits/chosen": -3.4636197090148926, + "logits/rejected": -3.450129985809326, + "logps/chosen": -1.7170547246932983, + "logps/rejected": -1.9685401916503906, + "loss": 1.54, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.4341094493865967, + "rewards/margins": 0.5029706954956055, + "rewards/rejected": -3.9370803833007812, + "step": 4570 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 19.737404047474065, + "learning_rate": 3.791005359482106e-08, + "logits/chosen": -3.43738055229187, + "logits/rejected": -3.421517848968506, + "logps/chosen": -1.6412599086761475, + "logps/rejected": -1.7955795526504517, + "loss": 1.656, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.282519817352295, + "rewards/margins": 0.3086393475532532, + "rewards/rejected": -3.5911591053009033, + "step": 4580 + }, + { + "epoch": 0.7908339076498966, + "grad_norm": 25.631339701067773, + "learning_rate": 3.7845615808828055e-08, + "logits/chosen": -3.498211622238159, + "logits/rejected": -3.482539653778076, + "logps/chosen": -1.7987697124481201, + "logps/rejected": -1.9813737869262695, + "loss": 1.6984, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.5975394248962402, + "rewards/margins": 0.3652076721191406, + "rewards/rejected": -3.962747573852539, + "step": 4590 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 28.24635039313412, + "learning_rate": 3.7781061858942206e-08, + "logits/chosen": -3.4256882667541504, + "logits/rejected": -3.414107084274292, + "logps/chosen": -1.7632591724395752, + "logps/rejected": -2.0241665840148926, + "loss": 1.5063, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5265183448791504, + "rewards/margins": 0.5218142867088318, + "rewards/rejected": -4.048333168029785, + "step": 4600 + }, + { + "epoch": 0.7925568573397657, + "eval_logits/chosen": -3.541055917739868, + "eval_logits/rejected": -3.5372695922851562, + "eval_logps/chosen": -1.6300381422042847, + "eval_logps/rejected": -1.7856383323669434, + "eval_loss": 1.6390634775161743, + "eval_rewards/accuracies": 0.616403341293335, + "eval_rewards/chosen": -3.2600762844085693, + "eval_rewards/margins": 0.31120002269744873, + "eval_rewards/rejected": -3.5712766647338867, + "eval_runtime": 156.8091, + "eval_samples_per_second": 27.447, + "eval_steps_per_second": 3.431, + "step": 4600 + }, + { + "epoch": 0.7942798070296347, + "grad_norm": 27.962934028277196, + "learning_rate": 3.7716392328929864e-08, + "logits/chosen": -3.441448211669922, + "logits/rejected": -3.435244083404541, + "logps/chosen": -1.7777827978134155, + "logps/rejected": -1.9410136938095093, + "loss": 1.7129, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.555565595626831, + "rewards/margins": 0.32646211981773376, + "rewards/rejected": -3.8820273876190186, + "step": 4610 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 22.365062944076932, + "learning_rate": 3.765160780360254e-08, + "logits/chosen": -3.4482455253601074, + "logits/rejected": -3.437147617340088, + "logps/chosen": -1.7255197763442993, + "logps/rejected": -1.931583046913147, + "loss": 1.6096, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.4510395526885986, + "rewards/margins": 0.4121263921260834, + "rewards/rejected": -3.863166093826294, + "step": 4620 + }, + { + "epoch": 0.7977257064093728, + "grad_norm": 14.4814912152773, + "learning_rate": 3.7586708868811703e-08, + "logits/chosen": -3.4853720664978027, + "logits/rejected": -3.4670910835266113, + "logps/chosen": -1.7347654104232788, + "logps/rejected": -2.068977117538452, + "loss": 1.4856, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.4695308208465576, + "rewards/margins": 0.6684234738349915, + "rewards/rejected": -4.137954235076904, + "step": 4630 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 22.184198553203547, + "learning_rate": 3.7521696111443416e-08, + "logits/chosen": -3.489283323287964, + "logits/rejected": -3.4801902770996094, + "logps/chosen": -1.8257777690887451, + "logps/rejected": -2.014604091644287, + "loss": 1.6109, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6515555381774902, + "rewards/margins": 0.3776525855064392, + "rewards/rejected": -4.029208183288574, + "step": 4640 + }, + { + "epoch": 0.801171605789111, + "grad_norm": 19.634952377810666, + "learning_rate": 3.7456570119413035e-08, + "logits/chosen": -3.4838459491729736, + "logits/rejected": -3.466510057449341, + "logps/chosen": -1.710087776184082, + "logps/rejected": -1.948891043663025, + "loss": 1.5723, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.420175552368164, + "rewards/margins": 0.4776064455509186, + "rewards/rejected": -3.89778208732605, + "step": 4650 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 24.159670564637857, + "learning_rate": 3.739133148165994e-08, + "logits/chosen": -3.484649658203125, + "logits/rejected": -3.4753518104553223, + "logps/chosen": -1.745147466659546, + "logps/rejected": -1.9329309463500977, + "loss": 1.6197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.490294933319092, + "rewards/margins": 0.3755660653114319, + "rewards/rejected": -3.8658618927001953, + "step": 4660 + }, + { + "epoch": 0.8046175051688491, + "grad_norm": 18.329401316401903, + "learning_rate": 3.732598078814215e-08, + "logits/chosen": -3.4815421104431152, + "logits/rejected": -3.4675464630126953, + "logps/chosen": -1.70163893699646, + "logps/rejected": -1.8006588220596313, + "loss": 1.7661, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.40327787399292, + "rewards/margins": 0.19803957641124725, + "rewards/rejected": -3.6013176441192627, + "step": 4670 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 23.61763261151395, + "learning_rate": 3.7260518629831006e-08, + "logits/chosen": -3.457897901535034, + "logits/rejected": -3.4399447441101074, + "logps/chosen": -1.7409474849700928, + "logps/rejected": -1.983466386795044, + "loss": 1.5402, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.4818949699401855, + "rewards/margins": 0.4850376546382904, + "rewards/rejected": -3.966932773590088, + "step": 4680 + }, + { + "epoch": 0.8080634045485872, + "grad_norm": 27.31950550068022, + "learning_rate": 3.7194945598705865e-08, + "logits/chosen": -3.496598720550537, + "logits/rejected": -3.485285520553589, + "logps/chosen": -1.816908836364746, + "logps/rejected": -2.0231215953826904, + "loss": 1.6561, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.633817672729492, + "rewards/margins": 0.4124256670475006, + "rewards/rejected": -4.046243190765381, + "step": 4690 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 20.59605736601348, + "learning_rate": 3.712926228774868e-08, + "logits/chosen": -3.47697377204895, + "logits/rejected": -3.472442626953125, + "logps/chosen": -1.7706187963485718, + "logps/rejected": -2.021730422973633, + "loss": 1.5586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.5412375926971436, + "rewards/margins": 0.5022231340408325, + "rewards/rejected": -4.043460845947266, + "step": 4700 + }, + { + "epoch": 0.8097863542384562, + "eval_logits/chosen": -3.539290189743042, + "eval_logits/rejected": -3.535507917404175, + "eval_logps/chosen": -1.642861008644104, + "eval_logps/rejected": -1.799922227859497, + "eval_loss": 1.637628436088562, + "eval_rewards/accuracies": 0.6198884844779968, + "eval_rewards/chosen": -3.285722017288208, + "eval_rewards/margins": 0.31412220001220703, + "eval_rewards/rejected": -3.599844455718994, + "eval_runtime": 157.0897, + "eval_samples_per_second": 27.398, + "eval_steps_per_second": 3.425, + "step": 4700 + }, + { + "epoch": 0.8115093039283253, + "grad_norm": 25.092197079794445, + "learning_rate": 3.70634692909387e-08, + "logits/chosen": -3.5112056732177734, + "logits/rejected": -3.4885916709899902, + "logps/chosen": -1.7630201578140259, + "logps/rejected": -1.9670699834823608, + "loss": 1.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5260403156280518, + "rewards/margins": 0.4081002175807953, + "rewards/rejected": -3.9341399669647217, + "step": 4710 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 24.15747028157343, + "learning_rate": 3.699756720324706e-08, + "logits/chosen": -3.426949977874756, + "logits/rejected": -3.409846067428589, + "logps/chosen": -1.7289247512817383, + "logps/rejected": -1.9112657308578491, + "loss": 1.6656, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.4578495025634766, + "rewards/margins": 0.3646816611289978, + "rewards/rejected": -3.8225314617156982, + "step": 4720 + }, + { + "epoch": 0.8149552033080634, + "grad_norm": 22.888282780729174, + "learning_rate": 3.693155662063141e-08, + "logits/chosen": -3.4228241443634033, + "logits/rejected": -3.4096455574035645, + "logps/chosen": -1.7416683435440063, + "logps/rejected": -1.9405412673950195, + "loss": 1.6078, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.4833366870880127, + "rewards/margins": 0.39774569869041443, + "rewards/rejected": -3.881082534790039, + "step": 4730 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 21.53766435766926, + "learning_rate": 3.686543814003053e-08, + "logits/chosen": -3.4720726013183594, + "logits/rejected": -3.4658150672912598, + "logps/chosen": -1.7034873962402344, + "logps/rejected": -1.899416208267212, + "loss": 1.6194, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.4069747924804688, + "rewards/margins": 0.3918575942516327, + "rewards/rejected": -3.798832416534424, + "step": 4740 + }, + { + "epoch": 0.8184011026878015, + "grad_norm": 23.185185862868565, + "learning_rate": 3.6799212359358935e-08, + "logits/chosen": -3.4602248668670654, + "logits/rejected": -3.4486796855926514, + "logps/chosen": -1.7329816818237305, + "logps/rejected": -1.9003452062606812, + "loss": 1.6222, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.465963363647461, + "rewards/margins": 0.3347272276878357, + "rewards/rejected": -3.8006904125213623, + "step": 4750 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 27.807829075472956, + "learning_rate": 3.673287987750146e-08, + "logits/chosen": -3.4656288623809814, + "logits/rejected": -3.4522769451141357, + "logps/chosen": -1.7993364334106445, + "logps/rejected": -2.0148868560791016, + "loss": 1.6334, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.598672866821289, + "rewards/margins": 0.4311009347438812, + "rewards/rejected": -4.029773712158203, + "step": 4760 + }, + { + "epoch": 0.8218470020675396, + "grad_norm": 24.605019624012307, + "learning_rate": 3.6666441294307835e-08, + "logits/chosen": -3.5052456855773926, + "logits/rejected": -3.4930596351623535, + "logps/chosen": -1.802587866783142, + "logps/rejected": -1.8732969760894775, + "loss": 1.7799, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.605175733566284, + "rewards/margins": 0.1414179801940918, + "rewards/rejected": -3.746593952178955, + "step": 4770 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 22.097476415534477, + "learning_rate": 3.65998972105873e-08, + "logits/chosen": -3.422297716140747, + "logits/rejected": -3.4105827808380127, + "logps/chosen": -1.7074339389801025, + "logps/rejected": -1.9873342514038086, + "loss": 1.5223, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.414867877960205, + "rewards/margins": 0.5598009824752808, + "rewards/rejected": -3.974668502807617, + "step": 4780 + }, + { + "epoch": 0.8252929014472777, + "grad_norm": 22.906942623598738, + "learning_rate": 3.6533248228103114e-08, + "logits/chosen": -3.489544630050659, + "logits/rejected": -3.4734904766082764, + "logps/chosen": -1.8129308223724365, + "logps/rejected": -1.9461349248886108, + "loss": 1.7047, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.625861644744873, + "rewards/margins": 0.26640844345092773, + "rewards/rejected": -3.8922698497772217, + "step": 4790 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 26.35631584998411, + "learning_rate": 3.6466494949567175e-08, + "logits/chosen": -3.4358341693878174, + "logits/rejected": -3.427278995513916, + "logps/chosen": -1.7311235666275024, + "logps/rejected": -1.8689298629760742, + "loss": 1.6914, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.462247133255005, + "rewards/margins": 0.27561238408088684, + "rewards/rejected": -3.7378597259521484, + "step": 4800 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -3.533709764480591, + "eval_logits/rejected": -3.5299253463745117, + "eval_logps/chosen": -1.6558111906051636, + "eval_logps/rejected": -1.814075231552124, + "eval_loss": 1.6362500190734863, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -3.311622381210327, + "eval_rewards/margins": 0.3165280818939209, + "eval_rewards/rejected": -3.628150463104248, + "eval_runtime": 156.8903, + "eval_samples_per_second": 27.433, + "eval_steps_per_second": 3.429, + "step": 4800 + }, + { + "epoch": 0.8287388008270159, + "grad_norm": 22.0897131444851, + "learning_rate": 3.639963797863449e-08, + "logits/chosen": -3.4377055168151855, + "logits/rejected": -3.418684482574463, + "logps/chosen": -1.7359832525253296, + "logps/rejected": -1.9693584442138672, + "loss": 1.5688, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.471966505050659, + "rewards/margins": 0.4667505621910095, + "rewards/rejected": -3.9387168884277344, + "step": 4810 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 19.972853843803314, + "learning_rate": 3.633267791989782e-08, + "logits/chosen": -3.462580442428589, + "logits/rejected": -3.4562346935272217, + "logps/chosen": -1.743162751197815, + "logps/rejected": -1.932983636856079, + "loss": 1.631, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.48632550239563, + "rewards/margins": 0.3796413540840149, + "rewards/rejected": -3.865967273712158, + "step": 4820 + }, + { + "epoch": 0.832184700206754, + "grad_norm": 24.06900449458421, + "learning_rate": 3.626561537888214e-08, + "logits/chosen": -3.4726531505584717, + "logits/rejected": -3.458211898803711, + "logps/chosen": -1.7607024908065796, + "logps/rejected": -1.9663240909576416, + "loss": 1.6151, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.521404981613159, + "rewards/margins": 0.41124311089515686, + "rewards/rejected": -3.932648181915283, + "step": 4830 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 26.63693239889512, + "learning_rate": 3.6198450962039146e-08, + "logits/chosen": -3.448951005935669, + "logits/rejected": -3.431655168533325, + "logps/chosen": -1.7882194519042969, + "logps/rejected": -2.002066135406494, + "loss": 1.6102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.5764389038085938, + "rewards/margins": 0.42769306898117065, + "rewards/rejected": -4.004132270812988, + "step": 4840 + }, + { + "epoch": 0.8356305995864921, + "grad_norm": 21.2176300603693, + "learning_rate": 3.613118527674184e-08, + "logits/chosen": -3.4935638904571533, + "logits/rejected": -3.48449444770813, + "logps/chosen": -1.7603130340576172, + "logps/rejected": -1.9639613628387451, + "loss": 1.6094, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.5206260681152344, + "rewards/margins": 0.4072962701320648, + "rewards/rejected": -3.9279227256774902, + "step": 4850 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 23.519209684724846, + "learning_rate": 3.6063818931279e-08, + "logits/chosen": -3.496922731399536, + "logits/rejected": -3.4792823791503906, + "logps/chosen": -1.855291724205017, + "logps/rejected": -1.9591763019561768, + "loss": 1.7602, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.710583448410034, + "rewards/margins": 0.20776872336864471, + "rewards/rejected": -3.9183526039123535, + "step": 4860 + }, + { + "epoch": 0.8390764989662302, + "grad_norm": 25.968287696035368, + "learning_rate": 3.599635253484967e-08, + "logits/chosen": -3.5060863494873047, + "logits/rejected": -3.491917371749878, + "logps/chosen": -1.7921829223632812, + "logps/rejected": -1.9217307567596436, + "loss": 1.6939, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.5843658447265625, + "rewards/margins": 0.2590958774089813, + "rewards/rejected": -3.843461513519287, + "step": 4870 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 20.342417459406143, + "learning_rate": 3.5928786697557667e-08, + "logits/chosen": -3.42718243598938, + "logits/rejected": -3.407146453857422, + "logps/chosen": -1.7716490030288696, + "logps/rejected": -2.024527072906494, + "loss": 1.588, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.5432980060577393, + "rewards/margins": 0.5057557821273804, + "rewards/rejected": -4.049054145812988, + "step": 4880 + }, + { + "epoch": 0.8425223983459683, + "grad_norm": 22.05079827459098, + "learning_rate": 3.586112203040607e-08, + "logits/chosen": -3.463400363922119, + "logits/rejected": -3.451770305633545, + "logps/chosen": -1.7569997310638428, + "logps/rejected": -1.987011194229126, + "loss": 1.6056, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.5139994621276855, + "rewards/margins": 0.4600231647491455, + "rewards/rejected": -3.974022388458252, + "step": 4890 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 24.527821712958072, + "learning_rate": 3.579335914529166e-08, + "logits/chosen": -3.4503045082092285, + "logits/rejected": -3.432219982147217, + "logps/chosen": -1.7779268026351929, + "logps/rejected": -2.0501151084899902, + "loss": 1.5487, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.5558536052703857, + "rewards/margins": 0.5443761348724365, + "rewards/rejected": -4.1002302169799805, + "step": 4900 + }, + { + "epoch": 0.8442453480358374, + "eval_logits/chosen": -3.532886028289795, + "eval_logits/rejected": -3.529111862182617, + "eval_logps/chosen": -1.6698484420776367, + "eval_logps/rejected": -1.8292793035507202, + "eval_loss": 1.634961485862732, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.3396968841552734, + "eval_rewards/margins": 0.3188614845275879, + "eval_rewards/rejected": -3.6585586071014404, + "eval_runtime": 157.0016, + "eval_samples_per_second": 27.414, + "eval_steps_per_second": 3.427, + "step": 4900 + }, + { + "epoch": 0.8459682977257064, + "grad_norm": 22.070221340236923, + "learning_rate": 3.572549865499944e-08, + "logits/chosen": -3.5362162590026855, + "logits/rejected": -3.519587755203247, + "logps/chosen": -1.7273504734039307, + "logps/rejected": -1.9634937047958374, + "loss": 1.5742, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.4547009468078613, + "rewards/margins": 0.4722861349582672, + "rewards/rejected": -3.926987409591675, + "step": 4910 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 23.27363049524079, + "learning_rate": 3.5657541173197025e-08, + "logits/chosen": -3.4286937713623047, + "logits/rejected": -3.418917179107666, + "logps/chosen": -1.8358824253082275, + "logps/rejected": -2.0630476474761963, + "loss": 1.6165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.671764850616455, + "rewards/margins": 0.4543303847312927, + "rewards/rejected": -4.126095294952393, + "step": 4920 + }, + { + "epoch": 0.8494141971054445, + "grad_norm": 21.81085049727869, + "learning_rate": 3.558948731442918e-08, + "logits/chosen": -3.542125701904297, + "logits/rejected": -3.5372776985168457, + "logps/chosen": -1.8373435735702515, + "logps/rejected": -2.067091703414917, + "loss": 1.6007, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.674687147140503, + "rewards/margins": 0.45949673652648926, + "rewards/rejected": -4.134183406829834, + "step": 4930 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 28.427914364917974, + "learning_rate": 3.5521337694112176e-08, + "logits/chosen": -3.4940924644470215, + "logits/rejected": -3.4768013954162598, + "logps/chosen": -1.8528038263320923, + "logps/rejected": -2.111603021621704, + "loss": 1.5587, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.7056076526641846, + "rewards/margins": 0.5175986289978027, + "rewards/rejected": -4.223206043243408, + "step": 4940 + }, + { + "epoch": 0.8528600964851827, + "grad_norm": 22.721537492676806, + "learning_rate": 3.5453092928528286e-08, + "logits/chosen": -3.3774540424346924, + "logits/rejected": -3.3683905601501465, + "logps/chosen": -1.8292700052261353, + "logps/rejected": -1.9855585098266602, + "loss": 1.7224, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.6585400104522705, + "rewards/margins": 0.3125772178173065, + "rewards/rejected": -3.9711170196533203, + "step": 4950 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 23.33947624845728, + "learning_rate": 3.538475363482017e-08, + "logits/chosen": -3.4455342292785645, + "logits/rejected": -3.4446632862091064, + "logps/chosen": -1.8580598831176758, + "logps/rejected": -2.0527493953704834, + "loss": 1.637, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.7161197662353516, + "rewards/margins": 0.38937920331954956, + "rewards/rejected": -4.105498790740967, + "step": 4960 + }, + { + "epoch": 0.8563059958649207, + "grad_norm": 26.932839213879493, + "learning_rate": 3.531632043098533e-08, + "logits/chosen": -3.44385027885437, + "logits/rejected": -3.4385972023010254, + "logps/chosen": -1.8527345657348633, + "logps/rejected": -2.060098171234131, + "loss": 1.6353, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7054691314697266, + "rewards/margins": 0.41472750902175903, + "rewards/rejected": -4.120196342468262, + "step": 4970 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 19.351637756743056, + "learning_rate": 3.524779393587049e-08, + "logits/chosen": -3.484941005706787, + "logits/rejected": -3.4818978309631348, + "logps/chosen": -1.78236985206604, + "logps/rejected": -1.937793493270874, + "loss": 1.6772, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.56473970413208, + "rewards/margins": 0.3108476400375366, + "rewards/rejected": -3.875586986541748, + "step": 4980 + }, + { + "epoch": 0.8597518952446589, + "grad_norm": 19.542357691151665, + "learning_rate": 3.517917476916604e-08, + "logits/chosen": -3.4269585609436035, + "logits/rejected": -3.4220938682556152, + "logps/chosen": -1.7877781391143799, + "logps/rejected": -1.9918041229248047, + "loss": 1.6545, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.5755562782287598, + "rewards/margins": 0.4080522656440735, + "rewards/rejected": -3.9836082458496094, + "step": 4990 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 21.503169971220245, + "learning_rate": 3.511046355140036e-08, + "logits/chosen": -3.4475300312042236, + "logits/rejected": -3.4365737438201904, + "logps/chosen": -1.834779977798462, + "logps/rejected": -1.9941127300262451, + "loss": 1.7545, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.669559955596924, + "rewards/margins": 0.3186659514904022, + "rewards/rejected": -3.9882254600524902, + "step": 5000 + }, + { + "epoch": 0.8614748449345279, + "eval_logits/chosen": -3.525754928588867, + "eval_logits/rejected": -3.521970748901367, + "eval_logps/chosen": -1.6749658584594727, + "eval_logps/rejected": -1.8349937200546265, + "eval_loss": 1.6343132257461548, + "eval_rewards/accuracies": 0.6208178400993347, + "eval_rewards/chosen": -3.3499317169189453, + "eval_rewards/margins": 0.32005611062049866, + "eval_rewards/rejected": -3.669987440109253, + "eval_runtime": 156.9288, + "eval_samples_per_second": 27.426, + "eval_steps_per_second": 3.428, + "step": 5000 + }, + { + "epoch": 0.8631977946243969, + "grad_norm": 23.482326020554645, + "learning_rate": 3.5041660903934306e-08, + "logits/chosen": -3.449277877807617, + "logits/rejected": -3.434354782104492, + "logps/chosen": -1.7843444347381592, + "logps/rejected": -1.9947608709335327, + "loss": 1.5855, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.5686888694763184, + "rewards/margins": 0.42083272337913513, + "rewards/rejected": -3.9895217418670654, + "step": 5010 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 23.977748460882307, + "learning_rate": 3.4972767448955513e-08, + "logits/chosen": -3.436682939529419, + "logits/rejected": -3.422436237335205, + "logps/chosen": -1.7571443319320679, + "logps/rejected": -1.9808824062347412, + "loss": 1.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5142886638641357, + "rewards/margins": 0.44747620820999146, + "rewards/rejected": -3.9617648124694824, + "step": 5020 + }, + { + "epoch": 0.8666436940041351, + "grad_norm": 22.345948678599367, + "learning_rate": 3.490378380947279e-08, + "logits/chosen": -3.4007182121276855, + "logits/rejected": -3.395232677459717, + "logps/chosen": -1.8646204471588135, + "logps/rejected": -2.009065628051758, + "loss": 1.6958, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.729240894317627, + "rewards/margins": 0.28889042139053345, + "rewards/rejected": -4.018131256103516, + "step": 5030 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 26.311796903425392, + "learning_rate": 3.483471060931051e-08, + "logits/chosen": -3.5531868934631348, + "logits/rejected": -3.529979705810547, + "logps/chosen": -1.7528127431869507, + "logps/rejected": -1.9295215606689453, + "loss": 1.6343, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.5056254863739014, + "rewards/margins": 0.35341760516166687, + "rewards/rejected": -3.8590431213378906, + "step": 5040 + }, + { + "epoch": 0.8700895933838731, + "grad_norm": 22.427144466443423, + "learning_rate": 3.476554847310294e-08, + "logits/chosen": -3.4700767993927, + "logits/rejected": -3.4561607837677, + "logps/chosen": -1.7912003993988037, + "logps/rejected": -2.0692501068115234, + "loss": 1.5332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.5824007987976074, + "rewards/margins": 0.5560997128486633, + "rewards/rejected": -4.138500213623047, + "step": 5050 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 25.23930941282526, + "learning_rate": 3.4696298026288585e-08, + "logits/chosen": -3.3803367614746094, + "logits/rejected": -3.375462293624878, + "logps/chosen": -1.8426294326782227, + "logps/rejected": -2.039417266845703, + "loss": 1.643, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.6852588653564453, + "rewards/margins": 0.39357537031173706, + "rewards/rejected": -4.078834533691406, + "step": 5060 + }, + { + "epoch": 0.8735354927636113, + "grad_norm": 24.055505582945006, + "learning_rate": 3.462695989510459e-08, + "logits/chosen": -3.4592792987823486, + "logits/rejected": -3.4443161487579346, + "logps/chosen": -1.7633063793182373, + "logps/rejected": -1.9285686016082764, + "loss": 1.6687, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.5266127586364746, + "rewards/margins": 0.33052510023117065, + "rewards/rejected": -3.8571372032165527, + "step": 5070 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 23.412439053646228, + "learning_rate": 3.4557534706580996e-08, + "logits/chosen": -3.562913179397583, + "logits/rejected": -3.543750762939453, + "logps/chosen": -1.7919371128082275, + "logps/rejected": -1.960331678390503, + "loss": 1.6713, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.583874225616455, + "rewards/margins": 0.3367890417575836, + "rewards/rejected": -3.920663356781006, + "step": 5080 + }, + { + "epoch": 0.8769813921433495, + "grad_norm": 22.927770654934342, + "learning_rate": 3.448802308853515e-08, + "logits/chosen": -3.491680860519409, + "logits/rejected": -3.47367525100708, + "logps/chosen": -1.7646631002426147, + "logps/rejected": -2.040038824081421, + "loss": 1.5229, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.5293262004852295, + "rewards/margins": 0.5507515072822571, + "rewards/rejected": -4.080077648162842, + "step": 5090 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 24.692481424124082, + "learning_rate": 3.441842566956595e-08, + "logits/chosen": -3.391857624053955, + "logits/rejected": -3.3718929290771484, + "logps/chosen": -1.7796776294708252, + "logps/rejected": -2.0360095500946045, + "loss": 1.5632, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.5593552589416504, + "rewards/margins": 0.512663722038269, + "rewards/rejected": -4.072019100189209, + "step": 5100 + }, + { + "epoch": 0.8787043418332184, + "eval_logits/chosen": -3.5313193798065186, + "eval_logits/rejected": -3.5275869369506836, + "eval_logps/chosen": -1.6906180381774902, + "eval_logps/rejected": -1.8524922132492065, + "eval_loss": 1.6321707963943481, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.3812360763549805, + "eval_rewards/margins": 0.323748379945755, + "eval_rewards/rejected": -3.704984426498413, + "eval_runtime": 157.0152, + "eval_samples_per_second": 27.411, + "eval_steps_per_second": 3.426, + "step": 5100 + }, + { + "epoch": 0.8804272915230875, + "grad_norm": 20.091934298860323, + "learning_rate": 3.434874307904822e-08, + "logits/chosen": -3.4726576805114746, + "logits/rejected": -3.451878070831299, + "logps/chosen": -1.823591947555542, + "logps/rejected": -2.0702402591705322, + "loss": 1.6193, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.647183895111084, + "rewards/margins": 0.49329671263694763, + "rewards/rejected": -4.1404805183410645, + "step": 5110 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 25.66992132735328, + "learning_rate": 3.427897594712699e-08, + "logits/chosen": -3.5178382396698, + "logits/rejected": -3.501192092895508, + "logps/chosen": -1.7830708026885986, + "logps/rejected": -1.9700489044189453, + "loss": 1.6495, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.5661416053771973, + "rewards/margins": 0.3739566206932068, + "rewards/rejected": -3.9400978088378906, + "step": 5120 + }, + { + "epoch": 0.8838731909028257, + "grad_norm": 23.564021064084905, + "learning_rate": 3.4209124904711807e-08, + "logits/chosen": -3.5170187950134277, + "logits/rejected": -3.499253749847412, + "logps/chosen": -1.78726327419281, + "logps/rejected": -1.9784603118896484, + "loss": 1.6313, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.57452654838562, + "rewards/margins": 0.3823941946029663, + "rewards/rejected": -3.956920623779297, + "step": 5130 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 19.290290458156893, + "learning_rate": 3.413919058347102e-08, + "logits/chosen": -3.4532470703125, + "logits/rejected": -3.433908462524414, + "logps/chosen": -1.8081576824188232, + "logps/rejected": -2.078204393386841, + "loss": 1.5691, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.6163153648376465, + "rewards/margins": 0.5400930643081665, + "rewards/rejected": -4.156408786773682, + "step": 5140 + }, + { + "epoch": 0.8873190902825637, + "grad_norm": 24.048640163328088, + "learning_rate": 3.40691736158261e-08, + "logits/chosen": -3.4991211891174316, + "logits/rejected": -3.495278835296631, + "logps/chosen": -1.8295818567276, + "logps/rejected": -1.9931118488311768, + "loss": 1.6488, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.6591637134552, + "rewards/margins": 0.3270602524280548, + "rewards/rejected": -3.9862236976623535, + "step": 5150 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 25.35321747433887, + "learning_rate": 3.399907463494585e-08, + "logits/chosen": -3.46759033203125, + "logits/rejected": -3.4492619037628174, + "logps/chosen": -1.855194330215454, + "logps/rejected": -2.0973479747772217, + "loss": 1.5843, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.710388660430908, + "rewards/margins": 0.4843073785305023, + "rewards/rejected": -4.194695949554443, + "step": 5160 + }, + { + "epoch": 0.8907649896623019, + "grad_norm": 27.07118927755968, + "learning_rate": 3.392889427474077e-08, + "logits/chosen": -3.458486557006836, + "logits/rejected": -3.44599986076355, + "logps/chosen": -1.7667970657348633, + "logps/rejected": -1.9593870639801025, + "loss": 1.6213, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.5335941314697266, + "rewards/margins": 0.3851797878742218, + "rewards/rejected": -3.918774127960205, + "step": 5170 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 35.26354581278769, + "learning_rate": 3.385863316985726e-08, + "logits/chosen": -3.5173115730285645, + "logits/rejected": -3.513126850128174, + "logps/chosen": -1.8368217945098877, + "logps/rejected": -1.9807236194610596, + "loss": 1.6729, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.6736435890197754, + "rewards/margins": 0.2878040373325348, + "rewards/rejected": -3.961447238922119, + "step": 5180 + }, + { + "epoch": 0.8942108890420399, + "grad_norm": 18.677050528045243, + "learning_rate": 3.3788291955671886e-08, + "logits/chosen": -3.4471518993377686, + "logits/rejected": -3.440293788909912, + "logps/chosen": -1.8185749053955078, + "logps/rejected": -2.0595288276672363, + "loss": 1.6074, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6371498107910156, + "rewards/margins": 0.48190706968307495, + "rewards/rejected": -4.119057655334473, + "step": 5190 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 23.123203480852855, + "learning_rate": 3.371787126828568e-08, + "logits/chosen": -3.470381259918213, + "logits/rejected": -3.4595398902893066, + "logps/chosen": -1.8523486852645874, + "logps/rejected": -2.1444194316864014, + "loss": 1.5213, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.704697370529175, + "rewards/margins": 0.5841414332389832, + "rewards/rejected": -4.288838863372803, + "step": 5200 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -3.519916296005249, + "eval_logits/rejected": -3.516139507293701, + "eval_logps/chosen": -1.7042819261550903, + "eval_logps/rejected": -1.8668677806854248, + "eval_loss": 1.6312977075576782, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.4085638523101807, + "eval_rewards/margins": 0.32517150044441223, + "eval_rewards/rejected": -3.7337355613708496, + "eval_runtime": 156.7107, + "eval_samples_per_second": 27.465, + "eval_steps_per_second": 3.433, + "step": 5200 + }, + { + "epoch": 0.8976567884217781, + "grad_norm": 26.654533244037623, + "learning_rate": 3.3647371744518336e-08, + "logits/chosen": -3.4601433277130127, + "logits/rejected": -3.4550070762634277, + "logps/chosen": -1.8672834634780884, + "logps/rejected": -1.955216646194458, + "loss": 1.7748, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.7345669269561768, + "rewards/margins": 0.17586641013622284, + "rewards/rejected": -3.910433292388916, + "step": 5210 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 23.48385049296065, + "learning_rate": 3.3576794021902476e-08, + "logits/chosen": -3.474984645843506, + "logits/rejected": -3.4691004753112793, + "logps/chosen": -1.8507674932479858, + "logps/rejected": -2.0187880992889404, + "loss": 1.6888, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.7015349864959717, + "rewards/margins": 0.3360414206981659, + "rewards/rejected": -4.037576198577881, + "step": 5220 + }, + { + "epoch": 0.9011026878015161, + "grad_norm": 20.30110098949633, + "learning_rate": 3.350613873867788e-08, + "logits/chosen": -3.445544481277466, + "logits/rejected": -3.439443588256836, + "logps/chosen": -1.7777087688446045, + "logps/rejected": -2.043583393096924, + "loss": 1.5198, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.555417537689209, + "rewards/margins": 0.53174889087677, + "rewards/rejected": -4.087166786193848, + "step": 5230 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 23.866199101420335, + "learning_rate": 3.343540653378571e-08, + "logits/chosen": -3.43855619430542, + "logits/rejected": -3.425544023513794, + "logps/chosen": -1.780425786972046, + "logps/rejected": -2.085043430328369, + "loss": 1.4592, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.560851573944092, + "rewards/margins": 0.6092349886894226, + "rewards/rejected": -4.170086860656738, + "step": 5240 + }, + { + "epoch": 0.9045485871812543, + "grad_norm": 18.368321046394197, + "learning_rate": 3.336459804686275e-08, + "logits/chosen": -3.4174113273620605, + "logits/rejected": -3.410311460494995, + "logps/chosen": -1.798803687095642, + "logps/rejected": -1.9860894680023193, + "loss": 1.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.597607374191284, + "rewards/margins": 0.3745712637901306, + "rewards/rejected": -3.9721789360046387, + "step": 5250 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 24.77755361297784, + "learning_rate": 3.3293713918235594e-08, + "logits/chosen": -3.4742958545684814, + "logits/rejected": -3.45210599899292, + "logps/chosen": -1.8229835033416748, + "logps/rejected": -2.079397201538086, + "loss": 1.5275, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6459670066833496, + "rewards/margins": 0.5128265619277954, + "rewards/rejected": -4.158794403076172, + "step": 5260 + }, + { + "epoch": 0.9079944865609925, + "grad_norm": 21.224348772086223, + "learning_rate": 3.3222754788914874e-08, + "logits/chosen": -3.5248515605926514, + "logits/rejected": -3.5174949169158936, + "logps/chosen": -1.801378607749939, + "logps/rejected": -2.012326240539551, + "loss": 1.6078, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.602757215499878, + "rewards/margins": 0.42189493775367737, + "rewards/rejected": -4.024652481079102, + "step": 5270 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 27.49494069933336, + "learning_rate": 3.315172130058946e-08, + "logits/chosen": -3.4629783630371094, + "logits/rejected": -3.4470486640930176, + "logps/chosen": -1.917335867881775, + "logps/rejected": -2.0798962116241455, + "loss": 1.7019, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.83467173576355, + "rewards/margins": 0.3251202702522278, + "rewards/rejected": -4.159792423248291, + "step": 5280 + }, + { + "epoch": 0.9114403859407305, + "grad_norm": 24.21135842067587, + "learning_rate": 3.308061409562065e-08, + "logits/chosen": -3.4218509197235107, + "logits/rejected": -3.396721601486206, + "logps/chosen": -1.7689292430877686, + "logps/rejected": -2.0648210048675537, + "loss": 1.4911, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.537858486175537, + "rewards/margins": 0.591783881187439, + "rewards/rejected": -4.129642009735107, + "step": 5290 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 25.773850992143156, + "learning_rate": 3.300943381703639e-08, + "logits/chosen": -3.434553623199463, + "logits/rejected": -3.4255855083465576, + "logps/chosen": -1.8069871664047241, + "logps/rejected": -2.082049608230591, + "loss": 1.5913, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.6139743328094482, + "rewards/margins": 0.5501248240470886, + "rewards/rejected": -4.164099216461182, + "step": 5300 + }, + { + "epoch": 0.9131633356305996, + "eval_logits/chosen": -3.527522563934326, + "eval_logits/rejected": -3.523801565170288, + "eval_logps/chosen": -1.7130297422409058, + "eval_logps/rejected": -1.87640380859375, + "eval_loss": 1.6305195093154907, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.4260594844818115, + "eval_rewards/margins": 0.3267482817173004, + "eval_rewards/rejected": -3.7528076171875, + "eval_runtime": 156.8945, + "eval_samples_per_second": 27.432, + "eval_steps_per_second": 3.429, + "step": 5300 + }, + { + "epoch": 0.9148862853204687, + "grad_norm": 22.5697137759111, + "learning_rate": 3.293818110852541e-08, + "logits/chosen": -3.5153770446777344, + "logits/rejected": -3.5095317363739014, + "logps/chosen": -1.854914665222168, + "logps/rejected": -1.942670464515686, + "loss": 1.7612, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.709829330444336, + "rewards/margins": 0.17551138997077942, + "rewards/rejected": -3.885340929031372, + "step": 5310 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 23.71567823314758, + "learning_rate": 3.286685661443144e-08, + "logits/chosen": -3.4561047554016113, + "logits/rejected": -3.4287009239196777, + "logps/chosen": -1.81234872341156, + "logps/rejected": -2.0797834396362305, + "loss": 1.5332, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.62469744682312, + "rewards/margins": 0.5348696112632751, + "rewards/rejected": -4.159566879272461, + "step": 5320 + }, + { + "epoch": 0.9183321847002067, + "grad_norm": 23.18707284228407, + "learning_rate": 3.279546097974738e-08, + "logits/chosen": -3.4201388359069824, + "logits/rejected": -3.4221673011779785, + "logps/chosen": -1.8354524374008179, + "logps/rejected": -2.073591709136963, + "loss": 1.662, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.6709048748016357, + "rewards/margins": 0.47627848386764526, + "rewards/rejected": -4.147183418273926, + "step": 5330 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 17.528944664642363, + "learning_rate": 3.272399485010943e-08, + "logits/chosen": -3.4467742443084717, + "logits/rejected": -3.4188027381896973, + "logps/chosen": -1.7887439727783203, + "logps/rejected": -2.1692209243774414, + "loss": 1.3935, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.5774879455566406, + "rewards/margins": 0.7609542012214661, + "rewards/rejected": -4.338441848754883, + "step": 5340 + }, + { + "epoch": 0.9217780840799449, + "grad_norm": 26.495234633005794, + "learning_rate": 3.265245887179133e-08, + "logits/chosen": -3.413238048553467, + "logits/rejected": -3.3924782276153564, + "logps/chosen": -1.8511384725570679, + "logps/rejected": -2.1196560859680176, + "loss": 1.5505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.7022769451141357, + "rewards/margins": 0.5370352864265442, + "rewards/rejected": -4.239312171936035, + "step": 5350 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 23.46845567097907, + "learning_rate": 3.2580853691698416e-08, + "logits/chosen": -3.4921023845672607, + "logits/rejected": -3.486508846282959, + "logps/chosen": -1.7721364498138428, + "logps/rejected": -2.045105457305908, + "loss": 1.5267, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.5442728996276855, + "rewards/margins": 0.5459376573562622, + "rewards/rejected": -4.090210914611816, + "step": 5360 + }, + { + "epoch": 0.9252239834596829, + "grad_norm": 26.311284297956107, + "learning_rate": 3.2509179957361865e-08, + "logits/chosen": -3.4313418865203857, + "logits/rejected": -3.4230151176452637, + "logps/chosen": -1.8184692859649658, + "logps/rejected": -2.067195415496826, + "loss": 1.5648, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.6369385719299316, + "rewards/margins": 0.49745216965675354, + "rewards/rejected": -4.134390830993652, + "step": 5370 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 24.075724830007495, + "learning_rate": 3.2437438316932765e-08, + "logits/chosen": -3.494786500930786, + "logits/rejected": -3.4722418785095215, + "logps/chosen": -1.8038562536239624, + "logps/rejected": -2.128657579421997, + "loss": 1.4941, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.607712507247925, + "rewards/margins": 0.6496024131774902, + "rewards/rejected": -4.257315158843994, + "step": 5380 + }, + { + "epoch": 0.9286698828394211, + "grad_norm": 24.216953616697193, + "learning_rate": 3.2365629419176294e-08, + "logits/chosen": -3.4475083351135254, + "logits/rejected": -3.4246983528137207, + "logps/chosen": -1.8919894695281982, + "logps/rejected": -2.0561435222625732, + "loss": 1.6625, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.7839789390563965, + "rewards/margins": 0.32830825448036194, + "rewards/rejected": -4.1122870445251465, + "step": 5390 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 23.046600036675528, + "learning_rate": 3.2293753913465856e-08, + "logits/chosen": -3.465648651123047, + "logits/rejected": -3.4570412635803223, + "logps/chosen": -1.7800188064575195, + "logps/rejected": -2.09928560256958, + "loss": 1.4784, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.560037612915039, + "rewards/margins": 0.6385337114334106, + "rewards/rejected": -4.19857120513916, + "step": 5400 + }, + { + "epoch": 0.9303928325292902, + "eval_logits/chosen": -3.5203566551208496, + "eval_logits/rejected": -3.516618013381958, + "eval_logps/chosen": -1.7217143774032593, + "eval_logps/rejected": -1.8859609365463257, + "eval_loss": 1.6297142505645752, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.4434287548065186, + "eval_rewards/margins": 0.32849326729774475, + "eval_rewards/rejected": -3.7719218730926514, + "eval_runtime": 156.8806, + "eval_samples_per_second": 27.435, + "eval_steps_per_second": 3.429, + "step": 5400 + }, + { + "epoch": 0.9321157822191593, + "grad_norm": 27.432916068113855, + "learning_rate": 3.2221812449777164e-08, + "logits/chosen": -3.4750492572784424, + "logits/rejected": -3.4684929847717285, + "logps/chosen": -1.9076690673828125, + "logps/rejected": -1.9822807312011719, + "loss": 1.7837, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.815338134765625, + "rewards/margins": 0.14922332763671875, + "rewards/rejected": -3.9645614624023438, + "step": 5410 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 31.548177955727024, + "learning_rate": 3.214980567868242e-08, + "logits/chosen": -3.494518280029297, + "logits/rejected": -3.48295259475708, + "logps/chosen": -1.8320766687393188, + "logps/rejected": -2.0443382263183594, + "loss": 1.5922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.6641533374786377, + "rewards/margins": 0.42452383041381836, + "rewards/rejected": -4.088676452636719, + "step": 5420 + }, + { + "epoch": 0.9355616815988973, + "grad_norm": 22.615790816547293, + "learning_rate": 3.2077734251344407e-08, + "logits/chosen": -3.451200008392334, + "logits/rejected": -3.4436163902282715, + "logps/chosen": -1.983635663986206, + "logps/rejected": -2.1832242012023926, + "loss": 1.6754, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.967271327972412, + "rewards/margins": 0.3991771936416626, + "rewards/rejected": -4.366448402404785, + "step": 5430 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 29.82551306105765, + "learning_rate": 3.200559881951059e-08, + "logits/chosen": -3.4564788341522217, + "logits/rejected": -3.446925640106201, + "logps/chosen": -1.901158332824707, + "logps/rejected": -2.111189842224121, + "loss": 1.6072, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.802316665649414, + "rewards/margins": 0.42006316781044006, + "rewards/rejected": -4.222379684448242, + "step": 5440 + }, + { + "epoch": 0.9390075809786355, + "grad_norm": 24.196715242305, + "learning_rate": 3.193340003550722e-08, + "logits/chosen": -3.4036948680877686, + "logits/rejected": -3.3952178955078125, + "logps/chosen": -1.901444673538208, + "logps/rejected": -2.038902759552002, + "loss": 1.7012, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.802889347076416, + "rewards/margins": 0.2749166488647461, + "rewards/rejected": -4.077805519104004, + "step": 5450 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 25.13560084649081, + "learning_rate": 3.186113855223348e-08, + "logits/chosen": -3.4810194969177246, + "logits/rejected": -3.4727416038513184, + "logps/chosen": -1.8953689336776733, + "logps/rejected": -2.019958972930908, + "loss": 1.7373, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.7907378673553467, + "rewards/margins": 0.24918052554130554, + "rewards/rejected": -4.039917945861816, + "step": 5460 + }, + { + "epoch": 0.9424534803583735, + "grad_norm": 28.38128737218778, + "learning_rate": 3.1788815023155517e-08, + "logits/chosen": -3.4398322105407715, + "logits/rejected": -3.427842617034912, + "logps/chosen": -1.8805954456329346, + "logps/rejected": -2.0653817653656006, + "loss": 1.6554, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.761190891265869, + "rewards/margins": 0.3695727288722992, + "rewards/rejected": -4.130763530731201, + "step": 5470 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 23.45126910911809, + "learning_rate": 3.171643010230057e-08, + "logits/chosen": -3.45288348197937, + "logits/rejected": -3.4391732215881348, + "logps/chosen": -1.8427280187606812, + "logps/rejected": -2.080603837966919, + "loss": 1.5714, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.6854560375213623, + "rewards/margins": 0.4757510721683502, + "rewards/rejected": -4.161207675933838, + "step": 5480 + }, + { + "epoch": 0.9458993797381117, + "grad_norm": 22.76004973007053, + "learning_rate": 3.1643984444251056e-08, + "logits/chosen": -3.4439144134521484, + "logits/rejected": -3.427624464035034, + "logps/chosen": -1.846697449684143, + "logps/rejected": -2.0807290077209473, + "loss": 1.6071, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.693394899368286, + "rewards/margins": 0.46806278824806213, + "rewards/rejected": -4.1614580154418945, + "step": 5490 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 21.686210554917487, + "learning_rate": 3.157147870413864e-08, + "logits/chosen": -3.491223096847534, + "logits/rejected": -3.4809367656707764, + "logps/chosen": -1.8210760354995728, + "logps/rejected": -2.0948100090026855, + "loss": 1.5188, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6421520709991455, + "rewards/margins": 0.5474672913551331, + "rewards/rejected": -4.189620018005371, + "step": 5500 + }, + { + "epoch": 0.9476223294279807, + "eval_logits/chosen": -3.508429765701294, + "eval_logits/rejected": -3.504648447036743, + "eval_logps/chosen": -1.7244927883148193, + "eval_logps/rejected": -1.8888623714447021, + "eval_loss": 1.6296459436416626, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.4489855766296387, + "eval_rewards/margins": 0.3287389278411865, + "eval_rewards/rejected": -3.7777247428894043, + "eval_runtime": 156.8463, + "eval_samples_per_second": 27.441, + "eval_steps_per_second": 3.43, + "step": 5500 + }, + { + "epoch": 0.9493452791178497, + "grad_norm": 22.480056600371512, + "learning_rate": 3.149891353763832e-08, + "logits/chosen": -3.443314790725708, + "logits/rejected": -3.433872938156128, + "logps/chosen": -1.8664919137954712, + "logps/rejected": -2.0479071140289307, + "loss": 1.6598, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.7329838275909424, + "rewards/margins": 0.3628304600715637, + "rewards/rejected": -4.095814228057861, + "step": 5510 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 26.34371615027587, + "learning_rate": 3.142628960096246e-08, + "logits/chosen": -3.4306321144104004, + "logits/rejected": -3.414475679397583, + "logps/chosen": -1.7662547826766968, + "logps/rejected": -2.004430055618286, + "loss": 1.5858, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.5325095653533936, + "rewards/margins": 0.47635045647621155, + "rewards/rejected": -4.008860111236572, + "step": 5520 + }, + { + "epoch": 0.9527911784975879, + "grad_norm": 24.69820768193378, + "learning_rate": 3.1353607550854935e-08, + "logits/chosen": -3.4556031227111816, + "logits/rejected": -3.4372177124023438, + "logps/chosen": -1.837798833847046, + "logps/rejected": -2.076709747314453, + "loss": 1.6088, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.675597667694092, + "rewards/margins": 0.4778224527835846, + "rewards/rejected": -4.153419494628906, + "step": 5530 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 22.80679210394031, + "learning_rate": 3.12808680445851e-08, + "logits/chosen": -3.441251754760742, + "logits/rejected": -3.4437785148620605, + "logps/chosen": -1.8699767589569092, + "logps/rejected": -2.1061248779296875, + "loss": 1.607, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.7399535179138184, + "rewards/margins": 0.4722967743873596, + "rewards/rejected": -4.212249755859375, + "step": 5540 + }, + { + "epoch": 0.956237077877326, + "grad_norm": 25.041001165882857, + "learning_rate": 3.120807173994194e-08, + "logits/chosen": -3.368950605392456, + "logits/rejected": -3.359483242034912, + "logps/chosen": -1.8627344369888306, + "logps/rejected": -2.063649892807007, + "loss": 1.5949, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.725468873977661, + "rewards/margins": 0.40183085203170776, + "rewards/rejected": -4.127299785614014, + "step": 5550 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 24.42391891179974, + "learning_rate": 3.1135219295228014e-08, + "logits/chosen": -3.450377941131592, + "logits/rejected": -3.436337947845459, + "logps/chosen": -1.8749735355377197, + "logps/rejected": -2.064882516860962, + "loss": 1.6355, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.7499470710754395, + "rewards/margins": 0.3798181414604187, + "rewards/rejected": -4.129765033721924, + "step": 5560 + }, + { + "epoch": 0.9596829772570641, + "grad_norm": 22.017687386743894, + "learning_rate": 3.1062311369253604e-08, + "logits/chosen": -3.4696338176727295, + "logits/rejected": -3.4641032218933105, + "logps/chosen": -1.7417465448379517, + "logps/rejected": -2.0727834701538086, + "loss": 1.4422, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.4834930896759033, + "rewards/margins": 0.6620740294456482, + "rewards/rejected": -4.145566940307617, + "step": 5570 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 23.081295603036544, + "learning_rate": 3.0989348621330694e-08, + "logits/chosen": -3.3996875286102295, + "logits/rejected": -3.3915317058563232, + "logps/chosen": -1.913442850112915, + "logps/rejected": -2.1058075428009033, + "loss": 1.6597, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.82688570022583, + "rewards/margins": 0.3847295343875885, + "rewards/rejected": -4.211615085601807, + "step": 5580 + }, + { + "epoch": 0.9631288766368022, + "grad_norm": 21.27563532909938, + "learning_rate": 3.091633171126703e-08, + "logits/chosen": -3.5047526359558105, + "logits/rejected": -3.4806740283966064, + "logps/chosen": -1.7807114124298096, + "logps/rejected": -2.169731616973877, + "loss": 1.3863, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.561422824859619, + "rewards/margins": 0.7780405282974243, + "rewards/rejected": -4.339463233947754, + "step": 5590 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 24.126556458717722, + "learning_rate": 3.0843261299360165e-08, + "logits/chosen": -3.477612257003784, + "logits/rejected": -3.476505756378174, + "logps/chosen": -1.897589087486267, + "logps/rejected": -2.0767481327056885, + "loss": 1.6448, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.795178174972534, + "rewards/margins": 0.35831841826438904, + "rewards/rejected": -4.153496265411377, + "step": 5600 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -3.5116043090820312, + "eval_logits/rejected": -3.507857084274292, + "eval_logps/chosen": -1.726783275604248, + "eval_logps/rejected": -1.892014980316162, + "eval_loss": 1.628503680229187, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.453566551208496, + "eval_rewards/margins": 0.3304632008075714, + "eval_rewards/rejected": -3.784029960632324, + "eval_runtime": 157.0458, + "eval_samples_per_second": 27.406, + "eval_steps_per_second": 3.426, + "step": 5600 + }, + { + "epoch": 0.9665747760165403, + "grad_norm": 21.770878338189856, + "learning_rate": 3.077013804639144e-08, + "logits/chosen": -3.4865195751190186, + "logits/rejected": -3.476513624191284, + "logps/chosen": -1.835084319114685, + "logps/rejected": -2.208655834197998, + "loss": 1.4075, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.67016863822937, + "rewards/margins": 0.7471426725387573, + "rewards/rejected": -4.417311668395996, + "step": 5610 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 28.056301609555593, + "learning_rate": 3.069696261362008e-08, + "logits/chosen": -3.4394924640655518, + "logits/rejected": -3.4271035194396973, + "logps/chosen": -1.8765770196914673, + "logps/rejected": -2.0235655307769775, + "loss": 1.6639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.7531540393829346, + "rewards/margins": 0.29397720098495483, + "rewards/rejected": -4.047131061553955, + "step": 5620 + }, + { + "epoch": 0.9700206753962785, + "grad_norm": 23.462158731921292, + "learning_rate": 3.062373566277715e-08, + "logits/chosen": -3.4689953327178955, + "logits/rejected": -3.453500270843506, + "logps/chosen": -1.835611343383789, + "logps/rejected": -1.9687402248382568, + "loss": 1.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.671222686767578, + "rewards/margins": 0.2662583589553833, + "rewards/rejected": -3.9374804496765137, + "step": 5630 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 26.82361117873529, + "learning_rate": 3.0550457856059594e-08, + "logits/chosen": -3.457603931427002, + "logits/rejected": -3.449570417404175, + "logps/chosen": -1.7727670669555664, + "logps/rejected": -2.003467321395874, + "loss": 1.5793, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.545534133911133, + "rewards/margins": 0.46140056848526, + "rewards/rejected": -4.006934642791748, + "step": 5640 + }, + { + "epoch": 0.9734665747760165, + "grad_norm": 27.307882908107704, + "learning_rate": 3.047712985612428e-08, + "logits/chosen": -3.3996150493621826, + "logits/rejected": -3.3957576751708984, + "logps/chosen": -1.8862718343734741, + "logps/rejected": -1.9974828958511353, + "loss": 1.7437, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.7725436687469482, + "rewards/margins": 0.2224218100309372, + "rewards/rejected": -3.9949657917022705, + "step": 5650 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 24.177885181258883, + "learning_rate": 3.040375232608194e-08, + "logits/chosen": -3.401787281036377, + "logits/rejected": -3.399714708328247, + "logps/chosen": -1.8606195449829102, + "logps/rejected": -1.9279016256332397, + "loss": 1.7903, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.7212390899658203, + "rewards/margins": 0.13456431031227112, + "rewards/rejected": -3.8558032512664795, + "step": 5660 + }, + { + "epoch": 0.9769124741557547, + "grad_norm": 23.138716520346335, + "learning_rate": 3.033032592949125e-08, + "logits/chosen": -3.4348037242889404, + "logits/rejected": -3.420405864715576, + "logps/chosen": -1.809854507446289, + "logps/rejected": -2.042843818664551, + "loss": 1.5637, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.619709014892578, + "rewards/margins": 0.46597838401794434, + "rewards/rejected": -4.085687637329102, + "step": 5670 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 23.992188096337262, + "learning_rate": 3.025685133035275e-08, + "logits/chosen": -3.4628615379333496, + "logits/rejected": -3.4410738945007324, + "logps/chosen": -1.9035263061523438, + "logps/rejected": -2.198092222213745, + "loss": 1.4623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.8070526123046875, + "rewards/margins": 0.5891319513320923, + "rewards/rejected": -4.39618444442749, + "step": 5680 + }, + { + "epoch": 0.9803583735354927, + "grad_norm": 23.335011036519447, + "learning_rate": 3.0183329193102894e-08, + "logits/chosen": -3.479776382446289, + "logits/rejected": -3.4632720947265625, + "logps/chosen": -1.8539769649505615, + "logps/rejected": -1.9922962188720703, + "loss": 1.6705, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.707953929901123, + "rewards/margins": 0.2766384184360504, + "rewards/rejected": -3.9845924377441406, + "step": 5690 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 27.85466847945764, + "learning_rate": 3.0109760182608054e-08, + "logits/chosen": -3.3461086750030518, + "logits/rejected": -3.334786891937256, + "logps/chosen": -1.8702213764190674, + "logps/rejected": -2.032609462738037, + "loss": 1.6912, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.7404427528381348, + "rewards/margins": 0.3247763216495514, + "rewards/rejected": -4.065218925476074, + "step": 5700 + }, + { + "epoch": 0.9820813232253618, + "eval_logits/chosen": -3.5151638984680176, + "eval_logits/rejected": -3.511449098587036, + "eval_logps/chosen": -1.7320307493209839, + "eval_logps/rejected": -1.898160457611084, + "eval_loss": 1.6277430057525635, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.4640614986419678, + "eval_rewards/margins": 0.3322596549987793, + "eval_rewards/rejected": -3.796320915222168, + "eval_runtime": 156.9958, + "eval_samples_per_second": 27.415, + "eval_steps_per_second": 3.427, + "step": 5700 + }, + { + "epoch": 0.9838042729152309, + "grad_norm": 25.288215657450817, + "learning_rate": 3.0036144964158425e-08, + "logits/chosen": -3.4685661792755127, + "logits/rejected": -3.4522743225097656, + "logps/chosen": -1.875012993812561, + "logps/rejected": -2.1395294666290283, + "loss": 1.5234, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.750025987625122, + "rewards/margins": 0.5290330648422241, + "rewards/rejected": -4.279058933258057, + "step": 5710 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 24.45229293787202, + "learning_rate": 2.9962484203462114e-08, + "logits/chosen": -3.458885669708252, + "logits/rejected": -3.446441173553467, + "logps/chosen": -1.8547947406768799, + "logps/rejected": -2.0839366912841797, + "loss": 1.6295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.7095894813537598, + "rewards/margins": 0.45828431844711304, + "rewards/rejected": -4.167873382568359, + "step": 5720 + }, + { + "epoch": 0.987250172294969, + "grad_norm": 24.21727757095132, + "learning_rate": 2.988877856663905e-08, + "logits/chosen": -3.482609272003174, + "logits/rejected": -3.4758262634277344, + "logps/chosen": -1.8282434940338135, + "logps/rejected": -2.0329113006591797, + "loss": 1.6221, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.656486988067627, + "rewards/margins": 0.40933576226234436, + "rewards/rejected": -4.065822601318359, + "step": 5730 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 21.771930400819205, + "learning_rate": 2.9815028720214984e-08, + "logits/chosen": -3.4262423515319824, + "logits/rejected": -3.409010410308838, + "logps/chosen": -1.9302985668182373, + "logps/rejected": -2.1985549926757812, + "loss": 1.5441, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8605971336364746, + "rewards/margins": 0.5365130305290222, + "rewards/rejected": -4.3971099853515625, + "step": 5740 + }, + { + "epoch": 0.9906960716747071, + "grad_norm": 26.19161604505015, + "learning_rate": 2.974123533111545e-08, + "logits/chosen": -3.5193488597869873, + "logits/rejected": -3.5023579597473145, + "logps/chosen": -1.929686188697815, + "logps/rejected": -2.1155102252960205, + "loss": 1.6228, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.85937237739563, + "rewards/margins": 0.3716481328010559, + "rewards/rejected": -4.231020450592041, + "step": 5750 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 22.879103461663533, + "learning_rate": 2.9667399066659754e-08, + "logits/chosen": -3.4188640117645264, + "logits/rejected": -3.404308795928955, + "logps/chosen": -1.8672975301742554, + "logps/rejected": -2.115143060684204, + "loss": 1.582, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.7345950603485107, + "rewards/margins": 0.4956907331943512, + "rewards/rejected": -4.230286121368408, + "step": 5760 + }, + { + "epoch": 0.9941419710544452, + "grad_norm": 21.58004546185313, + "learning_rate": 2.959352059455492e-08, + "logits/chosen": -3.4075675010681152, + "logits/rejected": -3.397751569747925, + "logps/chosen": -1.8189910650253296, + "logps/rejected": -2.0612027645111084, + "loss": 1.534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.637982130050659, + "rewards/margins": 0.4844236373901367, + "rewards/rejected": -4.122405529022217, + "step": 5770 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 26.007180706054623, + "learning_rate": 2.9519600582889654e-08, + "logits/chosen": -3.433157444000244, + "logits/rejected": -3.422635555267334, + "logps/chosen": -1.8221524953842163, + "logps/rejected": -2.101191997528076, + "loss": 1.5121, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.6443049907684326, + "rewards/margins": 0.5580787658691406, + "rewards/rejected": -4.202383995056152, + "step": 5780 + }, + { + "epoch": 0.9975878704341833, + "grad_norm": 25.62039163724806, + "learning_rate": 2.944563970012831e-08, + "logits/chosen": -3.388105869293213, + "logits/rejected": -3.3706886768341064, + "logps/chosen": -1.8484153747558594, + "logps/rejected": -2.1404757499694824, + "loss": 1.5763, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.6968307495117188, + "rewards/margins": 0.5841199159622192, + "rewards/rejected": -4.280951499938965, + "step": 5790 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 29.305292852224163, + "learning_rate": 2.937163861510486e-08, + "logits/chosen": -3.429935932159424, + "logits/rejected": -3.4159095287323, + "logps/chosen": -1.8138043880462646, + "logps/rejected": -2.1608500480651855, + "loss": 1.4687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6276087760925293, + "rewards/margins": 0.6940909624099731, + "rewards/rejected": -4.321700096130371, + "step": 5800 + }, + { + "epoch": 0.9993108201240524, + "eval_logits/chosen": -3.5001158714294434, + "eval_logits/rejected": -3.496344804763794, + "eval_logps/chosen": -1.7407336235046387, + "eval_logps/rejected": -1.9080629348754883, + "eval_loss": 1.626598834991455, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.4814672470092773, + "eval_rewards/margins": 0.3346584141254425, + "eval_rewards/rejected": -3.8161258697509766, + "eval_runtime": 156.8336, + "eval_samples_per_second": 27.443, + "eval_steps_per_second": 3.43, + "step": 5800 + }, + { + "epoch": 1.0010337698139213, + "grad_norm": 19.074283720600217, + "learning_rate": 2.92975979970168e-08, + "logits/chosen": -3.477383852005005, + "logits/rejected": -3.4689598083496094, + "logps/chosen": -1.9524990320205688, + "logps/rejected": -2.1251003742218018, + "loss": 1.6742, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.9049980640411377, + "rewards/margins": 0.34520265460014343, + "rewards/rejected": -4.2502007484436035, + "step": 5810 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 28.599669977115482, + "learning_rate": 2.9223518515419147e-08, + "logits/chosen": -3.486095428466797, + "logits/rejected": -3.4694766998291016, + "logps/chosen": -1.8832248449325562, + "logps/rejected": -2.137240171432495, + "loss": 1.55, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.7664496898651123, + "rewards/margins": 0.5080300569534302, + "rewards/rejected": -4.27448034286499, + "step": 5820 + }, + { + "epoch": 1.0044796691936595, + "grad_norm": 23.630832861892266, + "learning_rate": 2.914940084021836e-08, + "logits/chosen": -3.4008426666259766, + "logits/rejected": -3.3829505443573, + "logps/chosen": -1.8125168085098267, + "logps/rejected": -2.121213912963867, + "loss": 1.493, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.6250336170196533, + "rewards/margins": 0.6173943877220154, + "rewards/rejected": -4.242427825927734, + "step": 5830 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 21.481046031290845, + "learning_rate": 2.9075245641666278e-08, + "logits/chosen": -3.417980909347534, + "logits/rejected": -3.406219482421875, + "logps/chosen": -1.842913269996643, + "logps/rejected": -2.225130558013916, + "loss": 1.4198, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.685826539993286, + "rewards/margins": 0.7644348740577698, + "rewards/rejected": -4.450261116027832, + "step": 5840 + }, + { + "epoch": 1.0079255685733977, + "grad_norm": 24.80943310460254, + "learning_rate": 2.9001053590354075e-08, + "logits/chosen": -3.4700305461883545, + "logits/rejected": -3.4580581188201904, + "logps/chosen": -1.801783800125122, + "logps/rejected": -2.134066104888916, + "loss": 1.4751, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.603567600250244, + "rewards/margins": 0.6645644903182983, + "rewards/rejected": -4.268132209777832, + "step": 5850 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 25.924038071995827, + "learning_rate": 2.8926825357206174e-08, + "logits/chosen": -3.392603635787964, + "logits/rejected": -3.385313034057617, + "logps/chosen": -1.927787184715271, + "logps/rejected": -2.1839494705200195, + "loss": 1.6092, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.855574369430542, + "rewards/margins": 0.5123244524002075, + "rewards/rejected": -4.367898941040039, + "step": 5860 + }, + { + "epoch": 1.0113714679531358, + "grad_norm": 24.513730805754744, + "learning_rate": 2.8852561613474213e-08, + "logits/chosen": -3.403587818145752, + "logits/rejected": -3.3922696113586426, + "logps/chosen": -1.9038444757461548, + "logps/rejected": -2.21995210647583, + "loss": 1.5805, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.8076889514923096, + "rewards/margins": 0.6322157382965088, + "rewards/rejected": -4.43990421295166, + "step": 5870 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 25.705905724393045, + "learning_rate": 2.8778263030730937e-08, + "logits/chosen": -3.451251983642578, + "logits/rejected": -3.444547653198242, + "logps/chosen": -1.8582789897918701, + "logps/rejected": -2.0448267459869385, + "loss": 1.6544, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.7165579795837402, + "rewards/margins": 0.3730953335762024, + "rewards/rejected": -4.089653491973877, + "step": 5880 + }, + { + "epoch": 1.014817367332874, + "grad_norm": 24.632481399640593, + "learning_rate": 2.8703930280864165e-08, + "logits/chosen": -3.4650187492370605, + "logits/rejected": -3.4573676586151123, + "logps/chosen": -1.8261702060699463, + "logps/rejected": -2.0655648708343506, + "loss": 1.5858, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.6523404121398926, + "rewards/margins": 0.47878989577293396, + "rewards/rejected": -4.131129741668701, + "step": 5890 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 20.774153192161798, + "learning_rate": 2.8629564036070662e-08, + "logits/chosen": -3.4153225421905518, + "logits/rejected": -3.4047656059265137, + "logps/chosen": -1.8864538669586182, + "logps/rejected": -2.1383233070373535, + "loss": 1.5634, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.7729077339172363, + "rewards/margins": 0.5037394762039185, + "rewards/rejected": -4.276646614074707, + "step": 5900 + }, + { + "epoch": 1.016540317022743, + "eval_logits/chosen": -3.497972011566162, + "eval_logits/rejected": -3.49422550201416, + "eval_logps/chosen": -1.7443777322769165, + "eval_logps/rejected": -1.9121525287628174, + "eval_loss": 1.626189947128296, + "eval_rewards/accuracies": 0.6231412887573242, + "eval_rewards/chosen": -3.488755464553833, + "eval_rewards/margins": 0.33554959297180176, + "eval_rewards/rejected": -3.8243050575256348, + "eval_runtime": 156.9512, + "eval_samples_per_second": 27.423, + "eval_steps_per_second": 3.428, + "step": 5900 + }, + { + "epoch": 1.018263266712612, + "grad_norm": 24.97340497532853, + "learning_rate": 2.8555164968850108e-08, + "logits/chosen": -3.406592607498169, + "logits/rejected": -3.4084973335266113, + "logps/chosen": -1.8783107995986938, + "logps/rejected": -2.0737977027893066, + "loss": 1.6129, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7566215991973877, + "rewards/margins": 0.3909732699394226, + "rewards/rejected": -4.147595405578613, + "step": 5910 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 25.919478710430383, + "learning_rate": 2.848073375199901e-08, + "logits/chosen": -3.4497196674346924, + "logits/rejected": -3.441281795501709, + "logps/chosen": -1.8622204065322876, + "logps/rejected": -2.09135103225708, + "loss": 1.6281, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.724440813064575, + "rewards/margins": 0.4582609236240387, + "rewards/rejected": -4.18270206451416, + "step": 5920 + }, + { + "epoch": 1.02170916609235, + "grad_norm": 24.139150972904417, + "learning_rate": 2.8406271058604575e-08, + "logits/chosen": -3.443845272064209, + "logits/rejected": -3.441237688064575, + "logps/chosen": -1.8999731540679932, + "logps/rejected": -2.0663833618164062, + "loss": 1.6628, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.7999463081359863, + "rewards/margins": 0.33282095193862915, + "rewards/rejected": -4.1327667236328125, + "step": 5930 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 19.664272882110275, + "learning_rate": 2.8331777562038677e-08, + "logits/chosen": -3.4517855644226074, + "logits/rejected": -3.4285035133361816, + "logps/chosen": -1.8013420104980469, + "logps/rejected": -2.1896469593048096, + "loss": 1.41, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6026840209960938, + "rewards/margins": 0.7766100168228149, + "rewards/rejected": -4.379293918609619, + "step": 5940 + }, + { + "epoch": 1.0251550654720882, + "grad_norm": 21.78299231434559, + "learning_rate": 2.8257253935951754e-08, + "logits/chosen": -3.3678860664367676, + "logits/rejected": -3.367457628250122, + "logps/chosen": -1.8912885189056396, + "logps/rejected": -2.0855536460876465, + "loss": 1.6459, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.7825770378112793, + "rewards/margins": 0.38853010535240173, + "rewards/rejected": -4.171107292175293, + "step": 5950 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 30.700697066583697, + "learning_rate": 2.8182700854266677e-08, + "logits/chosen": -3.4003825187683105, + "logits/rejected": -3.372554063796997, + "logps/chosen": -1.8274500370025635, + "logps/rejected": -2.081688404083252, + "loss": 1.5383, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.654900074005127, + "rewards/margins": 0.5084772109985352, + "rewards/rejected": -4.163376808166504, + "step": 5960 + }, + { + "epoch": 1.0286009648518264, + "grad_norm": 24.426358092375974, + "learning_rate": 2.8108118991172713e-08, + "logits/chosen": -3.3900794982910156, + "logits/rejected": -3.38141131401062, + "logps/chosen": -1.9039766788482666, + "logps/rejected": -2.126392364501953, + "loss": 1.5982, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.807953357696533, + "rewards/margins": 0.4448317587375641, + "rewards/rejected": -4.252784729003906, + "step": 5970 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 27.41112934448588, + "learning_rate": 2.8033509021119394e-08, + "logits/chosen": -3.399916410446167, + "logits/rejected": -3.3990578651428223, + "logps/chosen": -1.840656042098999, + "logps/rejected": -2.082005739212036, + "loss": 1.595, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.681312084197998, + "rewards/margins": 0.4826989769935608, + "rewards/rejected": -4.164011478424072, + "step": 5980 + }, + { + "epoch": 1.0320468642315643, + "grad_norm": 23.28974782120899, + "learning_rate": 2.7958871618810432e-08, + "logits/chosen": -3.442197799682617, + "logits/rejected": -3.4227020740509033, + "logps/chosen": -1.8498620986938477, + "logps/rejected": -2.1586685180664062, + "loss": 1.5335, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.6997241973876953, + "rewards/margins": 0.6176121830940247, + "rewards/rejected": -4.3173370361328125, + "step": 5990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 23.536685132341713, + "learning_rate": 2.7884207459197584e-08, + "logits/chosen": -3.4320194721221924, + "logits/rejected": -3.4225456714630127, + "logps/chosen": -1.8912874460220337, + "logps/rejected": -2.1405069828033447, + "loss": 1.5389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7825748920440674, + "rewards/margins": 0.4984396994113922, + "rewards/rejected": -4.2810139656066895, + "step": 6000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -3.50722074508667, + "eval_logits/rejected": -3.5035240650177, + "eval_logps/chosen": -1.7475978136062622, + "eval_logps/rejected": -1.9156893491744995, + "eval_loss": 1.6257579326629639, + "eval_rewards/accuracies": 0.6233736276626587, + "eval_rewards/chosen": -3.4951956272125244, + "eval_rewards/margins": 0.33618319034576416, + "eval_rewards/rejected": -3.831378698348999, + "eval_runtime": 156.6875, + "eval_samples_per_second": 27.469, + "eval_steps_per_second": 3.434, + "step": 6000 + }, + { + "epoch": 1.0354927636113025, + "grad_norm": 26.23781139054016, + "learning_rate": 2.780951721747461e-08, + "logits/chosen": -3.429410934448242, + "logits/rejected": -3.420280933380127, + "logps/chosen": -1.9090255498886108, + "logps/rejected": -2.135810136795044, + "loss": 1.6307, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.8180510997772217, + "rewards/margins": 0.4535690248012543, + "rewards/rejected": -4.271620273590088, + "step": 6010 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 21.494519207349686, + "learning_rate": 2.7734801569071104e-08, + "logits/chosen": -3.530848264694214, + "logits/rejected": -3.511420726776123, + "logps/chosen": -1.8210960626602173, + "logps/rejected": -2.026543140411377, + "loss": 1.6498, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.6421921253204346, + "rewards/margins": 0.4108942449092865, + "rewards/rejected": -4.053086280822754, + "step": 6020 + }, + { + "epoch": 1.0389386629910407, + "grad_norm": 23.930028169463007, + "learning_rate": 2.766006118964644e-08, + "logits/chosen": -3.3164703845977783, + "logits/rejected": -3.31402587890625, + "logps/chosen": -1.9535490274429321, + "logps/rejected": -2.115110158920288, + "loss": 1.7733, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.9070980548858643, + "rewards/margins": 0.32312247157096863, + "rewards/rejected": -4.230220317840576, + "step": 6030 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 37.893524140386816, + "learning_rate": 2.7585296755083613e-08, + "logits/chosen": -3.446110248565674, + "logits/rejected": -3.4356002807617188, + "logps/chosen": -1.9076626300811768, + "logps/rejected": -2.0944972038269043, + "loss": 1.6637, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.8153252601623535, + "rewards/margins": 0.3736687898635864, + "rewards/rejected": -4.188994407653809, + "step": 6040 + }, + { + "epoch": 1.0423845623707788, + "grad_norm": 22.600813871043794, + "learning_rate": 2.751050894148317e-08, + "logits/chosen": -3.3773860931396484, + "logits/rejected": -3.362931728363037, + "logps/chosen": -1.9419740438461304, + "logps/rejected": -2.093290328979492, + "loss": 1.7129, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.8839480876922607, + "rewards/margins": 0.30263298749923706, + "rewards/rejected": -4.186580657958984, + "step": 6050 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 24.39805773535005, + "learning_rate": 2.7435698425157065e-08, + "logits/chosen": -3.424407958984375, + "logits/rejected": -3.4114749431610107, + "logps/chosen": -1.8529952764511108, + "logps/rejected": -2.066490888595581, + "loss": 1.5953, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7059905529022217, + "rewards/margins": 0.42699113488197327, + "rewards/rejected": -4.132981777191162, + "step": 6060 + }, + { + "epoch": 1.045830461750517, + "grad_norm": 27.969370141486593, + "learning_rate": 2.7360865882622558e-08, + "logits/chosen": -3.44199800491333, + "logits/rejected": -3.4312336444854736, + "logps/chosen": -1.8919410705566406, + "logps/rejected": -2.101566791534424, + "loss": 1.5985, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7838821411132812, + "rewards/margins": 0.4192514419555664, + "rewards/rejected": -4.203133583068848, + "step": 6070 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 23.460489683430584, + "learning_rate": 2.7286011990596092e-08, + "logits/chosen": -3.411041259765625, + "logits/rejected": -3.3989486694335938, + "logps/chosen": -1.9078261852264404, + "logps/rejected": -2.1355011463165283, + "loss": 1.6063, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.815652370452881, + "rewards/margins": 0.45534953474998474, + "rewards/rejected": -4.271002292633057, + "step": 6080 + }, + { + "epoch": 1.049276361130255, + "grad_norm": 26.216497526430526, + "learning_rate": 2.7211137425987175e-08, + "logits/chosen": -3.4547970294952393, + "logits/rejected": -3.4468891620635986, + "logps/chosen": -1.8301016092300415, + "logps/rejected": -2.054182529449463, + "loss": 1.5458, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.660203218460083, + "rewards/margins": 0.4481622278690338, + "rewards/rejected": -4.108365058898926, + "step": 6090 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 23.298143672367512, + "learning_rate": 2.7136242865892268e-08, + "logits/chosen": -3.426401138305664, + "logits/rejected": -3.42529034614563, + "logps/chosen": -1.9562355279922485, + "logps/rejected": -2.0616672039031982, + "loss": 1.7463, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.912471055984497, + "rewards/margins": 0.21086356043815613, + "rewards/rejected": -4.1233344078063965, + "step": 6100 + }, + { + "epoch": 1.050999310820124, + "eval_logits/chosen": -3.496652364730835, + "eval_logits/rejected": -3.492926836013794, + "eval_logps/chosen": -1.7541241645812988, + "eval_logps/rejected": -1.9226609468460083, + "eval_loss": 1.6252524852752686, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.5082483291625977, + "eval_rewards/margins": 0.3370734453201294, + "eval_rewards/rejected": -3.8453218936920166, + "eval_runtime": 156.9703, + "eval_samples_per_second": 27.419, + "eval_steps_per_second": 3.427, + "step": 6100 + }, + { + "epoch": 1.052722260509993, + "grad_norm": 24.223531615785042, + "learning_rate": 2.7061328987588627e-08, + "logits/chosen": -3.4404990673065186, + "logits/rejected": -3.4312336444854736, + "logps/chosen": -1.8657020330429077, + "logps/rejected": -2.0769143104553223, + "loss": 1.5788, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7314040660858154, + "rewards/margins": 0.4224247336387634, + "rewards/rejected": -4.1538286209106445, + "step": 6110 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 18.6569178792697, + "learning_rate": 2.698639646852824e-08, + "logits/chosen": -3.4975762367248535, + "logits/rejected": -3.4713234901428223, + "logps/chosen": -1.7752326726913452, + "logps/rejected": -2.175679922103882, + "loss": 1.3756, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5504653453826904, + "rewards/margins": 0.8008943796157837, + "rewards/rejected": -4.351359844207764, + "step": 6120 + }, + { + "epoch": 1.0561681598897312, + "grad_norm": 26.276483384691023, + "learning_rate": 2.6911445986331633e-08, + "logits/chosen": -3.4394097328186035, + "logits/rejected": -3.4263694286346436, + "logps/chosen": -1.88412344455719, + "logps/rejected": -2.210101366043091, + "loss": 1.476, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.76824688911438, + "rewards/margins": 0.6519559621810913, + "rewards/rejected": -4.420202732086182, + "step": 6130 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 26.564349727605897, + "learning_rate": 2.68364782187818e-08, + "logits/chosen": -3.458695888519287, + "logits/rejected": -3.4527955055236816, + "logps/chosen": -1.959539771080017, + "logps/rejected": -2.103034496307373, + "loss": 1.7347, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.919079542160034, + "rewards/margins": 0.28698983788490295, + "rewards/rejected": -4.206068992614746, + "step": 6140 + }, + { + "epoch": 1.0596140592694694, + "grad_norm": 24.21342855825975, + "learning_rate": 2.676149384381803e-08, + "logits/chosen": -3.397584915161133, + "logits/rejected": -3.3889026641845703, + "logps/chosen": -1.9308334589004517, + "logps/rejected": -2.074594020843506, + "loss": 1.6873, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.8616669178009033, + "rewards/margins": 0.28752079606056213, + "rewards/rejected": -4.149188041687012, + "step": 6150 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 25.06883042468319, + "learning_rate": 2.66864935395298e-08, + "logits/chosen": -3.3745269775390625, + "logits/rejected": -3.3683676719665527, + "logps/chosen": -1.8180949687957764, + "logps/rejected": -2.0215163230895996, + "loss": 1.6042, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.6361899375915527, + "rewards/margins": 0.40684255957603455, + "rewards/rejected": -4.043032646179199, + "step": 6160 + }, + { + "epoch": 1.0630599586492075, + "grad_norm": 25.854869367526174, + "learning_rate": 2.6611477984150627e-08, + "logits/chosen": -3.444981336593628, + "logits/rejected": -3.435189723968506, + "logps/chosen": -1.962073564529419, + "logps/rejected": -2.1542935371398926, + "loss": 1.609, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.924147129058838, + "rewards/margins": 0.3844393789768219, + "rewards/rejected": -4.308587074279785, + "step": 6170 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 25.36834713278627, + "learning_rate": 2.6536447856051963e-08, + "logits/chosen": -3.455448865890503, + "logits/rejected": -3.4410884380340576, + "logps/chosen": -1.88705575466156, + "logps/rejected": -2.1044223308563232, + "loss": 1.5958, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.77411150932312, + "rewards/margins": 0.43473249673843384, + "rewards/rejected": -4.2088446617126465, + "step": 6180 + }, + { + "epoch": 1.0665058580289455, + "grad_norm": 23.304252823764156, + "learning_rate": 2.646140383373704e-08, + "logits/chosen": -3.4283642768859863, + "logits/rejected": -3.414384126663208, + "logps/chosen": -1.9061508178710938, + "logps/rejected": -2.1185643672943115, + "loss": 1.6208, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8123016357421875, + "rewards/margins": 0.42482733726501465, + "rewards/rejected": -4.237128734588623, + "step": 6190 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 26.198656505706957, + "learning_rate": 2.638634659583472e-08, + "logits/chosen": -3.3760695457458496, + "logits/rejected": -3.3652126789093018, + "logps/chosen": -1.9306923151016235, + "logps/rejected": -2.2100319862365723, + "loss": 1.5582, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.861384630203247, + "rewards/margins": 0.5586789846420288, + "rewards/rejected": -4.4200639724731445, + "step": 6200 + }, + { + "epoch": 1.0682288077188147, + "eval_logits/chosen": -3.49644136428833, + "eval_logits/rejected": -3.492737054824829, + "eval_logps/chosen": -1.7610955238342285, + "eval_logps/rejected": -1.9302524328231812, + "eval_loss": 1.6247905492782593, + "eval_rewards/accuracies": 0.6231412887573242, + "eval_rewards/chosen": -3.522191047668457, + "eval_rewards/margins": 0.3383132815361023, + "eval_rewards/rejected": -3.8605048656463623, + "eval_runtime": 157.4917, + "eval_samples_per_second": 27.328, + "eval_steps_per_second": 3.416, + "step": 6200 + }, + { + "epoch": 1.0699517574086836, + "grad_norm": 25.077516569880284, + "learning_rate": 2.6311276821093382e-08, + "logits/chosen": -3.4414830207824707, + "logits/rejected": -3.425611972808838, + "logps/chosen": -1.8718812465667725, + "logps/rejected": -2.1334996223449707, + "loss": 1.4955, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.743762493133545, + "rewards/margins": 0.5232369899749756, + "rewards/rejected": -4.266999244689941, + "step": 6210 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 23.031062244448684, + "learning_rate": 2.62361951883748e-08, + "logits/chosen": -3.4298770427703857, + "logits/rejected": -3.4193503856658936, + "logps/chosen": -1.873711347579956, + "logps/rejected": -2.1638543605804443, + "loss": 1.5182, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.747422695159912, + "rewards/margins": 0.5802858471870422, + "rewards/rejected": -4.327708721160889, + "step": 6220 + }, + { + "epoch": 1.0733976567884218, + "grad_norm": 21.515230898244056, + "learning_rate": 2.616110237664793e-08, + "logits/chosen": -3.515364408493042, + "logits/rejected": -3.5071632862091064, + "logps/chosen": -1.9086408615112305, + "logps/rejected": -2.1381278038024902, + "loss": 1.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.817281723022461, + "rewards/margins": 0.45897355675697327, + "rewards/rejected": -4.2762556076049805, + "step": 6230 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 23.679766185530138, + "learning_rate": 2.608599906498287e-08, + "logits/chosen": -3.400257110595703, + "logits/rejected": -3.383641004562378, + "logps/chosen": -1.9049596786499023, + "logps/rejected": -2.1673707962036133, + "loss": 1.5881, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.8099193572998047, + "rewards/margins": 0.5248219966888428, + "rewards/rejected": -4.334741592407227, + "step": 6240 + }, + { + "epoch": 1.07684355616816, + "grad_norm": 23.842781849181858, + "learning_rate": 2.6010885932544646e-08, + "logits/chosen": -3.4542598724365234, + "logits/rejected": -3.442675828933716, + "logps/chosen": -1.9199516773223877, + "logps/rejected": -2.0925049781799316, + "loss": 1.6732, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.8399033546447754, + "rewards/margins": 0.34510672092437744, + "rewards/rejected": -4.185009956359863, + "step": 6250 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 22.351211833840626, + "learning_rate": 2.59357636585871e-08, + "logits/chosen": -3.3841166496276855, + "logits/rejected": -3.3736701011657715, + "logps/chosen": -1.9004634618759155, + "logps/rejected": -2.0839884281158447, + "loss": 1.6407, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.800926923751831, + "rewards/margins": 0.36705002188682556, + "rewards/rejected": -4.1679768562316895, + "step": 6260 + }, + { + "epoch": 1.080289455547898, + "grad_norm": 20.458864322162952, + "learning_rate": 2.5860632922446733e-08, + "logits/chosen": -3.54130220413208, + "logits/rejected": -3.539015293121338, + "logps/chosen": -1.8605903387069702, + "logps/rejected": -2.0223145484924316, + "loss": 1.6877, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.7211806774139404, + "rewards/margins": 0.3234478533267975, + "rewards/rejected": -4.044629096984863, + "step": 6270 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 20.861743632499394, + "learning_rate": 2.578549440353659e-08, + "logits/chosen": -3.35361909866333, + "logits/rejected": -3.3385798931121826, + "logps/chosen": -1.8334972858428955, + "logps/rejected": -2.098980665206909, + "loss": 1.5023, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.666994571685791, + "rewards/margins": 0.5309672355651855, + "rewards/rejected": -4.197961330413818, + "step": 6280 + }, + { + "epoch": 1.083735354927636, + "grad_norm": 21.21722189777394, + "learning_rate": 2.5710348781340068e-08, + "logits/chosen": -3.4000906944274902, + "logits/rejected": -3.383531093597412, + "logps/chosen": -1.8272804021835327, + "logps/rejected": -2.2217767238616943, + "loss": 1.4163, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.6545608043670654, + "rewards/margins": 0.7889924049377441, + "rewards/rejected": -4.443553447723389, + "step": 6290 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 25.456240729536226, + "learning_rate": 2.5635196735404818e-08, + "logits/chosen": -3.447422742843628, + "logits/rejected": -3.4313158988952637, + "logps/chosen": -1.937988519668579, + "logps/rejected": -2.2520275115966797, + "loss": 1.556, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.875977039337158, + "rewards/margins": 0.6280778646469116, + "rewards/rejected": -4.504055023193359, + "step": 6300 + }, + { + "epoch": 1.0854583046175053, + "eval_logits/chosen": -3.50801944732666, + "eval_logits/rejected": -3.5043792724609375, + "eval_logps/chosen": -1.7653207778930664, + "eval_logps/rejected": -1.935317039489746, + "eval_loss": 1.6239352226257324, + "eval_rewards/accuracies": 0.6233736276626587, + "eval_rewards/chosen": -3.530641555786133, + "eval_rewards/margins": 0.33999258279800415, + "eval_rewards/rejected": -3.870634078979492, + "eval_runtime": 157.4397, + "eval_samples_per_second": 27.337, + "eval_steps_per_second": 3.417, + "step": 6300 + }, + { + "epoch": 1.0871812543073742, + "grad_norm": 26.659538314813936, + "learning_rate": 2.556003894533658e-08, + "logits/chosen": -3.4263978004455566, + "logits/rejected": -3.4090981483459473, + "logps/chosen": -1.8155367374420166, + "logps/rejected": -2.1240029335021973, + "loss": 1.482, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.631073474884033, + "rewards/margins": 0.6169317364692688, + "rewards/rejected": -4.2480058670043945, + "step": 6310 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 23.473408204425034, + "learning_rate": 2.548487609079305e-08, + "logits/chosen": -3.4062061309814453, + "logits/rejected": -3.393658399581909, + "logps/chosen": -1.8910404443740845, + "logps/rejected": -2.1283631324768066, + "loss": 1.5696, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.782080888748169, + "rewards/margins": 0.4746461510658264, + "rewards/rejected": -4.256726264953613, + "step": 6320 + }, + { + "epoch": 1.0906271536871124, + "grad_norm": 22.626320576081902, + "learning_rate": 2.5409708851477683e-08, + "logits/chosen": -3.4128944873809814, + "logits/rejected": -3.402402400970459, + "logps/chosen": -1.9138180017471313, + "logps/rejected": -2.305133819580078, + "loss": 1.4217, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.8276360034942627, + "rewards/margins": 0.782631516456604, + "rewards/rejected": -4.610267639160156, + "step": 6330 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 23.088150806322417, + "learning_rate": 2.533453790713363e-08, + "logits/chosen": -3.401547908782959, + "logits/rejected": -3.3910603523254395, + "logps/chosen": -1.8892333507537842, + "logps/rejected": -2.1121861934661865, + "loss": 1.5641, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7784667015075684, + "rewards/margins": 0.4459056258201599, + "rewards/rejected": -4.224372386932373, + "step": 6340 + }, + { + "epoch": 1.0940730530668505, + "grad_norm": 22.312477381681088, + "learning_rate": 2.5259363937537526e-08, + "logits/chosen": -3.4055087566375732, + "logits/rejected": -3.3999106884002686, + "logps/chosen": -1.9376161098480225, + "logps/rejected": -2.1454520225524902, + "loss": 1.6414, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.875232219696045, + "rewards/margins": 0.4156716465950012, + "rewards/rejected": -4.2909040451049805, + "step": 6350 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 25.64007761337631, + "learning_rate": 2.518418762249336e-08, + "logits/chosen": -3.4236416816711426, + "logits/rejected": -3.4210102558135986, + "logps/chosen": -1.9194122552871704, + "logps/rejected": -2.1161863803863525, + "loss": 1.6272, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.838824510574341, + "rewards/margins": 0.39354830980300903, + "rewards/rejected": -4.232372760772705, + "step": 6360 + }, + { + "epoch": 1.0975189524465885, + "grad_norm": 23.951048296000966, + "learning_rate": 2.5109009641826344e-08, + "logits/chosen": -3.4351603984832764, + "logits/rejected": -3.433767795562744, + "logps/chosen": -1.8643081188201904, + "logps/rejected": -1.9683082103729248, + "loss": 1.7622, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.728616237640381, + "rewards/margins": 0.20800037682056427, + "rewards/rejected": -3.9366164207458496, + "step": 6370 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 26.762398247208974, + "learning_rate": 2.5033830675376744e-08, + "logits/chosen": -3.45196270942688, + "logits/rejected": -3.446587085723877, + "logps/chosen": -1.930763602256775, + "logps/rejected": -2.1500697135925293, + "loss": 1.5821, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.86152720451355, + "rewards/margins": 0.4386124014854431, + "rewards/rejected": -4.300139427185059, + "step": 6380 + }, + { + "epoch": 1.1009648518263266, + "grad_norm": 23.30529468718343, + "learning_rate": 2.4958651402993735e-08, + "logits/chosen": -3.480130672454834, + "logits/rejected": -3.4615283012390137, + "logps/chosen": -1.8706308603286743, + "logps/rejected": -2.1229002475738525, + "loss": 1.5593, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7412617206573486, + "rewards/margins": 0.5045387148857117, + "rewards/rejected": -4.245800495147705, + "step": 6390 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 26.37719522865988, + "learning_rate": 2.4883472504529286e-08, + "logits/chosen": -3.4308059215545654, + "logits/rejected": -3.4186038970947266, + "logps/chosen": -1.9104280471801758, + "logps/rejected": -2.1576361656188965, + "loss": 1.6222, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8208560943603516, + "rewards/margins": 0.49441656470298767, + "rewards/rejected": -4.315272331237793, + "step": 6400 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -3.492280960083008, + "eval_logits/rejected": -3.48857045173645, + "eval_logps/chosen": -1.7684823274612427, + "eval_logps/rejected": -1.9389570951461792, + "eval_loss": 1.623610019683838, + "eval_rewards/accuracies": 0.6240706443786621, + "eval_rewards/chosen": -3.5369646549224854, + "eval_rewards/margins": 0.3409496247768402, + "eval_rewards/rejected": -3.8779141902923584, + "eval_runtime": 157.671, + "eval_samples_per_second": 27.297, + "eval_steps_per_second": 3.412, + "step": 6400 + }, + { + "epoch": 1.1044107512060648, + "grad_norm": 26.481768786151584, + "learning_rate": 2.4808294659831937e-08, + "logits/chosen": -3.5006306171417236, + "logits/rejected": -3.4863364696502686, + "logps/chosen": -1.8835376501083374, + "logps/rejected": -2.121093988418579, + "loss": 1.5345, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.767075300216675, + "rewards/margins": 0.47511330246925354, + "rewards/rejected": -4.242187976837158, + "step": 6410 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 29.5264721336175, + "learning_rate": 2.473311854874075e-08, + "logits/chosen": -3.45835542678833, + "logits/rejected": -3.4507765769958496, + "logps/chosen": -1.9326547384262085, + "logps/rejected": -2.155423641204834, + "loss": 1.6372, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.865309476852417, + "rewards/margins": 0.44553714990615845, + "rewards/rejected": -4.310847282409668, + "step": 6420 + }, + { + "epoch": 1.107856650585803, + "grad_norm": 21.958034263611975, + "learning_rate": 2.4657944851079078e-08, + "logits/chosen": -3.4142768383026123, + "logits/rejected": -3.4055047035217285, + "logps/chosen": -1.8407742977142334, + "logps/rejected": -2.0247392654418945, + "loss": 1.6929, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.681548595428467, + "rewards/margins": 0.3679301142692566, + "rewards/rejected": -4.049478530883789, + "step": 6430 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 24.330616550508303, + "learning_rate": 2.4582774246648447e-08, + "logits/chosen": -3.4002785682678223, + "logits/rejected": -3.3899548053741455, + "logps/chosen": -1.9636898040771484, + "logps/rejected": -2.158742666244507, + "loss": 1.6566, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.927379608154297, + "rewards/margins": 0.3901059329509735, + "rewards/rejected": -4.317485332489014, + "step": 6440 + }, + { + "epoch": 1.111302549965541, + "grad_norm": 20.15906408424054, + "learning_rate": 2.4507607415222437e-08, + "logits/chosen": -3.411196231842041, + "logits/rejected": -3.3931374549865723, + "logps/chosen": -1.913351058959961, + "logps/rejected": -2.1704511642456055, + "loss": 1.5516, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.826702117919922, + "rewards/margins": 0.5142003893852234, + "rewards/rejected": -4.340902328491211, + "step": 6450 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 26.95121833936693, + "learning_rate": 2.443244503654047e-08, + "logits/chosen": -3.406850814819336, + "logits/rejected": -3.4077370166778564, + "logps/chosen": -1.8686326742172241, + "logps/rejected": -2.0990982055664062, + "loss": 1.5489, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.7372653484344482, + "rewards/margins": 0.4609312117099762, + "rewards/rejected": -4.1981964111328125, + "step": 6460 + }, + { + "epoch": 1.114748449345279, + "grad_norm": 24.119810584886398, + "learning_rate": 2.4357287790301757e-08, + "logits/chosen": -3.3775393962860107, + "logits/rejected": -3.3662426471710205, + "logps/chosen": -1.8917232751846313, + "logps/rejected": -2.1614763736724854, + "loss": 1.5402, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.7834465503692627, + "rewards/margins": 0.5395061373710632, + "rewards/rejected": -4.322952747344971, + "step": 6470 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 25.105745305141653, + "learning_rate": 2.4282136356159026e-08, + "logits/chosen": -3.433812379837036, + "logits/rejected": -3.41306734085083, + "logps/chosen": -1.8526582717895508, + "logps/rejected": -2.1101412773132324, + "loss": 1.6047, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.7053165435791016, + "rewards/margins": 0.514965832233429, + "rewards/rejected": -4.220282554626465, + "step": 6480 + }, + { + "epoch": 1.1181943487250172, + "grad_norm": 23.49594314800217, + "learning_rate": 2.4206991413712514e-08, + "logits/chosen": -3.532194137573242, + "logits/rejected": -3.524559736251831, + "logps/chosen": -1.887905478477478, + "logps/rejected": -2.0991828441619873, + "loss": 1.5873, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.775810956954956, + "rewards/margins": 0.4225550591945648, + "rewards/rejected": -4.198365688323975, + "step": 6490 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 24.971302098408405, + "learning_rate": 2.4131853642503697e-08, + "logits/chosen": -3.428556442260742, + "logits/rejected": -3.422470808029175, + "logps/chosen": -1.9920654296875, + "logps/rejected": -2.076946496963501, + "loss": 1.807, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.984130859375, + "rewards/margins": 0.16976282000541687, + "rewards/rejected": -4.153892993927002, + "step": 6500 + }, + { + "epoch": 1.1199172984148862, + "eval_logits/chosen": -3.485499143600464, + "eval_logits/rejected": -3.4817657470703125, + "eval_logps/chosen": -1.7731761932373047, + "eval_logps/rejected": -1.9441922903060913, + "eval_loss": 1.6233266592025757, + "eval_rewards/accuracies": 0.6233736276626587, + "eval_rewards/chosen": -3.5463523864746094, + "eval_rewards/margins": 0.34203246235847473, + "eval_rewards/rejected": -3.8883845806121826, + "eval_runtime": 157.6842, + "eval_samples_per_second": 27.295, + "eval_steps_per_second": 3.412, + "step": 6500 + }, + { + "epoch": 1.1216402481047554, + "grad_norm": 25.757149037133075, + "learning_rate": 2.4056723722009246e-08, + "logits/chosen": -3.449256420135498, + "logits/rejected": -3.4242050647735596, + "logps/chosen": -1.8602228164672852, + "logps/rejected": -2.1611621379852295, + "loss": 1.4584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.7204456329345703, + "rewards/margins": 0.6018784046173096, + "rewards/rejected": -4.322324275970459, + "step": 6510 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 25.001497722511658, + "learning_rate": 2.3981602331634804e-08, + "logits/chosen": -3.406628131866455, + "logits/rejected": -3.392561674118042, + "logps/chosen": -1.8334674835205078, + "logps/rejected": -2.1209278106689453, + "loss": 1.4867, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6669349670410156, + "rewards/margins": 0.5749204754829407, + "rewards/rejected": -4.241855621337891, + "step": 6520 + }, + { + "epoch": 1.1250861474844935, + "grad_norm": 22.05246045246644, + "learning_rate": 2.3906490150708893e-08, + "logits/chosen": -3.3990931510925293, + "logits/rejected": -3.383315324783325, + "logps/chosen": -1.8670612573623657, + "logps/rejected": -2.1215057373046875, + "loss": 1.539, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.7341225147247314, + "rewards/margins": 0.5088890790939331, + "rewards/rejected": -4.243011474609375, + "step": 6530 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 25.305953172604923, + "learning_rate": 2.383138785847674e-08, + "logits/chosen": -3.440575122833252, + "logits/rejected": -3.4270176887512207, + "logps/chosen": -1.8891582489013672, + "logps/rejected": -2.0730395317077637, + "loss": 1.6515, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7783164978027344, + "rewards/margins": 0.367762953042984, + "rewards/rejected": -4.146079063415527, + "step": 6540 + }, + { + "epoch": 1.1285320468642315, + "grad_norm": 26.449837659021505, + "learning_rate": 2.3756296134094176e-08, + "logits/chosen": -3.3629696369171143, + "logits/rejected": -3.351616621017456, + "logps/chosen": -1.950439453125, + "logps/rejected": -2.241163730621338, + "loss": 1.5233, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.90087890625, + "rewards/margins": 0.5814481973648071, + "rewards/rejected": -4.482327461242676, + "step": 6550 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 22.557453260788375, + "learning_rate": 2.368121565662142e-08, + "logits/chosen": -3.4474921226501465, + "logits/rejected": -3.424931049346924, + "logps/chosen": -1.8617607355117798, + "logps/rejected": -2.1908140182495117, + "loss": 1.4659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7235214710235596, + "rewards/margins": 0.6581061482429504, + "rewards/rejected": -4.381628036499023, + "step": 6560 + }, + { + "epoch": 1.1319779462439696, + "grad_norm": 26.255210300960098, + "learning_rate": 2.3606147105017038e-08, + "logits/chosen": -3.4377987384796143, + "logits/rejected": -3.422377109527588, + "logps/chosen": -1.9296767711639404, + "logps/rejected": -2.224959135055542, + "loss": 1.49, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.859353542327881, + "rewards/margins": 0.5905646085739136, + "rewards/rejected": -4.449918270111084, + "step": 6570 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 19.461475912036008, + "learning_rate": 2.35310911581317e-08, + "logits/chosen": -3.4443840980529785, + "logits/rejected": -3.425375461578369, + "logps/chosen": -1.8490747213363647, + "logps/rejected": -2.08895206451416, + "loss": 1.6048, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6981494426727295, + "rewards/margins": 0.4797547459602356, + "rewards/rejected": -4.17790412902832, + "step": 6580 + }, + { + "epoch": 1.1354238456237078, + "grad_norm": 21.635361750907, + "learning_rate": 2.3456048494702132e-08, + "logits/chosen": -3.4119491577148438, + "logits/rejected": -3.400381088256836, + "logps/chosen": -1.8835827112197876, + "logps/rejected": -2.143942356109619, + "loss": 1.5218, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.767165422439575, + "rewards/margins": 0.5207185745239258, + "rewards/rejected": -4.287884712219238, + "step": 6590 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 31.618565683779003, + "learning_rate": 2.3381019793344898e-08, + "logits/chosen": -3.4728996753692627, + "logits/rejected": -3.4640707969665527, + "logps/chosen": -1.9594961404800415, + "logps/rejected": -2.0875051021575928, + "loss": 1.746, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.918992280960083, + "rewards/margins": 0.25601816177368164, + "rewards/rejected": -4.1750102043151855, + "step": 6600 + }, + { + "epoch": 1.1371467953135768, + "eval_logits/chosen": -3.4891669750213623, + "eval_logits/rejected": -3.4854750633239746, + "eval_logps/chosen": -1.7765134572982788, + "eval_logps/rejected": -1.9479124546051025, + "eval_loss": 1.6231111288070679, + "eval_rewards/accuracies": 0.624535322189331, + "eval_rewards/chosen": -3.5530269145965576, + "eval_rewards/margins": 0.34279850125312805, + "eval_rewards/rejected": -3.895824909210205, + "eval_runtime": 157.4575, + "eval_samples_per_second": 27.334, + "eval_steps_per_second": 3.417, + "step": 6600 + }, + { + "epoch": 1.138869745003446, + "grad_norm": 22.25400532984802, + "learning_rate": 2.330600573255034e-08, + "logits/chosen": -3.4429943561553955, + "logits/rejected": -3.4320926666259766, + "logps/chosen": -1.8756192922592163, + "logps/rejected": -2.1362249851226807, + "loss": 1.5043, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.7512385845184326, + "rewards/margins": 0.5212109684944153, + "rewards/rejected": -4.272449970245361, + "step": 6610 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 28.445031366674645, + "learning_rate": 2.3231006990676365e-08, + "logits/chosen": -3.4300742149353027, + "logits/rejected": -3.4185516834259033, + "logps/chosen": -1.9972951412200928, + "logps/rejected": -2.216991662979126, + "loss": 1.6157, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.9945902824401855, + "rewards/margins": 0.43939271569252014, + "rewards/rejected": -4.433983325958252, + "step": 6620 + }, + { + "epoch": 1.1423156443831841, + "grad_norm": 22.963683703172627, + "learning_rate": 2.3156024245942392e-08, + "logits/chosen": -3.4114105701446533, + "logits/rejected": -3.396883487701416, + "logps/chosen": -1.8753890991210938, + "logps/rejected": -2.051189661026001, + "loss": 1.6332, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.7507781982421875, + "rewards/margins": 0.3516008257865906, + "rewards/rejected": -4.102379322052002, + "step": 6630 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 23.87493046738113, + "learning_rate": 2.3081058176423148e-08, + "logits/chosen": -3.4403011798858643, + "logits/rejected": -3.420198917388916, + "logps/chosen": -1.9041345119476318, + "logps/rejected": -2.259666919708252, + "loss": 1.4307, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.8082690238952637, + "rewards/margins": 0.7110651731491089, + "rewards/rejected": -4.519333839416504, + "step": 6640 + }, + { + "epoch": 1.145761543762922, + "grad_norm": 27.54338433561203, + "learning_rate": 2.3006109460042562e-08, + "logits/chosen": -3.461383819580078, + "logits/rejected": -3.4505279064178467, + "logps/chosen": -1.928259253501892, + "logps/rejected": -2.225426435470581, + "loss": 1.5255, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.856518507003784, + "rewards/margins": 0.5943342447280884, + "rewards/rejected": -4.450852870941162, + "step": 6650 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 22.828594312406647, + "learning_rate": 2.293117877456766e-08, + "logits/chosen": -3.4791464805603027, + "logits/rejected": -3.4677276611328125, + "logps/chosen": -1.8202745914459229, + "logps/rejected": -2.139392375946045, + "loss": 1.4487, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.6405491828918457, + "rewards/margins": 0.6382359862327576, + "rewards/rejected": -4.27878475189209, + "step": 6660 + }, + { + "epoch": 1.1492074431426602, + "grad_norm": 24.413380732971667, + "learning_rate": 2.2856266797602393e-08, + "logits/chosen": -3.416748046875, + "logits/rejected": -3.4172587394714355, + "logps/chosen": -1.916229248046875, + "logps/rejected": -2.188417673110962, + "loss": 1.5022, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.83245849609375, + "rewards/margins": 0.5443769693374634, + "rewards/rejected": -4.376835346221924, + "step": 6670 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 24.254526358313736, + "learning_rate": 2.2781374206581543e-08, + "logits/chosen": -3.4309706687927246, + "logits/rejected": -3.4124674797058105, + "logps/chosen": -1.915658950805664, + "logps/rejected": -2.253605604171753, + "loss": 1.4667, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.831317901611328, + "rewards/margins": 0.6758934259414673, + "rewards/rejected": -4.507211208343506, + "step": 6680 + }, + { + "epoch": 1.1526533425223984, + "grad_norm": 29.700040847656194, + "learning_rate": 2.2706501678764558e-08, + "logits/chosen": -3.4105143547058105, + "logits/rejected": -3.3985724449157715, + "logps/chosen": -1.9789812564849854, + "logps/rejected": -2.2342097759246826, + "loss": 1.5656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9579625129699707, + "rewards/margins": 0.5104572176933289, + "rewards/rejected": -4.468419551849365, + "step": 6690 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 26.076961346851398, + "learning_rate": 2.26316498912295e-08, + "logits/chosen": -3.4311022758483887, + "logits/rejected": -3.4240036010742188, + "logps/chosen": -1.9232776165008545, + "logps/rejected": -2.2123000621795654, + "loss": 1.5871, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.846555233001709, + "rewards/margins": 0.5780445337295532, + "rewards/rejected": -4.424600124359131, + "step": 6700 + }, + { + "epoch": 1.1543762922122673, + "eval_logits/chosen": -3.485368251800537, + "eval_logits/rejected": -3.4816720485687256, + "eval_logps/chosen": -1.77727210521698, + "eval_logps/rejected": -1.9489160776138306, + "eval_loss": 1.6229164600372314, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.55454421043396, + "eval_rewards/margins": 0.3432879149913788, + "eval_rewards/rejected": -3.897832155227661, + "eval_runtime": 157.458, + "eval_samples_per_second": 27.334, + "eval_steps_per_second": 3.417, + "step": 6700 + }, + { + "epoch": 1.1560992419021365, + "grad_norm": 24.015615147071895, + "learning_rate": 2.2556819520866827e-08, + "logits/chosen": -3.422043561935425, + "logits/rejected": -3.4073777198791504, + "logps/chosen": -1.8918098211288452, + "logps/rejected": -2.155973196029663, + "loss": 1.5911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.7836196422576904, + "rewards/margins": 0.5283266305923462, + "rewards/rejected": -4.311946392059326, + "step": 6710 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 25.379997143851515, + "learning_rate": 2.2482011244373356e-08, + "logits/chosen": -3.4165942668914795, + "logits/rejected": -3.4128177165985107, + "logps/chosen": -1.8420839309692383, + "logps/rejected": -2.080439805984497, + "loss": 1.5742, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.6841678619384766, + "rewards/margins": 0.4767116904258728, + "rewards/rejected": -4.160879611968994, + "step": 6720 + }, + { + "epoch": 1.1595451412818747, + "grad_norm": 24.323849455174184, + "learning_rate": 2.2407225738246073e-08, + "logits/chosen": -3.3932223320007324, + "logits/rejected": -3.3844313621520996, + "logps/chosen": -1.926422357559204, + "logps/rejected": -2.1044890880584717, + "loss": 1.6821, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.852844715118408, + "rewards/margins": 0.3561330735683441, + "rewards/rejected": -4.208978176116943, + "step": 6730 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 25.936669370657253, + "learning_rate": 2.233246367877609e-08, + "logits/chosen": -3.4416210651397705, + "logits/rejected": -3.4372668266296387, + "logps/chosen": -1.9254367351531982, + "logps/rejected": -2.1106340885162354, + "loss": 1.6676, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.8508734703063965, + "rewards/margins": 0.37039458751678467, + "rewards/rejected": -4.221268177032471, + "step": 6740 + }, + { + "epoch": 1.1629910406616126, + "grad_norm": 25.517256048533326, + "learning_rate": 2.2257725742042437e-08, + "logits/chosen": -3.458127498626709, + "logits/rejected": -3.4507155418395996, + "logps/chosen": -1.9434999227523804, + "logps/rejected": -2.208829164505005, + "loss": 1.5668, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.8869998455047607, + "rewards/margins": 0.5306581258773804, + "rewards/rejected": -4.41765832901001, + "step": 6750 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 28.07815704054573, + "learning_rate": 2.2183012603906064e-08, + "logits/chosen": -3.4188218116760254, + "logits/rejected": -3.404370069503784, + "logps/chosen": -1.8502967357635498, + "logps/rejected": -2.043362855911255, + "loss": 1.6382, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7005934715270996, + "rewards/margins": 0.3861318528652191, + "rewards/rejected": -4.08672571182251, + "step": 6760 + }, + { + "epoch": 1.1664369400413508, + "grad_norm": 27.150353045008107, + "learning_rate": 2.2108324940003607e-08, + "logits/chosen": -3.428650379180908, + "logits/rejected": -3.425046920776367, + "logps/chosen": -1.8983867168426514, + "logps/rejected": -2.139829635620117, + "loss": 1.5795, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.7967734336853027, + "rewards/margins": 0.4828854501247406, + "rewards/rejected": -4.279659271240234, + "step": 6770 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 27.20576438943625, + "learning_rate": 2.2033663425741377e-08, + "logits/chosen": -3.4347453117370605, + "logits/rejected": -3.4195542335510254, + "logps/chosen": -1.9862568378448486, + "logps/rejected": -2.1624908447265625, + "loss": 1.6724, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.9725136756896973, + "rewards/margins": 0.3524686396121979, + "rewards/rejected": -4.324981689453125, + "step": 6780 + }, + { + "epoch": 1.169882839421089, + "grad_norm": 23.779610359609993, + "learning_rate": 2.1959028736289184e-08, + "logits/chosen": -3.4271883964538574, + "logits/rejected": -3.4186477661132812, + "logps/chosen": -1.949845552444458, + "logps/rejected": -2.26304030418396, + "loss": 1.4985, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.899691104888916, + "rewards/margins": 0.6263898015022278, + "rewards/rejected": -4.52608060836792, + "step": 6790 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 21.227110612424173, + "learning_rate": 2.1884421546574288e-08, + "logits/chosen": -3.3856475353240967, + "logits/rejected": -3.3704590797424316, + "logps/chosen": -1.8847612142562866, + "logps/rejected": -2.1443991661071777, + "loss": 1.5459, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.7695224285125732, + "rewards/margins": 0.5192757844924927, + "rewards/rejected": -4.2887983322143555, + "step": 6800 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -3.48073148727417, + "eval_logits/rejected": -3.4770267009735107, + "eval_logps/chosen": -1.7835712432861328, + "eval_logps/rejected": -1.9558632373809814, + "eval_loss": 1.6222071647644043, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.5671424865722656, + "eval_rewards/margins": 0.34458374977111816, + "eval_rewards/rejected": -3.911726474761963, + "eval_runtime": 157.732, + "eval_samples_per_second": 27.287, + "eval_steps_per_second": 3.411, + "step": 6800 + }, + { + "epoch": 1.173328738800827, + "grad_norm": 25.11601471763464, + "learning_rate": 2.180984253127523e-08, + "logits/chosen": -3.422598361968994, + "logits/rejected": -3.4122977256774902, + "logps/chosen": -1.9278514385223389, + "logps/rejected": -2.0730233192443848, + "loss": 1.6969, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.8557028770446777, + "rewards/margins": 0.2903437614440918, + "rewards/rejected": -4.1460466384887695, + "step": 6810 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 22.75272210439096, + "learning_rate": 2.173529236481581e-08, + "logits/chosen": -3.468090772628784, + "logits/rejected": -3.451240062713623, + "logps/chosen": -1.917641282081604, + "logps/rejected": -2.2519688606262207, + "loss": 1.4514, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.835282564163208, + "rewards/margins": 0.6686547994613647, + "rewards/rejected": -4.503937721252441, + "step": 6820 + }, + { + "epoch": 1.176774638180565, + "grad_norm": 30.112147234687846, + "learning_rate": 2.1660771721358898e-08, + "logits/chosen": -3.4728915691375732, + "logits/rejected": -3.4671969413757324, + "logps/chosen": -1.9501558542251587, + "logps/rejected": -2.1013197898864746, + "loss": 1.7078, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.9003117084503174, + "rewards/margins": 0.3023281693458557, + "rewards/rejected": -4.202639579772949, + "step": 6830 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 21.996366488539635, + "learning_rate": 2.1586281274800433e-08, + "logits/chosen": -3.4565796852111816, + "logits/rejected": -3.4434902667999268, + "logps/chosen": -1.8902994394302368, + "logps/rejected": -2.109687328338623, + "loss": 1.563, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.7805988788604736, + "rewards/margins": 0.43877601623535156, + "rewards/rejected": -4.219374656677246, + "step": 6840 + }, + { + "epoch": 1.1802205375603032, + "grad_norm": 25.22751878758884, + "learning_rate": 2.1511821698763248e-08, + "logits/chosen": -3.3564560413360596, + "logits/rejected": -3.341444492340088, + "logps/chosen": -1.907942533493042, + "logps/rejected": -2.2341294288635254, + "loss": 1.4624, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.815885066986084, + "rewards/margins": 0.6523740887641907, + "rewards/rejected": -4.468258857727051, + "step": 6850 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 26.155419710244, + "learning_rate": 2.143739366659102e-08, + "logits/chosen": -3.4247093200683594, + "logits/rejected": -3.4043967723846436, + "logps/chosen": -1.9265403747558594, + "logps/rejected": -2.1271448135375977, + "loss": 1.6112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.8530807495117188, + "rewards/margins": 0.4012090563774109, + "rewards/rejected": -4.254289627075195, + "step": 6860 + }, + { + "epoch": 1.1836664369400414, + "grad_norm": 26.16785599900131, + "learning_rate": 2.1362997851342184e-08, + "logits/chosen": -3.392116069793701, + "logits/rejected": -3.3862385749816895, + "logps/chosen": -2.0076956748962402, + "logps/rejected": -2.175105333328247, + "loss": 1.6983, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.0153913497924805, + "rewards/margins": 0.33481907844543457, + "rewards/rejected": -4.350210666656494, + "step": 6870 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 25.135004012430752, + "learning_rate": 2.1288634925783816e-08, + "logits/chosen": -3.4581406116485596, + "logits/rejected": -3.441523790359497, + "logps/chosen": -1.9023135900497437, + "logps/rejected": -2.1638169288635254, + "loss": 1.5768, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.8046271800994873, + "rewards/margins": 0.5230072140693665, + "rewards/rejected": -4.327633857727051, + "step": 6880 + }, + { + "epoch": 1.1871123363197795, + "grad_norm": 23.716114414134516, + "learning_rate": 2.1214305562385588e-08, + "logits/chosen": -3.396627426147461, + "logits/rejected": -3.3821895122528076, + "logps/chosen": -1.8179162740707397, + "logps/rejected": -2.167005777359009, + "loss": 1.4699, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.6358325481414795, + "rewards/margins": 0.6981797814369202, + "rewards/rejected": -4.334011554718018, + "step": 6890 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 27.11339078150157, + "learning_rate": 2.1140010433313643e-08, + "logits/chosen": -3.4456264972686768, + "logits/rejected": -3.4342048168182373, + "logps/chosen": -1.8990799188613892, + "logps/rejected": -2.141753911972046, + "loss": 1.5606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.7981598377227783, + "rewards/margins": 0.4853484034538269, + "rewards/rejected": -4.283507823944092, + "step": 6900 + }, + { + "epoch": 1.1888352860096485, + "eval_logits/chosen": -3.488023042678833, + "eval_logits/rejected": -3.4843647480010986, + "eval_logps/chosen": -1.7879215478897095, + "eval_logps/rejected": -1.9602603912353516, + "eval_loss": 1.6225370168685913, + "eval_rewards/accuracies": 0.6236059665679932, + "eval_rewards/chosen": -3.575843095779419, + "eval_rewards/margins": 0.3446771800518036, + "eval_rewards/rejected": -3.920520782470703, + "eval_runtime": 157.7163, + "eval_samples_per_second": 27.289, + "eval_steps_per_second": 3.411, + "step": 6900 + }, + { + "epoch": 1.1905582356995175, + "grad_norm": 31.14047424180216, + "learning_rate": 2.106575021042457e-08, + "logits/chosen": -3.455052614212036, + "logits/rejected": -3.443563461303711, + "logps/chosen": -1.9286575317382812, + "logps/rejected": -2.199587821960449, + "loss": 1.5279, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.8573150634765625, + "rewards/margins": 0.5418601036071777, + "rewards/rejected": -4.399175643920898, + "step": 6910 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 34.599301286520586, + "learning_rate": 2.099152556525926e-08, + "logits/chosen": -3.4898228645324707, + "logits/rejected": -3.4740116596221924, + "logps/chosen": -1.9430936574935913, + "logps/rejected": -2.143200159072876, + "loss": 1.6365, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.8861873149871826, + "rewards/margins": 0.40021243691444397, + "rewards/rejected": -4.286400318145752, + "step": 6920 + }, + { + "epoch": 1.1940041350792556, + "grad_norm": 27.411021710401464, + "learning_rate": 2.0917337169036925e-08, + "logits/chosen": -3.374633312225342, + "logits/rejected": -3.3623108863830566, + "logps/chosen": -1.8794822692871094, + "logps/rejected": -2.143115758895874, + "loss": 1.5461, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7589645385742188, + "rewards/margins": 0.5272667407989502, + "rewards/rejected": -4.286231517791748, + "step": 6930 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 28.760239002091282, + "learning_rate": 2.0843185692648913e-08, + "logits/chosen": -3.3728859424591064, + "logits/rejected": -3.3476004600524902, + "logps/chosen": -1.9603859186172485, + "logps/rejected": -2.253317356109619, + "loss": 1.5094, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.920771837234497, + "rewards/margins": 0.5858623385429382, + "rewards/rejected": -4.506634712219238, + "step": 6940 + }, + { + "epoch": 1.1974500344589938, + "grad_norm": 22.693599869751246, + "learning_rate": 2.076907180665276e-08, + "logits/chosen": -3.4144721031188965, + "logits/rejected": -3.404813289642334, + "logps/chosen": -1.8786613941192627, + "logps/rejected": -2.133204936981201, + "loss": 1.5841, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.7573227882385254, + "rewards/margins": 0.5090863704681396, + "rewards/rejected": -4.266409873962402, + "step": 6950 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 28.422836412488593, + "learning_rate": 2.0694996181266027e-08, + "logits/chosen": -3.512488842010498, + "logits/rejected": -3.490853786468506, + "logps/chosen": -2.0001771450042725, + "logps/rejected": -2.2146267890930176, + "loss": 1.613, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.000354290008545, + "rewards/margins": 0.4288991391658783, + "rewards/rejected": -4.429253578186035, + "step": 6960 + }, + { + "epoch": 1.200895933838732, + "grad_norm": 23.01232277580447, + "learning_rate": 2.0620959486360313e-08, + "logits/chosen": -3.461475372314453, + "logits/rejected": -3.443598508834839, + "logps/chosen": -1.865735411643982, + "logps/rejected": -2.0902888774871826, + "loss": 1.5913, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.731470823287964, + "rewards/margins": 0.44910717010498047, + "rewards/rejected": -4.180577754974365, + "step": 6970 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 22.109953638862066, + "learning_rate": 2.0546962391455128e-08, + "logits/chosen": -3.393242597579956, + "logits/rejected": -3.3779423236846924, + "logps/chosen": -1.9013702869415283, + "logps/rejected": -2.1755242347717285, + "loss": 1.51, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8027405738830566, + "rewards/margins": 0.5483077168464661, + "rewards/rejected": -4.351048469543457, + "step": 6980 + }, + { + "epoch": 1.20434183321847, + "grad_norm": 23.382822495472677, + "learning_rate": 2.0473005565711924e-08, + "logits/chosen": -3.3502793312072754, + "logits/rejected": -3.344498872756958, + "logps/chosen": -2.007148027420044, + "logps/rejected": -2.144912004470825, + "loss": 1.7142, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.014296054840088, + "rewards/margins": 0.2755277156829834, + "rewards/rejected": -4.28982400894165, + "step": 6990 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 28.878151604665096, + "learning_rate": 2.039908967792795e-08, + "logits/chosen": -3.5284423828125, + "logits/rejected": -3.514420986175537, + "logps/chosen": -2.017059087753296, + "logps/rejected": -2.2914133071899414, + "loss": 1.5876, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.034118175506592, + "rewards/margins": 0.5487080812454224, + "rewards/rejected": -4.582826614379883, + "step": 7000 + }, + { + "epoch": 1.206064782908339, + "eval_logits/chosen": -3.4769036769866943, + "eval_logits/rejected": -3.4732091426849365, + "eval_logps/chosen": -1.792240858078003, + "eval_logps/rejected": -1.965267539024353, + "eval_loss": 1.621833324432373, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.584481716156006, + "eval_rewards/margins": 0.3460537791252136, + "eval_rewards/rejected": -3.930535078048706, + "eval_runtime": 157.6467, + "eval_samples_per_second": 27.302, + "eval_steps_per_second": 3.413, + "step": 7000 + }, + { + "epoch": 1.207787732598208, + "grad_norm": 24.097126915678892, + "learning_rate": 2.0325215396530286e-08, + "logits/chosen": -3.4171881675720215, + "logits/rejected": -3.402376174926758, + "logps/chosen": -1.973687767982483, + "logps/rejected": -2.1972060203552246, + "loss": 1.6074, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.947375535964966, + "rewards/margins": 0.4470368027687073, + "rewards/rejected": -4.394412040710449, + "step": 7010 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 21.500704748057913, + "learning_rate": 2.025138338956974e-08, + "logits/chosen": -3.416609287261963, + "logits/rejected": -3.402470350265503, + "logps/chosen": -1.9237077236175537, + "logps/rejected": -2.2703490257263184, + "loss": 1.4767, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.8474154472351074, + "rewards/margins": 0.6932830214500427, + "rewards/rejected": -4.540698051452637, + "step": 7020 + }, + { + "epoch": 1.2112336319779462, + "grad_norm": 22.128101060466896, + "learning_rate": 2.0177594324714838e-08, + "logits/chosen": -3.4267895221710205, + "logits/rejected": -3.4196677207946777, + "logps/chosen": -1.915442705154419, + "logps/rejected": -2.1596169471740723, + "loss": 1.5726, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.830885410308838, + "rewards/margins": 0.4883483350276947, + "rewards/rejected": -4.3192338943481445, + "step": 7030 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 23.794962214892053, + "learning_rate": 2.0103848869245765e-08, + "logits/chosen": -3.3892951011657715, + "logits/rejected": -3.3781750202178955, + "logps/chosen": -1.9256702661514282, + "logps/rejected": -2.191035270690918, + "loss": 1.5283, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8513405323028564, + "rewards/margins": 0.5307302474975586, + "rewards/rejected": -4.382070541381836, + "step": 7040 + }, + { + "epoch": 1.2146795313576844, + "grad_norm": 25.250022085313393, + "learning_rate": 2.0030147690048372e-08, + "logits/chosen": -3.379009246826172, + "logits/rejected": -3.3662917613983154, + "logps/chosen": -1.8982080221176147, + "logps/rejected": -2.1766180992126465, + "loss": 1.5242, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.7964160442352295, + "rewards/margins": 0.5568206906318665, + "rewards/rejected": -4.353236198425293, + "step": 7050 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 29.62631957214788, + "learning_rate": 1.995649145360809e-08, + "logits/chosen": -3.449963331222534, + "logits/rejected": -3.4420857429504395, + "logps/chosen": -2.019956111907959, + "logps/rejected": -2.245516061782837, + "loss": 1.6465, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.039912223815918, + "rewards/margins": 0.4511200785636902, + "rewards/rejected": -4.491032123565674, + "step": 7060 + }, + { + "epoch": 1.2181254307374225, + "grad_norm": 23.985224027210187, + "learning_rate": 1.988288082600392e-08, + "logits/chosen": -3.4297282695770264, + "logits/rejected": -3.4130959510803223, + "logps/chosen": -1.9000587463378906, + "logps/rejected": -2.190361976623535, + "loss": 1.612, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.8001174926757812, + "rewards/margins": 0.5806065797805786, + "rewards/rejected": -4.38072395324707, + "step": 7070 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 22.466763379393644, + "learning_rate": 1.980931647290246e-08, + "logits/chosen": -3.4621529579162598, + "logits/rejected": -3.446842670440674, + "logps/chosen": -1.9374840259552002, + "logps/rejected": -2.28418231010437, + "loss": 1.4312, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.8749680519104004, + "rewards/margins": 0.6933965086936951, + "rewards/rejected": -4.56836462020874, + "step": 7080 + }, + { + "epoch": 1.2215713301171607, + "grad_norm": 32.36656977239231, + "learning_rate": 1.97357990595518e-08, + "logits/chosen": -3.4793872833251953, + "logits/rejected": -3.4739081859588623, + "logps/chosen": -2.0465245246887207, + "logps/rejected": -2.290170192718506, + "loss": 1.6671, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.093049049377441, + "rewards/margins": 0.4872911870479584, + "rewards/rejected": -4.580340385437012, + "step": 7090 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 21.439760503753135, + "learning_rate": 1.9662329250775585e-08, + "logits/chosen": -3.3986358642578125, + "logits/rejected": -3.3883895874023438, + "logps/chosen": -1.9401174783706665, + "logps/rejected": -2.1346325874328613, + "loss": 1.6316, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.880234956741333, + "rewards/margins": 0.38902994990348816, + "rewards/rejected": -4.269265174865723, + "step": 7100 + }, + { + "epoch": 1.2232942798070296, + "eval_logits/chosen": -3.4777565002441406, + "eval_logits/rejected": -3.4740800857543945, + "eval_logps/chosen": -1.7912368774414062, + "eval_logps/rejected": -1.9647005796432495, + "eval_loss": 1.6215013265609741, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.5824737548828125, + "eval_rewards/margins": 0.34692734479904175, + "eval_rewards/rejected": -3.929401159286499, + "eval_runtime": 157.5962, + "eval_samples_per_second": 27.31, + "eval_steps_per_second": 3.414, + "step": 7100 + }, + { + "epoch": 1.2250172294968986, + "grad_norm": 21.254663789563867, + "learning_rate": 1.9588907710966942e-08, + "logits/chosen": -3.430662155151367, + "logits/rejected": -3.410324811935425, + "logps/chosen": -1.880415916442871, + "logps/rejected": -2.1714255809783936, + "loss": 1.4729, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.760831832885742, + "rewards/margins": 0.5820191502571106, + "rewards/rejected": -4.342851161956787, + "step": 7110 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 27.066333596234486, + "learning_rate": 1.951553510408252e-08, + "logits/chosen": -3.433607816696167, + "logits/rejected": -3.4108378887176514, + "logps/chosen": -2.0027976036071777, + "logps/rejected": -2.2430906295776367, + "loss": 1.6151, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.0055952072143555, + "rewards/margins": 0.4805859923362732, + "rewards/rejected": -4.486181259155273, + "step": 7120 + }, + { + "epoch": 1.2284631288766368, + "grad_norm": 21.50075127131134, + "learning_rate": 1.9442212093636433e-08, + "logits/chosen": -3.3498611450195312, + "logits/rejected": -3.3346786499023438, + "logps/chosen": -1.8847160339355469, + "logps/rejected": -2.117307186126709, + "loss": 1.5697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.7694320678710938, + "rewards/margins": 0.4651823043823242, + "rewards/rejected": -4.234614372253418, + "step": 7130 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 24.586267200027912, + "learning_rate": 1.936893934269433e-08, + "logits/chosen": -3.422037124633789, + "logits/rejected": -3.4198176860809326, + "logps/chosen": -1.909598708152771, + "logps/rejected": -2.078902006149292, + "loss": 1.6903, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.819197416305542, + "rewards/margins": 0.33860689401626587, + "rewards/rejected": -4.157804012298584, + "step": 7140 + }, + { + "epoch": 1.231909028256375, + "grad_norm": 29.309676191210244, + "learning_rate": 1.9295717513867323e-08, + "logits/chosen": -3.4672858715057373, + "logits/rejected": -3.454233169555664, + "logps/chosen": -2.017504930496216, + "logps/rejected": -2.189743995666504, + "loss": 1.6753, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.035009860992432, + "rewards/margins": 0.3444775938987732, + "rewards/rejected": -4.379487991333008, + "step": 7150 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 24.158548765768018, + "learning_rate": 1.922254726930607e-08, + "logits/chosen": -3.409709930419922, + "logits/rejected": -3.389134168624878, + "logps/chosen": -1.901498794555664, + "logps/rejected": -2.279109001159668, + "loss": 1.4182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.802997589111328, + "rewards/margins": 0.7552205324172974, + "rewards/rejected": -4.558218002319336, + "step": 7160 + }, + { + "epoch": 1.235354927636113, + "grad_norm": 24.413440603189493, + "learning_rate": 1.9149429270694706e-08, + "logits/chosen": -3.3829448223114014, + "logits/rejected": -3.370647430419922, + "logps/chosen": -2.003526210784912, + "logps/rejected": -2.2259905338287354, + "loss": 1.6143, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.007052421569824, + "rewards/margins": 0.4449276030063629, + "rewards/rejected": -4.451981067657471, + "step": 7170 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 28.61177622922394, + "learning_rate": 1.9076364179244935e-08, + "logits/chosen": -3.4648118019104004, + "logits/rejected": -3.4571194648742676, + "logps/chosen": -1.963698387145996, + "logps/rejected": -2.1662113666534424, + "loss": 1.6211, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.927396774291992, + "rewards/margins": 0.4050256609916687, + "rewards/rejected": -4.332422733306885, + "step": 7180 + }, + { + "epoch": 1.2388008270158513, + "grad_norm": 28.517029586687148, + "learning_rate": 1.9003352655689992e-08, + "logits/chosen": -3.3877291679382324, + "logits/rejected": -3.374415636062622, + "logps/chosen": -1.9687414169311523, + "logps/rejected": -2.222626209259033, + "loss": 1.5871, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.9374828338623047, + "rewards/margins": 0.5077705383300781, + "rewards/rejected": -4.445252418518066, + "step": 7190 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 26.119972231386463, + "learning_rate": 1.8930395360278723e-08, + "logits/chosen": -3.3869881629943848, + "logits/rejected": -3.3783411979675293, + "logps/chosen": -1.903299331665039, + "logps/rejected": -2.1868605613708496, + "loss": 1.5, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.806598663330078, + "rewards/margins": 0.5671221613883972, + "rewards/rejected": -4.373721122741699, + "step": 7200 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -3.474452495574951, + "eval_logits/rejected": -3.4707682132720947, + "eval_logps/chosen": -1.7960704565048218, + "eval_logps/rejected": -1.969786286354065, + "eval_loss": 1.6214957237243652, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -3.5921409130096436, + "eval_rewards/margins": 0.3474312424659729, + "eval_rewards/rejected": -3.93957257270813, + "eval_runtime": 157.3351, + "eval_samples_per_second": 27.356, + "eval_steps_per_second": 3.419, + "step": 7200 + }, + { + "epoch": 1.2422467263955892, + "grad_norm": 30.525540057079713, + "learning_rate": 1.885749295276955e-08, + "logits/chosen": -3.4597911834716797, + "logits/rejected": -3.4466140270233154, + "logps/chosen": -1.9704883098602295, + "logps/rejected": -2.149529218673706, + "loss": 1.6787, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.940976619720459, + "rewards/margins": 0.35808196663856506, + "rewards/rejected": -4.299058437347412, + "step": 7210 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 24.55879719838123, + "learning_rate": 1.878464609242457e-08, + "logits/chosen": -3.3795199394226074, + "logits/rejected": -3.3612728118896484, + "logps/chosen": -2.008685827255249, + "logps/rejected": -2.268606662750244, + "loss": 1.5754, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.017371654510498, + "rewards/margins": 0.519841194152832, + "rewards/rejected": -4.537213325500488, + "step": 7220 + }, + { + "epoch": 1.2456926257753274, + "grad_norm": 23.15724872891501, + "learning_rate": 1.8711855438003542e-08, + "logits/chosen": -3.398247480392456, + "logits/rejected": -3.387624740600586, + "logps/chosen": -1.9812662601470947, + "logps/rejected": -2.2011640071868896, + "loss": 1.6361, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9625325202941895, + "rewards/margins": 0.4397953152656555, + "rewards/rejected": -4.402328014373779, + "step": 7230 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 27.437101852460103, + "learning_rate": 1.8639121647757975e-08, + "logits/chosen": -3.4161744117736816, + "logits/rejected": -3.409580707550049, + "logps/chosen": -2.0851187705993652, + "logps/rejected": -2.1906774044036865, + "loss": 1.7574, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.1702375411987305, + "rewards/margins": 0.21111738681793213, + "rewards/rejected": -4.381354808807373, + "step": 7240 + }, + { + "epoch": 1.2491385251550655, + "grad_norm": 29.509978517914323, + "learning_rate": 1.8566445379425115e-08, + "logits/chosen": -3.4558043479919434, + "logits/rejected": -3.4388320446014404, + "logps/chosen": -1.9430596828460693, + "logps/rejected": -2.224996566772461, + "loss": 1.4816, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8861193656921387, + "rewards/margins": 0.5638741254806519, + "rewards/rejected": -4.449993133544922, + "step": 7250 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 24.593548950610973, + "learning_rate": 1.849382729022207e-08, + "logits/chosen": -3.4446864128112793, + "logits/rejected": -3.427727222442627, + "logps/chosen": -1.926947832107544, + "logps/rejected": -2.213374376296997, + "loss": 1.5212, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.853895664215088, + "rewards/margins": 0.5728530287742615, + "rewards/rejected": -4.426748752593994, + "step": 7260 + }, + { + "epoch": 1.2525844245348035, + "grad_norm": 26.9400382798036, + "learning_rate": 1.8421268036839798e-08, + "logits/chosen": -3.45466947555542, + "logits/rejected": -3.438530445098877, + "logps/chosen": -1.9682533740997314, + "logps/rejected": -2.226792812347412, + "loss": 1.5918, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.936506748199463, + "rewards/margins": 0.5170789957046509, + "rewards/rejected": -4.453585624694824, + "step": 7270 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 26.001555985843428, + "learning_rate": 1.834876827543721e-08, + "logits/chosen": -3.464675188064575, + "logits/rejected": -3.4466967582702637, + "logps/chosen": -1.945900559425354, + "logps/rejected": -2.272921323776245, + "loss": 1.4796, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.891801118850708, + "rewards/margins": 0.6540418863296509, + "rewards/rejected": -4.54584264755249, + "step": 7280 + }, + { + "epoch": 1.2560303239145416, + "grad_norm": 25.177088211615217, + "learning_rate": 1.827632866163525e-08, + "logits/chosen": -3.339931011199951, + "logits/rejected": -3.332799196243286, + "logps/chosen": -2.0049502849578857, + "logps/rejected": -2.201293706893921, + "loss": 1.6511, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.0099005699157715, + "rewards/margins": 0.39268702268600464, + "rewards/rejected": -4.402587413787842, + "step": 7290 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 28.713083258793436, + "learning_rate": 1.8203949850510903e-08, + "logits/chosen": -3.3153464794158936, + "logits/rejected": -3.3056511878967285, + "logps/chosen": -1.9276103973388672, + "logps/rejected": -2.1816084384918213, + "loss": 1.5617, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.8552207946777344, + "rewards/margins": 0.5079960823059082, + "rewards/rejected": -4.363216876983643, + "step": 7300 + }, + { + "epoch": 1.2577532736044108, + "eval_logits/chosen": -3.475193738937378, + "eval_logits/rejected": -3.4715256690979004, + "eval_logps/chosen": -1.801399827003479, + "eval_logps/rejected": -1.9757202863693237, + "eval_loss": 1.6209033727645874, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -3.602799654006958, + "eval_rewards/margins": 0.3486405611038208, + "eval_rewards/rejected": -3.9514405727386475, + "eval_runtime": 157.6214, + "eval_samples_per_second": 27.306, + "eval_steps_per_second": 3.413, + "step": 7300 + }, + { + "epoch": 1.2594762232942798, + "grad_norm": 26.206677320761408, + "learning_rate": 1.8131632496591348e-08, + "logits/chosen": -3.438716173171997, + "logits/rejected": -3.4253768920898438, + "logps/chosen": -1.9984452724456787, + "logps/rejected": -2.2209372520446777, + "loss": 1.621, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9968905448913574, + "rewards/margins": 0.44498366117477417, + "rewards/rejected": -4.4418745040893555, + "step": 7310 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 21.898411379783532, + "learning_rate": 1.8059377253847973e-08, + "logits/chosen": -3.4455208778381348, + "logits/rejected": -3.431145429611206, + "logps/chosen": -1.9408166408538818, + "logps/rejected": -2.1463263034820557, + "loss": 1.6414, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.8816332817077637, + "rewards/margins": 0.4110191762447357, + "rewards/rejected": -4.292652606964111, + "step": 7320 + }, + { + "epoch": 1.262922122674018, + "grad_norm": 26.674050417893888, + "learning_rate": 1.798718477569051e-08, + "logits/chosen": -3.407689332962036, + "logits/rejected": -3.392961025238037, + "logps/chosen": -1.887722373008728, + "logps/rejected": -2.216830253601074, + "loss": 1.4635, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.775444746017456, + "rewards/margins": 0.6582151055335999, + "rewards/rejected": -4.433660507202148, + "step": 7330 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 23.129096805836305, + "learning_rate": 1.791505571496109e-08, + "logits/chosen": -3.435410261154175, + "logits/rejected": -3.421022415161133, + "logps/chosen": -1.9085111618041992, + "logps/rejected": -2.1902804374694824, + "loss": 1.502, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.8170223236083984, + "rewards/margins": 0.5635384321212769, + "rewards/rejected": -4.380560874938965, + "step": 7340 + }, + { + "epoch": 1.266368022053756, + "grad_norm": 22.962708217603698, + "learning_rate": 1.7842990723928375e-08, + "logits/chosen": -3.4568963050842285, + "logits/rejected": -3.441082000732422, + "logps/chosen": -1.8931972980499268, + "logps/rejected": -2.124358654022217, + "loss": 1.5659, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.7863945960998535, + "rewards/margins": 0.4623230993747711, + "rewards/rejected": -4.248717308044434, + "step": 7350 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 20.651668921576057, + "learning_rate": 1.7770990454281608e-08, + "logits/chosen": -3.413623332977295, + "logits/rejected": -3.401118755340576, + "logps/chosen": -1.956380844116211, + "logps/rejected": -2.1796910762786865, + "loss": 1.6046, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.912761688232422, + "rewards/margins": 0.44662031531333923, + "rewards/rejected": -4.359382152557373, + "step": 7360 + }, + { + "epoch": 1.269813921433494, + "grad_norm": 23.917117964178978, + "learning_rate": 1.7699055557124793e-08, + "logits/chosen": -3.347390651702881, + "logits/rejected": -3.333719253540039, + "logps/chosen": -1.9537763595581055, + "logps/rejected": -2.22810435295105, + "loss": 1.5587, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.907552719116211, + "rewards/margins": 0.5486559867858887, + "rewards/rejected": -4.4562087059021, + "step": 7370 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 24.476396806418634, + "learning_rate": 1.7627186682970725e-08, + "logits/chosen": -3.4004147052764893, + "logits/rejected": -3.390237808227539, + "logps/chosen": -1.9585773944854736, + "logps/rejected": -2.2399234771728516, + "loss": 1.5472, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9171547889709473, + "rewards/margins": 0.562692403793335, + "rewards/rejected": -4.479846954345703, + "step": 7380 + }, + { + "epoch": 1.2732598208132322, + "grad_norm": 32.982928513457345, + "learning_rate": 1.755538448173518e-08, + "logits/chosen": -3.381897449493408, + "logits/rejected": -3.3701858520507812, + "logps/chosen": -1.9472194910049438, + "logps/rejected": -2.205005645751953, + "loss": 1.5625, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.8944389820098877, + "rewards/margins": 0.515572190284729, + "rewards/rejected": -4.410011291503906, + "step": 7390 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 26.450966835725108, + "learning_rate": 1.7483649602730987e-08, + "logits/chosen": -3.3891148567199707, + "logits/rejected": -3.36848783493042, + "logps/chosen": -2.011809825897217, + "logps/rejected": -2.3253235816955566, + "loss": 1.5496, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.023619651794434, + "rewards/margins": 0.627027153968811, + "rewards/rejected": -4.650647163391113, + "step": 7400 + }, + { + "epoch": 1.2749827705031014, + "eval_logits/chosen": -3.4813437461853027, + "eval_logits/rejected": -3.477717876434326, + "eval_logps/chosen": -1.8051471710205078, + "eval_logps/rejected": -1.9797757863998413, + "eval_loss": 1.62063467502594, + "eval_rewards/accuracies": 0.6240706443786621, + "eval_rewards/chosen": -3.6102943420410156, + "eval_rewards/margins": 0.3492574691772461, + "eval_rewards/rejected": -3.9595515727996826, + "eval_runtime": 157.6009, + "eval_samples_per_second": 27.309, + "eval_steps_per_second": 3.414, + "step": 7400 + }, + { + "epoch": 1.2767057201929704, + "grad_norm": 32.05266119130458, + "learning_rate": 1.741198269466219e-08, + "logits/chosen": -3.362455368041992, + "logits/rejected": -3.345698595046997, + "logps/chosen": -1.908453345298767, + "logps/rejected": -2.1892483234405518, + "loss": 1.5373, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.816906690597534, + "rewards/margins": 0.5615895390510559, + "rewards/rejected": -4.3784966468811035, + "step": 7410 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 22.725661696637907, + "learning_rate": 1.7340384405618133e-08, + "logits/chosen": -3.3294517993927, + "logits/rejected": -3.3147213459014893, + "logps/chosen": -1.8588364124298096, + "logps/rejected": -2.1416683197021484, + "loss": 1.5156, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.717672824859619, + "rewards/margins": 0.5656641125679016, + "rewards/rejected": -4.283336639404297, + "step": 7420 + }, + { + "epoch": 1.2801516195727085, + "grad_norm": 26.061897193575327, + "learning_rate": 1.7268855383067683e-08, + "logits/chosen": -3.3665523529052734, + "logits/rejected": -3.349184513092041, + "logps/chosen": -2.0268259048461914, + "logps/rejected": -2.327404499053955, + "loss": 1.5073, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.053651809692383, + "rewards/margins": 0.6011567711830139, + "rewards/rejected": -4.65480899810791, + "step": 7430 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 29.945238355188593, + "learning_rate": 1.7197396273853275e-08, + "logits/chosen": -3.4382128715515137, + "logits/rejected": -3.4236369132995605, + "logps/chosen": -2.0224671363830566, + "logps/rejected": -2.3610904216766357, + "loss": 1.5316, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.044934272766113, + "rewards/margins": 0.6772469878196716, + "rewards/rejected": -4.7221808433532715, + "step": 7440 + }, + { + "epoch": 1.2835975189524467, + "grad_norm": 26.898111740596832, + "learning_rate": 1.7126007724185164e-08, + "logits/chosen": -3.457066774368286, + "logits/rejected": -3.443593978881836, + "logps/chosen": -1.9754078388214111, + "logps/rejected": -2.231896162033081, + "loss": 1.5596, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9508156776428223, + "rewards/margins": 0.5129766464233398, + "rewards/rejected": -4.463792324066162, + "step": 7450 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 28.1463743839738, + "learning_rate": 1.705469037963548e-08, + "logits/chosen": -3.345510482788086, + "logits/rejected": -3.3476569652557373, + "logps/chosen": -1.9962208271026611, + "logps/rejected": -2.1702041625976562, + "loss": 1.6972, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.9924416542053223, + "rewards/margins": 0.34796610474586487, + "rewards/rejected": -4.3404083251953125, + "step": 7460 + }, + { + "epoch": 1.2870434183321846, + "grad_norm": 23.39682075490569, + "learning_rate": 1.698344488513247e-08, + "logits/chosen": -3.403573989868164, + "logits/rejected": -3.3935463428497314, + "logps/chosen": -1.9530484676361084, + "logps/rejected": -2.0903263092041016, + "loss": 1.7105, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.906096935272217, + "rewards/margins": 0.2745550572872162, + "rewards/rejected": -4.180652618408203, + "step": 7470 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 22.9361754451379, + "learning_rate": 1.691227188495461e-08, + "logits/chosen": -3.3874783515930176, + "logits/rejected": -3.3717143535614014, + "logps/chosen": -1.9332491159439087, + "logps/rejected": -2.1151700019836426, + "loss": 1.7023, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.8664982318878174, + "rewards/margins": 0.3638419806957245, + "rewards/rejected": -4.230340003967285, + "step": 7480 + }, + { + "epoch": 1.2904893177119228, + "grad_norm": 24.054673703740576, + "learning_rate": 1.684117202272485e-08, + "logits/chosen": -3.375999927520752, + "logits/rejected": -3.3681678771972656, + "logps/chosen": -1.944657325744629, + "logps/rejected": -2.2016220092773438, + "loss": 1.513, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.889314651489258, + "rewards/margins": 0.5139289498329163, + "rewards/rejected": -4.4032440185546875, + "step": 7490 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 21.163268587852414, + "learning_rate": 1.6770145941404697e-08, + "logits/chosen": -3.374875545501709, + "logits/rejected": -3.36232328414917, + "logps/chosen": -1.8727489709854126, + "logps/rejected": -2.1583404541015625, + "loss": 1.5583, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.745497941970825, + "rewards/margins": 0.571182370185852, + "rewards/rejected": -4.316680908203125, + "step": 7500 + }, + { + "epoch": 1.292212267401792, + "eval_logits/chosen": -3.470045804977417, + "eval_logits/rejected": -3.466383695602417, + "eval_logps/chosen": -1.807773470878601, + "eval_logps/rejected": -1.9827170372009277, + "eval_loss": 1.6202201843261719, + "eval_rewards/accuracies": 0.6229089498519897, + "eval_rewards/chosen": -3.615546941757202, + "eval_rewards/margins": 0.34988635778427124, + "eval_rewards/rejected": -3.9654340744018555, + "eval_runtime": 157.3781, + "eval_samples_per_second": 27.348, + "eval_steps_per_second": 3.419, + "step": 7500 + }, + { + "epoch": 1.293935217091661, + "grad_norm": 27.079062158238912, + "learning_rate": 1.669919428328847e-08, + "logits/chosen": -3.4050498008728027, + "logits/rejected": -3.3841845989227295, + "logps/chosen": -1.9427410364151, + "logps/rejected": -2.1552817821502686, + "loss": 1.5953, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.8854820728302, + "rewards/margins": 0.42508095502853394, + "rewards/rejected": -4.310563564300537, + "step": 7510 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 26.179394262063155, + "learning_rate": 1.66283176899975e-08, + "logits/chosen": -3.3894195556640625, + "logits/rejected": -3.378751754760742, + "logps/chosen": -1.9164543151855469, + "logps/rejected": -2.232022762298584, + "loss": 1.5026, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8329086303710938, + "rewards/margins": 0.6311367750167847, + "rewards/rejected": -4.464045524597168, + "step": 7520 + }, + { + "epoch": 1.297381116471399, + "grad_norm": 21.9974595055495, + "learning_rate": 1.6557516802474246e-08, + "logits/chosen": -3.3551948070526123, + "logits/rejected": -3.3541862964630127, + "logps/chosen": -1.955926537513733, + "logps/rejected": -2.119781494140625, + "loss": 1.6722, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.911853075027466, + "rewards/margins": 0.32771024107933044, + "rewards/rejected": -4.23956298828125, + "step": 7530 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 21.353327765822257, + "learning_rate": 1.648679226097662e-08, + "logits/chosen": -3.4556884765625, + "logits/rejected": -3.4519202709198, + "logps/chosen": -1.909246802330017, + "logps/rejected": -2.1231026649475098, + "loss": 1.6135, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.818493604660034, + "rewards/margins": 0.42771148681640625, + "rewards/rejected": -4.2462053298950195, + "step": 7540 + }, + { + "epoch": 1.3008270158511372, + "grad_norm": 29.476285644447685, + "learning_rate": 1.641614470507207e-08, + "logits/chosen": -3.4134578704833984, + "logits/rejected": -3.4021213054656982, + "logps/chosen": -1.9292001724243164, + "logps/rejected": -2.193645715713501, + "loss": 1.5068, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.858400344848633, + "rewards/margins": 0.5288910269737244, + "rewards/rejected": -4.387291431427002, + "step": 7550 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 30.023265902061187, + "learning_rate": 1.6345574773631897e-08, + "logits/chosen": -3.4435086250305176, + "logits/rejected": -3.4326624870300293, + "logps/chosen": -1.8687032461166382, + "logps/rejected": -2.3042094707489014, + "loss": 1.3859, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.7374064922332764, + "rewards/margins": 0.8710123896598816, + "rewards/rejected": -4.608418941497803, + "step": 7560 + }, + { + "epoch": 1.3042729152308752, + "grad_norm": 27.65828572543692, + "learning_rate": 1.627508310482541e-08, + "logits/chosen": -3.447045087814331, + "logits/rejected": -3.436596393585205, + "logps/chosen": -2.0481534004211426, + "logps/rejected": -2.186713933944702, + "loss": 1.7124, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.096306800842285, + "rewards/margins": 0.27712148427963257, + "rewards/rejected": -4.373427867889404, + "step": 7570 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 27.706272736007474, + "learning_rate": 1.6204670336114223e-08, + "logits/chosen": -3.4160475730895996, + "logits/rejected": -3.4042205810546875, + "logps/chosen": -1.9005578756332397, + "logps/rejected": -2.1121273040771484, + "loss": 1.6073, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.8011157512664795, + "rewards/margins": 0.4231385290622711, + "rewards/rejected": -4.224254608154297, + "step": 7580 + }, + { + "epoch": 1.3077188146106133, + "grad_norm": 32.3250814209345, + "learning_rate": 1.6134337104246395e-08, + "logits/chosen": -3.4417552947998047, + "logits/rejected": -3.4184608459472656, + "logps/chosen": -1.9968112707138062, + "logps/rejected": -2.315031051635742, + "loss": 1.4978, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9936225414276123, + "rewards/margins": 0.6364396810531616, + "rewards/rejected": -4.630062103271484, + "step": 7590 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 26.011833633976003, + "learning_rate": 1.6064084045250787e-08, + "logits/chosen": -3.422646999359131, + "logits/rejected": -3.406566619873047, + "logps/chosen": -1.9720538854599, + "logps/rejected": -2.267103672027588, + "loss": 1.5182, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9441077709198, + "rewards/margins": 0.5900996327400208, + "rewards/rejected": -4.534207344055176, + "step": 7600 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -3.4776861667633057, + "eval_logits/rejected": -3.4740731716156006, + "eval_logps/chosen": -1.809441328048706, + "eval_logps/rejected": -1.9844694137573242, + "eval_loss": 1.620276689529419, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.618882656097412, + "eval_rewards/margins": 0.3500562310218811, + "eval_rewards/rejected": -3.9689388275146484, + "eval_runtime": 157.3566, + "eval_samples_per_second": 27.352, + "eval_steps_per_second": 3.419, + "step": 7600 + }, + { + "epoch": 1.3111647139903515, + "grad_norm": 19.32512348783277, + "learning_rate": 1.5993911794431198e-08, + "logits/chosen": -3.396289110183716, + "logits/rejected": -3.3802058696746826, + "logps/chosen": -1.9614654779434204, + "logps/rejected": -2.270887851715088, + "loss": 1.4969, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.922930955886841, + "rewards/margins": 0.6188446283340454, + "rewards/rejected": -4.541775703430176, + "step": 7610 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 24.85906037798798, + "learning_rate": 1.59238209863607e-08, + "logits/chosen": -3.415778398513794, + "logits/rejected": -3.4012744426727295, + "logps/chosen": -1.9444528818130493, + "logps/rejected": -2.1241352558135986, + "loss": 1.6745, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8889057636260986, + "rewards/margins": 0.35936489701271057, + "rewards/rejected": -4.248270511627197, + "step": 7620 + }, + { + "epoch": 1.3146106133700897, + "grad_norm": 23.466156056914873, + "learning_rate": 1.5853812254875877e-08, + "logits/chosen": -3.382519483566284, + "logits/rejected": -3.387416124343872, + "logps/chosen": -2.0103797912597656, + "logps/rejected": -2.0849499702453613, + "loss": 1.8193, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.020759582519531, + "rewards/margins": 0.14914043247699738, + "rewards/rejected": -4.169899940490723, + "step": 7630 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 29.843754458539543, + "learning_rate": 1.5783886233071076e-08, + "logits/chosen": -3.3521697521209717, + "logits/rejected": -3.340270519256592, + "logps/chosen": -1.952166199684143, + "logps/rejected": -2.1371243000030518, + "loss": 1.6394, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.904332399368286, + "rewards/margins": 0.3699163794517517, + "rewards/rejected": -4.2742486000061035, + "step": 7640 + }, + { + "epoch": 1.3180565127498278, + "grad_norm": 25.89038795622965, + "learning_rate": 1.5714043553292683e-08, + "logits/chosen": -3.4430954456329346, + "logits/rejected": -3.428298234939575, + "logps/chosen": -2.0455856323242188, + "logps/rejected": -2.341370105743408, + "loss": 1.5221, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.0911712646484375, + "rewards/margins": 0.5915690064430237, + "rewards/rejected": -4.682740211486816, + "step": 7650 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 24.16557332226491, + "learning_rate": 1.564428484713345e-08, + "logits/chosen": -3.421440601348877, + "logits/rejected": -3.399498462677002, + "logps/chosen": -1.9118343591690063, + "logps/rejected": -2.283168077468872, + "loss": 1.4319, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8236687183380127, + "rewards/margins": 0.7426677346229553, + "rewards/rejected": -4.566336154937744, + "step": 7660 + }, + { + "epoch": 1.3215024121295658, + "grad_norm": 27.758444859141456, + "learning_rate": 1.5574610745426703e-08, + "logits/chosen": -3.384864330291748, + "logits/rejected": -3.368865966796875, + "logps/chosen": -1.99466073513031, + "logps/rejected": -2.232469081878662, + "loss": 1.6024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.98932147026062, + "rewards/margins": 0.475616991519928, + "rewards/rejected": -4.464938163757324, + "step": 7670 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 23.601115122360813, + "learning_rate": 1.550502187824073e-08, + "logits/chosen": -3.4323248863220215, + "logits/rejected": -3.421978712081909, + "logps/chosen": -1.9353773593902588, + "logps/rejected": -2.1206462383270264, + "loss": 1.6406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.8707547187805176, + "rewards/margins": 0.3705376982688904, + "rewards/rejected": -4.241292476654053, + "step": 7680 + }, + { + "epoch": 1.324948311509304, + "grad_norm": 30.616351933795052, + "learning_rate": 1.543551887487301e-08, + "logits/chosen": -3.480922222137451, + "logits/rejected": -3.457101345062256, + "logps/chosen": -1.8613784313201904, + "logps/rejected": -2.0936036109924316, + "loss": 1.555, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.722756862640381, + "rewards/margins": 0.46445074677467346, + "rewards/rejected": -4.187207221984863, + "step": 7690 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 26.18014506881206, + "learning_rate": 1.536610236384455e-08, + "logits/chosen": -3.4002883434295654, + "logits/rejected": -3.3873798847198486, + "logps/chosen": -1.8997806310653687, + "logps/rejected": -2.194857120513916, + "loss": 1.5097, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.7995612621307373, + "rewards/margins": 0.5901525616645813, + "rewards/rejected": -4.389714241027832, + "step": 7700 + }, + { + "epoch": 1.3266712611991731, + "eval_logits/chosen": -3.469599962234497, + "eval_logits/rejected": -3.465953826904297, + "eval_logps/chosen": -1.813297152519226, + "eval_logps/rejected": -1.9887665510177612, + "eval_loss": 1.6200501918792725, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.626594305038452, + "eval_rewards/margins": 0.3509384095668793, + "eval_rewards/rejected": -3.9775331020355225, + "eval_runtime": 157.5872, + "eval_samples_per_second": 27.312, + "eval_steps_per_second": 3.414, + "step": 7700 + }, + { + "epoch": 1.328394210889042, + "grad_norm": 26.387733959265823, + "learning_rate": 1.5296772972894213e-08, + "logits/chosen": -3.431591033935547, + "logits/rejected": -3.424315929412842, + "logps/chosen": -2.033977508544922, + "logps/rejected": -2.2427258491516113, + "loss": 1.6833, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.067955017089844, + "rewards/margins": 0.41749638319015503, + "rewards/rejected": -4.485451698303223, + "step": 7710 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 25.93849249259125, + "learning_rate": 1.5227531328972994e-08, + "logits/chosen": -3.404116153717041, + "logits/rejected": -3.3853867053985596, + "logps/chosen": -2.0194900035858154, + "logps/rejected": -2.242027759552002, + "loss": 1.5829, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.038980007171631, + "rewards/margins": 0.4450755715370178, + "rewards/rejected": -4.484055519104004, + "step": 7720 + }, + { + "epoch": 1.33184011026878, + "grad_norm": 22.952379101670545, + "learning_rate": 1.5158378058238442e-08, + "logits/chosen": -3.395073413848877, + "logits/rejected": -3.3841896057128906, + "logps/chosen": -1.9951908588409424, + "logps/rejected": -2.1871113777160645, + "loss": 1.627, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9903817176818848, + "rewards/margins": 0.3838415741920471, + "rewards/rejected": -4.374222755432129, + "step": 7730 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 25.363729125929293, + "learning_rate": 1.5089313786048885e-08, + "logits/chosen": -3.385986804962158, + "logits/rejected": -3.3771564960479736, + "logps/chosen": -1.9417400360107422, + "logps/rejected": -2.2417545318603516, + "loss": 1.5069, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8834800720214844, + "rewards/margins": 0.6000281572341919, + "rewards/rejected": -4.483509063720703, + "step": 7740 + }, + { + "epoch": 1.3352860096485184, + "grad_norm": 24.494344561440382, + "learning_rate": 1.5020339136957876e-08, + "logits/chosen": -3.3847453594207764, + "logits/rejected": -3.3666205406188965, + "logps/chosen": -1.9927974939346313, + "logps/rejected": -2.362952470779419, + "loss": 1.4732, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.9855949878692627, + "rewards/margins": 0.7403099536895752, + "rewards/rejected": -4.725904941558838, + "step": 7750 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 27.775884636933036, + "learning_rate": 1.4951454734708456e-08, + "logits/chosen": -3.3428657054901123, + "logits/rejected": -3.331902265548706, + "logps/chosen": -1.8926652669906616, + "logps/rejected": -2.16182279586792, + "loss": 1.5568, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.7853305339813232, + "rewards/margins": 0.5383151769638062, + "rewards/rejected": -4.32364559173584, + "step": 7760 + }, + { + "epoch": 1.3387319090282563, + "grad_norm": 24.199138850695153, + "learning_rate": 1.4882661202227597e-08, + "logits/chosen": -3.3631374835968018, + "logits/rejected": -3.3510169982910156, + "logps/chosen": -1.90177321434021, + "logps/rejected": -2.1965630054473877, + "loss": 1.445, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.80354642868042, + "rewards/margins": 0.5895800590515137, + "rewards/rejected": -4.393126010894775, + "step": 7770 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 30.46381388107373, + "learning_rate": 1.4813959161620502e-08, + "logits/chosen": -3.458533525466919, + "logits/rejected": -3.4487576484680176, + "logps/chosen": -2.0157530307769775, + "logps/rejected": -2.284475326538086, + "loss": 1.5522, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.031506061553955, + "rewards/margins": 0.5374442934989929, + "rewards/rejected": -4.568950653076172, + "step": 7780 + }, + { + "epoch": 1.3421778084079945, + "grad_norm": 29.46843784567394, + "learning_rate": 1.4745349234165017e-08, + "logits/chosen": -3.4063408374786377, + "logits/rejected": -3.3977673053741455, + "logps/chosen": -1.9838600158691406, + "logps/rejected": -2.221179485321045, + "loss": 1.5656, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9677200317382812, + "rewards/margins": 0.4746389389038086, + "rewards/rejected": -4.44235897064209, + "step": 7790 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 26.708655863270007, + "learning_rate": 1.4676832040305984e-08, + "logits/chosen": -3.4402592182159424, + "logits/rejected": -3.4374146461486816, + "logps/chosen": -2.000596523284912, + "logps/rejected": -2.229402542114258, + "loss": 1.5902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.001193046569824, + "rewards/margins": 0.45761117339134216, + "rewards/rejected": -4.458805084228516, + "step": 7800 + }, + { + "epoch": 1.3439007580978635, + "eval_logits/chosen": -3.4675936698913574, + "eval_logits/rejected": -3.4639360904693604, + "eval_logps/chosen": -1.8150808811187744, + "eval_logps/rejected": -1.9909459352493286, + "eval_loss": 1.6198712587356567, + "eval_rewards/accuracies": 0.6226765513420105, + "eval_rewards/chosen": -3.630161762237549, + "eval_rewards/margins": 0.351730078458786, + "eval_rewards/rejected": -3.9818918704986572, + "eval_runtime": 157.3993, + "eval_samples_per_second": 27.344, + "eval_steps_per_second": 3.418, + "step": 7800 + }, + { + "epoch": 1.3456237077877327, + "grad_norm": 26.894381761735506, + "learning_rate": 1.4608408199649686e-08, + "logits/chosen": -3.447389602661133, + "logits/rejected": -3.433283567428589, + "logps/chosen": -1.9880434274673462, + "logps/rejected": -2.242096424102783, + "loss": 1.6205, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9760868549346924, + "rewards/margins": 0.5081058740615845, + "rewards/rejected": -4.484192848205566, + "step": 7810 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 26.441711970463004, + "learning_rate": 1.4540078330958166e-08, + "logits/chosen": -3.423016309738159, + "logits/rejected": -3.4071621894836426, + "logps/chosen": -1.903602957725525, + "logps/rejected": -2.237743377685547, + "loss": 1.5381, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.80720591545105, + "rewards/margins": 0.6682804822921753, + "rewards/rejected": -4.475486755371094, + "step": 7820 + }, + { + "epoch": 1.3490696071674706, + "grad_norm": 29.57146483446172, + "learning_rate": 1.4471843052143696e-08, + "logits/chosen": -3.3673501014709473, + "logits/rejected": -3.362761974334717, + "logps/chosen": -2.0161023139953613, + "logps/rejected": -2.283580780029297, + "loss": 1.5604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.032204627990723, + "rewards/margins": 0.5349570512771606, + "rewards/rejected": -4.567161560058594, + "step": 7830 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 25.047888200810792, + "learning_rate": 1.4403702980263149e-08, + "logits/chosen": -3.3664028644561768, + "logits/rejected": -3.3540115356445312, + "logps/chosen": -1.9785391092300415, + "logps/rejected": -2.2338144779205322, + "loss": 1.5584, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.957078218460083, + "rewards/margins": 0.5105511546134949, + "rewards/rejected": -4.4676289558410645, + "step": 7840 + }, + { + "epoch": 1.3525155065472088, + "grad_norm": 26.83460735461663, + "learning_rate": 1.4335658731512452e-08, + "logits/chosen": -3.360334873199463, + "logits/rejected": -3.344198226928711, + "logps/chosen": -2.0348751544952393, + "logps/rejected": -2.158376932144165, + "loss": 1.7755, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.0697503089904785, + "rewards/margins": 0.2470030039548874, + "rewards/rejected": -4.31675386428833, + "step": 7850 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 18.573520040074147, + "learning_rate": 1.4267710921220974e-08, + "logits/chosen": -3.3762307167053223, + "logits/rejected": -3.3586812019348145, + "logps/chosen": -1.9171648025512695, + "logps/rejected": -2.2861616611480713, + "loss": 1.3882, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.834329605102539, + "rewards/margins": 0.7379932403564453, + "rewards/rejected": -4.572323322296143, + "step": 7860 + }, + { + "epoch": 1.355961405926947, + "grad_norm": 22.016532505221342, + "learning_rate": 1.4199860163846007e-08, + "logits/chosen": -3.3989098072052, + "logits/rejected": -3.3874964714050293, + "logps/chosen": -2.0208065509796143, + "logps/rejected": -2.2618932723999023, + "loss": 1.6054, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.0416131019592285, + "rewards/margins": 0.4821733832359314, + "rewards/rejected": -4.523786544799805, + "step": 7870 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 25.528419680160113, + "learning_rate": 1.4132107072967165e-08, + "logits/chosen": -3.431927442550659, + "logits/rejected": -3.423241376876831, + "logps/chosen": -1.9944210052490234, + "logps/rejected": -2.2488648891448975, + "loss": 1.5633, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.988842010498047, + "rewards/margins": 0.5088878870010376, + "rewards/rejected": -4.497729778289795, + "step": 7880 + }, + { + "epoch": 1.359407305306685, + "grad_norm": 28.5512300686002, + "learning_rate": 1.406445226128088e-08, + "logits/chosen": -3.392622470855713, + "logits/rejected": -3.3828914165496826, + "logps/chosen": -1.9571138620376587, + "logps/rejected": -2.2353758811950684, + "loss": 1.5548, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.9142277240753174, + "rewards/margins": 0.556524395942688, + "rewards/rejected": -4.470751762390137, + "step": 7890 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 24.65755426837438, + "learning_rate": 1.3996896340594791e-08, + "logits/chosen": -3.380554676055908, + "logits/rejected": -3.3789031505584717, + "logps/chosen": -1.9869747161865234, + "logps/rejected": -2.2179667949676514, + "loss": 1.633, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.973949432373047, + "rewards/margins": 0.4619844853878021, + "rewards/rejected": -4.435933589935303, + "step": 7900 + }, + { + "epoch": 1.361130254996554, + "eval_logits/chosen": -3.4657349586486816, + "eval_logits/rejected": -3.462076425552368, + "eval_logps/chosen": -1.8167805671691895, + "eval_logps/rejected": -1.9928126335144043, + "eval_loss": 1.6196552515029907, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.633561134338379, + "eval_rewards/margins": 0.3520638644695282, + "eval_rewards/rejected": -3.9856252670288086, + "eval_runtime": 157.3396, + "eval_samples_per_second": 27.355, + "eval_steps_per_second": 3.419, + "step": 7900 + }, + { + "epoch": 1.3628532046864232, + "grad_norm": 25.247198076011, + "learning_rate": 1.3929439921822333e-08, + "logits/chosen": -3.384026288986206, + "logits/rejected": -3.372326612472534, + "logps/chosen": -2.0295138359069824, + "logps/rejected": -2.209728479385376, + "loss": 1.7106, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.059027671813965, + "rewards/margins": 0.36042895913124084, + "rewards/rejected": -4.419456958770752, + "step": 7910 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 32.79638869985227, + "learning_rate": 1.3862083614977067e-08, + "logits/chosen": -3.38860821723938, + "logits/rejected": -3.3750407695770264, + "logps/chosen": -1.9914357662200928, + "logps/rejected": -2.198970079421997, + "loss": 1.6146, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9828715324401855, + "rewards/margins": 0.41506853699684143, + "rewards/rejected": -4.397940158843994, + "step": 7920 + }, + { + "epoch": 1.3662991040661612, + "grad_norm": 27.673882192445525, + "learning_rate": 1.3794828029167265e-08, + "logits/chosen": -3.420179843902588, + "logits/rejected": -3.404628038406372, + "logps/chosen": -2.0079662799835205, + "logps/rejected": -2.206331968307495, + "loss": 1.6384, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.015932559967041, + "rewards/margins": 0.39673200249671936, + "rewards/rejected": -4.41266393661499, + "step": 7930 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 28.591928687216477, + "learning_rate": 1.3727673772590375e-08, + "logits/chosen": -3.369938373565674, + "logits/rejected": -3.3600196838378906, + "logps/chosen": -2.047882080078125, + "logps/rejected": -2.264216661453247, + "loss": 1.6567, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.09576416015625, + "rewards/margins": 0.43266886472702026, + "rewards/rejected": -4.528433322906494, + "step": 7940 + }, + { + "epoch": 1.3697450034458993, + "grad_norm": 25.092929854025197, + "learning_rate": 1.3660621452527505e-08, + "logits/chosen": -3.338481903076172, + "logits/rejected": -3.3338160514831543, + "logps/chosen": -1.914878487586975, + "logps/rejected": -2.2363333702087402, + "loss": 1.4908, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.82975697517395, + "rewards/margins": 0.6429096460342407, + "rewards/rejected": -4.4726667404174805, + "step": 7950 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 25.476571050967276, + "learning_rate": 1.3593671675337953e-08, + "logits/chosen": -3.350330352783203, + "logits/rejected": -3.336066722869873, + "logps/chosen": -1.9914268255233765, + "logps/rejected": -2.284013271331787, + "loss": 1.5174, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.982853651046753, + "rewards/margins": 0.5851727724075317, + "rewards/rejected": -4.568026542663574, + "step": 7960 + }, + { + "epoch": 1.3731909028256375, + "grad_norm": 25.616682843272127, + "learning_rate": 1.3526825046453705e-08, + "logits/chosen": -3.408673048019409, + "logits/rejected": -3.3916003704071045, + "logps/chosen": -2.00944185256958, + "logps/rejected": -2.283245801925659, + "loss": 1.5278, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.01888370513916, + "rewards/margins": 0.5476077198982239, + "rewards/rejected": -4.566491603851318, + "step": 7970 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 27.65985791044064, + "learning_rate": 1.3460082170373988e-08, + "logits/chosen": -3.435096263885498, + "logits/rejected": -3.4277255535125732, + "logps/chosen": -2.0110349655151367, + "logps/rejected": -2.226926803588867, + "loss": 1.6241, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.022069931030273, + "rewards/margins": 0.43178391456604004, + "rewards/rejected": -4.453853607177734, + "step": 7980 + }, + { + "epoch": 1.3766368022053757, + "grad_norm": 24.310029905850055, + "learning_rate": 1.339344365065973e-08, + "logits/chosen": -3.45619535446167, + "logits/rejected": -3.450610637664795, + "logps/chosen": -1.9777438640594482, + "logps/rejected": -2.2055366039276123, + "loss": 1.6073, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9554877281188965, + "rewards/margins": 0.455585241317749, + "rewards/rejected": -4.411073207855225, + "step": 7990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 30.1440950327072, + "learning_rate": 1.3326910089928244e-08, + "logits/chosen": -3.36493182182312, + "logits/rejected": -3.3616042137145996, + "logps/chosen": -1.9936511516571045, + "logps/rejected": -2.1447365283966064, + "loss": 1.7, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.987302303314209, + "rewards/margins": 0.3021699786186218, + "rewards/rejected": -4.289473056793213, + "step": 8000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -3.467329740524292, + "eval_logits/rejected": -3.463688850402832, + "eval_logps/chosen": -1.8190900087356567, + "eval_logps/rejected": -1.9959551095962524, + "eval_loss": 1.618659496307373, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.6381800174713135, + "eval_rewards/margins": 0.35373014211654663, + "eval_rewards/rejected": -3.991910219192505, + "eval_runtime": 157.4262, + "eval_samples_per_second": 27.34, + "eval_steps_per_second": 3.417, + "step": 8000 + }, + { + "epoch": 1.3800827015851138, + "grad_norm": 24.053695518135942, + "learning_rate": 1.3260482089847603e-08, + "logits/chosen": -3.387152910232544, + "logits/rejected": -3.3760344982147217, + "logps/chosen": -1.9965543746948242, + "logps/rejected": -2.171565532684326, + "loss": 1.6777, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9931087493896484, + "rewards/margins": 0.35002273321151733, + "rewards/rejected": -4.343131065368652, + "step": 8010 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 27.963976294688106, + "learning_rate": 1.3194160251131364e-08, + "logits/chosen": -3.395268678665161, + "logits/rejected": -3.3748748302459717, + "logps/chosen": -2.047935962677002, + "logps/rejected": -2.3080224990844727, + "loss": 1.575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.095871925354004, + "rewards/margins": 0.5201730728149414, + "rewards/rejected": -4.616044998168945, + "step": 8020 + }, + { + "epoch": 1.3835286009648518, + "grad_norm": 25.725248586848185, + "learning_rate": 1.3127945173532989e-08, + "logits/chosen": -3.3948206901550293, + "logits/rejected": -3.387514114379883, + "logps/chosen": -1.9761632680892944, + "logps/rejected": -2.2747063636779785, + "loss": 1.527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.952326536178589, + "rewards/margins": 0.5970865488052368, + "rewards/rejected": -4.549412727355957, + "step": 8030 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 23.313282714777692, + "learning_rate": 1.3061837455840539e-08, + "logits/chosen": -3.367082118988037, + "logits/rejected": -3.3515219688415527, + "logps/chosen": -1.9621782302856445, + "logps/rejected": -2.1950974464416504, + "loss": 1.5812, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.924356460571289, + "rewards/margins": 0.4658389091491699, + "rewards/rejected": -4.390194892883301, + "step": 8040 + }, + { + "epoch": 1.38697450034459, + "grad_norm": 20.39497102298315, + "learning_rate": 1.2995837695871186e-08, + "logits/chosen": -3.4119842052459717, + "logits/rejected": -3.4028518199920654, + "logps/chosen": -1.903454065322876, + "logps/rejected": -2.1905739307403564, + "loss": 1.4988, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.806908130645752, + "rewards/margins": 0.5742399096488953, + "rewards/rejected": -4.381147861480713, + "step": 8050 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 24.53310877651224, + "learning_rate": 1.2929946490465854e-08, + "logits/chosen": -3.4353528022766113, + "logits/rejected": -3.413501262664795, + "logps/chosen": -1.9212827682495117, + "logps/rejected": -2.2820916175842285, + "loss": 1.4468, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8425655364990234, + "rewards/margins": 0.7216181755065918, + "rewards/rejected": -4.564183235168457, + "step": 8060 + }, + { + "epoch": 1.390420399724328, + "grad_norm": 25.394271917864298, + "learning_rate": 1.2864164435483777e-08, + "logits/chosen": -3.3801655769348145, + "logits/rejected": -3.364374876022339, + "logps/chosen": -1.9843060970306396, + "logps/rejected": -2.261038064956665, + "loss": 1.5964, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.9686121940612793, + "rewards/margins": 0.553463339805603, + "rewards/rejected": -4.52207612991333, + "step": 8070 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 30.860595978065348, + "learning_rate": 1.2798492125797144e-08, + "logits/chosen": -3.3658390045166016, + "logits/rejected": -3.3628525733947754, + "logps/chosen": -1.9341175556182861, + "logps/rejected": -2.181978940963745, + "loss": 1.5477, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8682351112365723, + "rewards/margins": 0.4957226812839508, + "rewards/rejected": -4.36395788192749, + "step": 8080 + }, + { + "epoch": 1.3938662991040662, + "grad_norm": 24.43054848111554, + "learning_rate": 1.273293015528571e-08, + "logits/chosen": -3.3655686378479004, + "logits/rejected": -3.349841594696045, + "logps/chosen": -1.951073408126831, + "logps/rejected": -2.283658742904663, + "loss": 1.4598, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.902146816253662, + "rewards/margins": 0.6651704907417297, + "rewards/rejected": -4.567317485809326, + "step": 8090 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 31.872028842176366, + "learning_rate": 1.2667479116831437e-08, + "logits/chosen": -3.3853225708007812, + "logits/rejected": -3.381335496902466, + "logps/chosen": -1.93051016330719, + "logps/rejected": -2.1573562622070312, + "loss": 1.5817, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.86102032661438, + "rewards/margins": 0.45369282364845276, + "rewards/rejected": -4.3147125244140625, + "step": 8100 + }, + { + "epoch": 1.3955892487939352, + "eval_logits/chosen": -3.4697673320770264, + "eval_logits/rejected": -3.4661450386047363, + "eval_logps/chosen": -1.8204541206359863, + "eval_logps/rejected": -1.9973745346069336, + "eval_loss": 1.6185389757156372, + "eval_rewards/accuracies": 0.6215148568153381, + "eval_rewards/chosen": -3.6409082412719727, + "eval_rewards/margins": 0.3538404703140259, + "eval_rewards/rejected": -3.994749069213867, + "eval_runtime": 157.4141, + "eval_samples_per_second": 27.342, + "eval_steps_per_second": 3.418, + "step": 8100 + }, + { + "epoch": 1.3973121984838044, + "grad_norm": 24.39914421145924, + "learning_rate": 1.2602139602313067e-08, + "logits/chosen": -3.412815809249878, + "logits/rejected": -3.392493724822998, + "logps/chosen": -1.9878780841827393, + "logps/rejected": -2.2591867446899414, + "loss": 1.5109, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.9757561683654785, + "rewards/margins": 0.5426172018051147, + "rewards/rejected": -4.518373489379883, + "step": 8110 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 25.552881776758507, + "learning_rate": 1.2536912202600907e-08, + "logits/chosen": -3.37101411819458, + "logits/rejected": -3.362802505493164, + "logps/chosen": -1.9416500329971313, + "logps/rejected": -2.152076244354248, + "loss": 1.6527, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.8833000659942627, + "rewards/margins": 0.4208518862724304, + "rewards/rejected": -4.304152488708496, + "step": 8120 + }, + { + "epoch": 1.4007580978635423, + "grad_norm": 26.749095501051336, + "learning_rate": 1.2471797507551324e-08, + "logits/chosen": -3.379565715789795, + "logits/rejected": -3.3698620796203613, + "logps/chosen": -1.9378643035888672, + "logps/rejected": -2.1496639251708984, + "loss": 1.6165, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.8757286071777344, + "rewards/margins": 0.4235994219779968, + "rewards/rejected": -4.299327850341797, + "step": 8130 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 26.027369181588593, + "learning_rate": 1.2406796106001527e-08, + "logits/chosen": -3.3630118370056152, + "logits/rejected": -3.3486838340759277, + "logps/chosen": -1.9520385265350342, + "logps/rejected": -2.2691187858581543, + "loss": 1.5069, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.9040770530700684, + "rewards/margins": 0.6341603994369507, + "rewards/rejected": -4.538237571716309, + "step": 8140 + }, + { + "epoch": 1.4042039972432805, + "grad_norm": 28.04967188535622, + "learning_rate": 1.2341908585764196e-08, + "logits/chosen": -3.4158425331115723, + "logits/rejected": -3.4048938751220703, + "logps/chosen": -1.9750175476074219, + "logps/rejected": -2.2691116333007812, + "loss": 1.5301, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9500350952148438, + "rewards/margins": 0.5881873965263367, + "rewards/rejected": -4.5382232666015625, + "step": 8150 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 25.35037653196265, + "learning_rate": 1.2277135533622174e-08, + "logits/chosen": -3.3781495094299316, + "logits/rejected": -3.3682217597961426, + "logps/chosen": -2.063432216644287, + "logps/rejected": -2.2864997386932373, + "loss": 1.5856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.126864433288574, + "rewards/margins": 0.4461355209350586, + "rewards/rejected": -4.572999477386475, + "step": 8160 + }, + { + "epoch": 1.4076498966230186, + "grad_norm": 33.291780061516015, + "learning_rate": 1.2212477535323157e-08, + "logits/chosen": -3.4032561779022217, + "logits/rejected": -3.3913605213165283, + "logps/chosen": -2.0481019020080566, + "logps/rejected": -2.1088626384735107, + "loss": 1.8471, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.096203804016113, + "rewards/margins": 0.12152198702096939, + "rewards/rejected": -4.2177252769470215, + "step": 8170 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 32.344292013653856, + "learning_rate": 1.2147935175574404e-08, + "logits/chosen": -3.3981502056121826, + "logits/rejected": -3.3869540691375732, + "logps/chosen": -1.991193413734436, + "logps/rejected": -2.2101128101348877, + "loss": 1.6149, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.982386827468872, + "rewards/margins": 0.4378379285335541, + "rewards/rejected": -4.420225620269775, + "step": 8180 + }, + { + "epoch": 1.4110957960027566, + "grad_norm": 24.344906234071164, + "learning_rate": 1.208350903803745e-08, + "logits/chosen": -3.386155605316162, + "logits/rejected": -3.3754405975341797, + "logps/chosen": -1.9523770809173584, + "logps/rejected": -2.1760060787200928, + "loss": 1.5975, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.904754161834717, + "rewards/margins": 0.4472580552101135, + "rewards/rejected": -4.3520121574401855, + "step": 8190 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 23.024775580610477, + "learning_rate": 1.2019199705322794e-08, + "logits/chosen": -3.4059650897979736, + "logits/rejected": -3.387110948562622, + "logps/chosen": -1.9609571695327759, + "logps/rejected": -2.3324732780456543, + "loss": 1.4193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.9219143390655518, + "rewards/margins": 0.7430321574211121, + "rewards/rejected": -4.664946556091309, + "step": 8200 + }, + { + "epoch": 1.4128187456926258, + "eval_logits/chosen": -3.4648759365081787, + "eval_logits/rejected": -3.4612419605255127, + "eval_logps/chosen": -1.8235642910003662, + "eval_logps/rejected": -2.00093412399292, + "eval_loss": 1.618219017982483, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.6471285820007324, + "eval_rewards/margins": 0.3547394871711731, + "eval_rewards/rejected": -4.00186824798584, + "eval_runtime": 157.6465, + "eval_samples_per_second": 27.302, + "eval_steps_per_second": 3.413, + "step": 8200 + }, + { + "epoch": 1.414541695382495, + "grad_norm": 22.92297891811176, + "learning_rate": 1.1955007758984717e-08, + "logits/chosen": -3.3256561756134033, + "logits/rejected": -3.3165574073791504, + "logps/chosen": -2.005265474319458, + "logps/rejected": -2.3060078620910645, + "loss": 1.5524, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.010530948638916, + "rewards/margins": 0.6014853119850159, + "rewards/rejected": -4.612015724182129, + "step": 8210 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 22.011209646916715, + "learning_rate": 1.1890933779515897e-08, + "logits/chosen": -3.4100406169891357, + "logits/rejected": -3.3952860832214355, + "logps/chosen": -1.975152611732483, + "logps/rejected": -2.231205463409424, + "loss": 1.5668, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.950305223464966, + "rewards/margins": 0.5121060609817505, + "rewards/rejected": -4.462410926818848, + "step": 8220 + }, + { + "epoch": 1.417987594762233, + "grad_norm": 25.713910990302878, + "learning_rate": 1.1826978346342301e-08, + "logits/chosen": -3.3839995861053467, + "logits/rejected": -3.3707923889160156, + "logps/chosen": -1.9730503559112549, + "logps/rejected": -2.222259521484375, + "loss": 1.5827, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.9461007118225098, + "rewards/margins": 0.49841880798339844, + "rewards/rejected": -4.44451904296875, + "step": 8230 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 27.315670249193147, + "learning_rate": 1.1763142037817806e-08, + "logits/chosen": -3.4466209411621094, + "logits/rejected": -3.430936813354492, + "logps/chosen": -1.9883838891983032, + "logps/rejected": -2.2041852474212646, + "loss": 1.5941, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9767677783966064, + "rewards/margins": 0.4316031336784363, + "rewards/rejected": -4.408370494842529, + "step": 8240 + }, + { + "epoch": 1.421433494141971, + "grad_norm": 23.103632735639405, + "learning_rate": 1.169942543121908e-08, + "logits/chosen": -3.3919436931610107, + "logits/rejected": -3.3783469200134277, + "logps/chosen": -1.959984540939331, + "logps/rejected": -2.244497299194336, + "loss": 1.5258, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.919969081878662, + "rewards/margins": 0.5690252780914307, + "rewards/rejected": -4.488994598388672, + "step": 8250 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 27.2735258775169, + "learning_rate": 1.1635829102740293e-08, + "logits/chosen": -3.4555351734161377, + "logits/rejected": -3.44490122795105, + "logps/chosen": -1.9406276941299438, + "logps/rejected": -2.177938938140869, + "loss": 1.559, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.8812553882598877, + "rewards/margins": 0.47462278604507446, + "rewards/rejected": -4.355877876281738, + "step": 8260 + }, + { + "epoch": 1.4248793935217092, + "grad_norm": 26.247737239471427, + "learning_rate": 1.1572353627487949e-08, + "logits/chosen": -3.4395382404327393, + "logits/rejected": -3.434749126434326, + "logps/chosen": -1.9956096410751343, + "logps/rejected": -2.1957523822784424, + "loss": 1.6752, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.9912192821502686, + "rewards/margins": 0.400285005569458, + "rewards/rejected": -4.391504764556885, + "step": 8270 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 24.857683121876452, + "learning_rate": 1.1508999579475653e-08, + "logits/chosen": -3.386610507965088, + "logits/rejected": -3.3809409141540527, + "logps/chosen": -2.036158561706543, + "logps/rejected": -2.297449827194214, + "loss": 1.5632, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.072317123413086, + "rewards/margins": 0.5225826501846313, + "rewards/rejected": -4.594899654388428, + "step": 8280 + }, + { + "epoch": 1.4283252929014472, + "grad_norm": 23.27937108107849, + "learning_rate": 1.1445767531618943e-08, + "logits/chosen": -3.377610445022583, + "logits/rejected": -3.3523318767547607, + "logps/chosen": -1.980289101600647, + "logps/rejected": -2.225275754928589, + "loss": 1.5591, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.960578203201294, + "rewards/margins": 0.48997363448143005, + "rewards/rejected": -4.450551509857178, + "step": 8290 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 24.307563713978848, + "learning_rate": 1.1382658055730096e-08, + "logits/chosen": -3.4683947563171387, + "logits/rejected": -3.4564273357391357, + "logps/chosen": -2.030820608139038, + "logps/rejected": -2.251183271408081, + "loss": 1.6206, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.061641216278076, + "rewards/margins": 0.4407258629798889, + "rewards/rejected": -4.502366542816162, + "step": 8300 + }, + { + "epoch": 1.4300482425913164, + "eval_logits/chosen": -3.457481622695923, + "eval_logits/rejected": -3.453824520111084, + "eval_logps/chosen": -1.8269370794296265, + "eval_logps/rejected": -2.0049080848693848, + "eval_loss": 1.6176546812057495, + "eval_rewards/accuracies": 0.6226765513420105, + "eval_rewards/chosen": -3.653874158859253, + "eval_rewards/margins": 0.35594162344932556, + "eval_rewards/rejected": -4.0098161697387695, + "eval_runtime": 157.5842, + "eval_samples_per_second": 27.312, + "eval_steps_per_second": 3.414, + "step": 8300 + }, + { + "epoch": 1.4317711922811853, + "grad_norm": 28.962009473270466, + "learning_rate": 1.1319671722512957e-08, + "logits/chosen": -3.3293678760528564, + "logits/rejected": -3.3124401569366455, + "logps/chosen": -1.9799387454986572, + "logps/rejected": -2.210078239440918, + "loss": 1.5887, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9598774909973145, + "rewards/margins": 0.460279643535614, + "rewards/rejected": -4.420156478881836, + "step": 8310 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 23.011748536131623, + "learning_rate": 1.1256809101557793e-08, + "logits/chosen": -3.3953299522399902, + "logits/rejected": -3.385774612426758, + "logps/chosen": -1.9330298900604248, + "logps/rejected": -2.240124225616455, + "loss": 1.4835, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.8660597801208496, + "rewards/margins": 0.6141878366470337, + "rewards/rejected": -4.48024845123291, + "step": 8320 + }, + { + "epoch": 1.4352170916609235, + "grad_norm": 31.43268637076267, + "learning_rate": 1.1194070761336133e-08, + "logits/chosen": -3.3896522521972656, + "logits/rejected": -3.3845648765563965, + "logps/chosen": -1.9936281442642212, + "logps/rejected": -2.32498836517334, + "loss": 1.4899, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9872562885284424, + "rewards/margins": 0.662720799446106, + "rewards/rejected": -4.64997673034668, + "step": 8330 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 24.612643280636764, + "learning_rate": 1.11314572691956e-08, + "logits/chosen": -3.4313578605651855, + "logits/rejected": -3.4227542877197266, + "logps/chosen": -2.0301923751831055, + "logps/rejected": -2.2075698375701904, + "loss": 1.677, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.060384750366211, + "rewards/margins": 0.35475558042526245, + "rewards/rejected": -4.415139675140381, + "step": 8340 + }, + { + "epoch": 1.4386629910406616, + "grad_norm": 27.32869331571831, + "learning_rate": 1.106896919135483e-08, + "logits/chosen": -3.3360977172851562, + "logits/rejected": -3.3270676136016846, + "logps/chosen": -2.035007953643799, + "logps/rejected": -2.254589557647705, + "loss": 1.6194, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.070015907287598, + "rewards/margins": 0.4391636848449707, + "rewards/rejected": -4.50917911529541, + "step": 8350 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 24.634757902079727, + "learning_rate": 1.1006607092898326e-08, + "logits/chosen": -3.335069179534912, + "logits/rejected": -3.3157219886779785, + "logps/chosen": -1.9235599040985107, + "logps/rejected": -2.264448404312134, + "loss": 1.4669, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.8471198081970215, + "rewards/margins": 0.6817767024040222, + "rewards/rejected": -4.528896808624268, + "step": 8360 + }, + { + "epoch": 1.4421088904203998, + "grad_norm": 26.108501288432787, + "learning_rate": 1.0944371537771346e-08, + "logits/chosen": -3.3765196800231934, + "logits/rejected": -3.366983413696289, + "logps/chosen": -1.9752134084701538, + "logps/rejected": -2.2964110374450684, + "loss": 1.5523, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9504268169403076, + "rewards/margins": 0.6423947811126709, + "rewards/rejected": -4.592822074890137, + "step": 8370 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 23.446002839286262, + "learning_rate": 1.0882263088774809e-08, + "logits/chosen": -3.4567954540252686, + "logits/rejected": -3.447446823120117, + "logps/chosen": -1.9610036611557007, + "logps/rejected": -2.2812585830688477, + "loss": 1.4908, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9220073223114014, + "rewards/margins": 0.6405099630355835, + "rewards/rejected": -4.562517166137695, + "step": 8380 + }, + { + "epoch": 1.4455547898001377, + "grad_norm": 27.869128468985636, + "learning_rate": 1.0820282307560197e-08, + "logits/chosen": -3.4142963886260986, + "logits/rejected": -3.3978607654571533, + "logps/chosen": -1.9912614822387695, + "logps/rejected": -2.3595657348632812, + "loss": 1.3999, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.982522964477539, + "rewards/margins": 0.736608624458313, + "rewards/rejected": -4.7191314697265625, + "step": 8390 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 24.08150447730423, + "learning_rate": 1.075842975462449e-08, + "logits/chosen": -3.4053969383239746, + "logits/rejected": -3.39345121383667, + "logps/chosen": -1.9117761850357056, + "logps/rejected": -2.168217658996582, + "loss": 1.5122, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.823552370071411, + "rewards/margins": 0.5128829479217529, + "rewards/rejected": -4.336435317993164, + "step": 8400 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -3.469801664352417, + "eval_logits/rejected": -3.4662044048309326, + "eval_logps/chosen": -1.8295938968658447, + "eval_logps/rejected": -2.007685899734497, + "eval_loss": 1.617708683013916, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.6591877937316895, + "eval_rewards/margins": 0.3561842441558838, + "eval_rewards/rejected": -4.015371799468994, + "eval_runtime": 157.3786, + "eval_samples_per_second": 27.348, + "eval_steps_per_second": 3.419, + "step": 8400 + }, + { + "epoch": 1.449000689179876, + "grad_norm": 25.492888706239675, + "learning_rate": 1.0696705989305086e-08, + "logits/chosen": -3.3694286346435547, + "logits/rejected": -3.3493599891662598, + "logps/chosen": -2.0644690990448, + "logps/rejected": -2.4158430099487305, + "loss": 1.4808, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.1289381980896, + "rewards/margins": 0.7027474045753479, + "rewards/rejected": -4.831686019897461, + "step": 8410 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 23.98613092221292, + "learning_rate": 1.0635111569774754e-08, + "logits/chosen": -3.3145041465759277, + "logits/rejected": -3.3071727752685547, + "logps/chosen": -1.9745814800262451, + "logps/rejected": -2.1936869621276855, + "loss": 1.6265, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9491629600524902, + "rewards/margins": 0.4382103979587555, + "rewards/rejected": -4.387373924255371, + "step": 8420 + }, + { + "epoch": 1.452446588559614, + "grad_norm": 20.913906401631817, + "learning_rate": 1.0573647053036552e-08, + "logits/chosen": -3.392817974090576, + "logits/rejected": -3.3849518299102783, + "logps/chosen": -1.9802820682525635, + "logps/rejected": -2.276160478591919, + "loss": 1.5288, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.960564136505127, + "rewards/margins": 0.5917571783065796, + "rewards/rejected": -4.552320957183838, + "step": 8430 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 28.471531119488024, + "learning_rate": 1.0512312994918865e-08, + "logits/chosen": -3.4124526977539062, + "logits/rejected": -3.402336597442627, + "logps/chosen": -2.020848512649536, + "logps/rejected": -2.267127513885498, + "loss": 1.5868, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.041697025299072, + "rewards/margins": 0.49255818128585815, + "rewards/rejected": -4.534255027770996, + "step": 8440 + }, + { + "epoch": 1.4558924879393522, + "grad_norm": 22.93754699686189, + "learning_rate": 1.0451109950070276e-08, + "logits/chosen": -3.3442673683166504, + "logits/rejected": -3.3406822681427, + "logps/chosen": -1.9053049087524414, + "logps/rejected": -2.188507318496704, + "loss": 1.5381, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.810609817504883, + "rewards/margins": 0.5664048194885254, + "rewards/rejected": -4.377014636993408, + "step": 8450 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 23.05426808623122, + "learning_rate": 1.039003847195466e-08, + "logits/chosen": -3.403611421585083, + "logits/rejected": -3.3893253803253174, + "logps/chosen": -1.9598493576049805, + "logps/rejected": -2.3451712131500244, + "loss": 1.3781, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.919698715209961, + "rewards/margins": 0.7706438302993774, + "rewards/rejected": -4.690342426300049, + "step": 8460 + }, + { + "epoch": 1.4593383873190904, + "grad_norm": 31.020412782906355, + "learning_rate": 1.0329099112846071e-08, + "logits/chosen": -3.3857390880584717, + "logits/rejected": -3.370079517364502, + "logps/chosen": -2.070085048675537, + "logps/rejected": -2.3278279304504395, + "loss": 1.5948, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.140170097351074, + "rewards/margins": 0.5154857039451599, + "rewards/rejected": -4.655655860900879, + "step": 8470 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 25.931584347786316, + "learning_rate": 1.0268292423823838e-08, + "logits/chosen": -3.3920159339904785, + "logits/rejected": -3.3763740062713623, + "logps/chosen": -1.9758516550064087, + "logps/rejected": -2.25807785987854, + "loss": 1.5573, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9517033100128174, + "rewards/margins": 0.5644528269767761, + "rewards/rejected": -4.51615571975708, + "step": 8480 + }, + { + "epoch": 1.4627842866988283, + "grad_norm": 24.285055566752966, + "learning_rate": 1.020761895476753e-08, + "logits/chosen": -3.410891056060791, + "logits/rejected": -3.406705141067505, + "logps/chosen": -2.061661958694458, + "logps/rejected": -2.282742977142334, + "loss": 1.6355, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.123323917388916, + "rewards/margins": 0.4421631395816803, + "rewards/rejected": -4.565485954284668, + "step": 8490 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 22.018007918985788, + "learning_rate": 1.0147079254352e-08, + "logits/chosen": -3.344536542892456, + "logits/rejected": -3.336181163787842, + "logps/chosen": -2.0097360610961914, + "logps/rejected": -2.288510799407959, + "loss": 1.5508, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.019472122192383, + "rewards/margins": 0.5575498938560486, + "rewards/rejected": -4.577021598815918, + "step": 8500 + }, + { + "epoch": 1.4645072363886975, + "eval_logits/chosen": -3.4659881591796875, + "eval_logits/rejected": -3.46238374710083, + "eval_logps/chosen": -1.829746961593628, + "eval_logps/rejected": -2.007535934448242, + "eval_loss": 1.6178438663482666, + "eval_rewards/accuracies": 0.6210501790046692, + "eval_rewards/chosen": -3.659493923187256, + "eval_rewards/margins": 0.35557809472084045, + "eval_rewards/rejected": -4.015071868896484, + "eval_runtime": 157.5001, + "eval_samples_per_second": 27.327, + "eval_steps_per_second": 3.416, + "step": 8500 + }, + { + "epoch": 1.4662301860785665, + "grad_norm": 29.927002761465673, + "learning_rate": 1.008667387004242e-08, + "logits/chosen": -3.3850531578063965, + "logits/rejected": -3.3657753467559814, + "logps/chosen": -2.015775203704834, + "logps/rejected": -2.316255569458008, + "loss": 1.585, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.031550407409668, + "rewards/margins": 0.6009604334831238, + "rewards/rejected": -4.632511138916016, + "step": 8510 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 24.365033534833103, + "learning_rate": 1.0026403348089329e-08, + "logits/chosen": -3.375797748565674, + "logits/rejected": -3.356057643890381, + "logps/chosen": -1.9623439311981201, + "logps/rejected": -2.335493326187134, + "loss": 1.4468, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9246878623962402, + "rewards/margins": 0.7462981939315796, + "rewards/rejected": -4.670986652374268, + "step": 8520 + }, + { + "epoch": 1.4696760854583046, + "grad_norm": 29.128322800880827, + "learning_rate": 9.9662682335237e-09, + "logits/chosen": -3.3661162853240967, + "logits/rejected": -3.357314348220825, + "logps/chosen": -2.018242359161377, + "logps/rejected": -2.232515811920166, + "loss": 1.6313, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.036484718322754, + "rewards/margins": 0.428547203540802, + "rewards/rejected": -4.465031623840332, + "step": 8530 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 25.277307960004222, + "learning_rate": 9.906269070152004e-09, + "logits/chosen": -3.4445273876190186, + "logits/rejected": -3.4365055561065674, + "logps/chosen": -1.9746322631835938, + "logps/rejected": -2.225862741470337, + "loss": 1.5405, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9492645263671875, + "rewards/margins": 0.5024608373641968, + "rewards/rejected": -4.451725482940674, + "step": 8540 + }, + { + "epoch": 1.4731219848380428, + "grad_norm": 27.659877156748887, + "learning_rate": 9.846406400551307e-09, + "logits/chosen": -3.4056105613708496, + "logits/rejected": -3.4019649028778076, + "logps/chosen": -1.9980108737945557, + "logps/rejected": -2.213505983352661, + "loss": 1.6424, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.9960217475891113, + "rewards/margins": 0.4309898912906647, + "rewards/rejected": -4.427011966705322, + "step": 8550 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 25.147034483547706, + "learning_rate": 9.786680766064318e-09, + "logits/chosen": -3.458066940307617, + "logits/rejected": -3.4464447498321533, + "logps/chosen": -1.9665758609771729, + "logps/rejected": -2.2171244621276855, + "loss": 1.5696, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9331517219543457, + "rewards/margins": 0.5010970830917358, + "rewards/rejected": -4.434248924255371, + "step": 8560 + }, + { + "epoch": 1.476567884217781, + "grad_norm": 27.117195935732052, + "learning_rate": 9.727092706794554e-09, + "logits/chosen": -3.362035036087036, + "logits/rejected": -3.3484511375427246, + "logps/chosen": -1.993886947631836, + "logps/rejected": -2.2900900840759277, + "loss": 1.5089, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.987773895263672, + "rewards/margins": 0.5924067497253418, + "rewards/rejected": -4.5801801681518555, + "step": 8570 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 27.710183335262784, + "learning_rate": 9.667642761601433e-09, + "logits/chosen": -3.4176673889160156, + "logits/rejected": -3.4059174060821533, + "logps/chosen": -1.9150145053863525, + "logps/rejected": -2.2272796630859375, + "loss": 1.4779, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.830029010772705, + "rewards/margins": 0.6245306134223938, + "rewards/rejected": -4.454559326171875, + "step": 8580 + }, + { + "epoch": 1.480013783597519, + "grad_norm": 28.350401355000894, + "learning_rate": 9.608331468095376e-09, + "logits/chosen": -3.431151866912842, + "logits/rejected": -3.416367769241333, + "logps/chosen": -1.9471371173858643, + "logps/rejected": -2.232456922531128, + "loss": 1.5023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8942742347717285, + "rewards/margins": 0.5706399083137512, + "rewards/rejected": -4.464913845062256, + "step": 8590 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 23.08110149076074, + "learning_rate": 9.549159362632986e-09, + "logits/chosen": -3.3701159954071045, + "logits/rejected": -3.353144407272339, + "logps/chosen": -1.9776432514190674, + "logps/rejected": -2.24200177192688, + "loss": 1.5254, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.9552865028381348, + "rewards/margins": 0.5287173390388489, + "rewards/rejected": -4.48400354385376, + "step": 8600 + }, + { + "epoch": 1.481736733287388, + "eval_logits/chosen": -3.4597294330596924, + "eval_logits/rejected": -3.4560935497283936, + "eval_logps/chosen": -1.8311941623687744, + "eval_logps/rejected": -2.0092029571533203, + "eval_loss": 1.6179639101028442, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.662388324737549, + "eval_rewards/margins": 0.3560180068016052, + "eval_rewards/rejected": -4.018405914306641, + "eval_runtime": 157.512, + "eval_samples_per_second": 27.325, + "eval_steps_per_second": 3.416, + "step": 8600 + }, + { + "epoch": 1.483459682977257, + "grad_norm": 26.55102219981302, + "learning_rate": 9.490126980312165e-09, + "logits/chosen": -3.3869385719299316, + "logits/rejected": -3.373847246170044, + "logps/chosen": -1.9809229373931885, + "logps/rejected": -2.1907382011413574, + "loss": 1.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.961845874786377, + "rewards/margins": 0.4196307063102722, + "rewards/rejected": -4.381476402282715, + "step": 8610 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 26.704517235622813, + "learning_rate": 9.43123485496729e-09, + "logits/chosen": -3.3466014862060547, + "logits/rejected": -3.335735321044922, + "logps/chosen": -2.031029462814331, + "logps/rejected": -2.2389895915985107, + "loss": 1.6266, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.062058925628662, + "rewards/margins": 0.4159201979637146, + "rewards/rejected": -4.4779791831970215, + "step": 8620 + }, + { + "epoch": 1.4869055823569952, + "grad_norm": 28.971732297608895, + "learning_rate": 9.372483519164398e-09, + "logits/chosen": -3.3234505653381348, + "logits/rejected": -3.3152923583984375, + "logps/chosen": -1.9345057010650635, + "logps/rejected": -2.1689133644104004, + "loss": 1.5695, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.869011402130127, + "rewards/margins": 0.4688156545162201, + "rewards/rejected": -4.337826728820801, + "step": 8630 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 24.18784260267253, + "learning_rate": 9.313873504196313e-09, + "logits/chosen": -3.4110519886016846, + "logits/rejected": -3.3955531120300293, + "logps/chosen": -1.9623100757598877, + "logps/rejected": -2.253459930419922, + "loss": 1.4975, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9246201515197754, + "rewards/margins": 0.5822996497154236, + "rewards/rejected": -4.506919860839844, + "step": 8640 + }, + { + "epoch": 1.4903514817367332, + "grad_norm": 26.857579368121833, + "learning_rate": 9.255405340077949e-09, + "logits/chosen": -3.3653597831726074, + "logits/rejected": -3.3547444343566895, + "logps/chosen": -1.9732005596160889, + "logps/rejected": -2.2485904693603516, + "loss": 1.5472, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9464011192321777, + "rewards/margins": 0.5507797002792358, + "rewards/rejected": -4.497180938720703, + "step": 8650 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 28.148328575151062, + "learning_rate": 9.197079555541378e-09, + "logits/chosen": -3.3720264434814453, + "logits/rejected": -3.364607334136963, + "logps/chosen": -1.9655357599258423, + "logps/rejected": -2.219949722290039, + "loss": 1.5992, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9310715198516846, + "rewards/margins": 0.5088284611701965, + "rewards/rejected": -4.439899444580078, + "step": 8660 + }, + { + "epoch": 1.4937973811164715, + "grad_norm": 24.063987097771488, + "learning_rate": 9.138896678031201e-09, + "logits/chosen": -3.4277827739715576, + "logits/rejected": -3.4175171852111816, + "logps/chosen": -2.0314345359802246, + "logps/rejected": -2.327117443084717, + "loss": 1.5029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.062869071960449, + "rewards/margins": 0.5913657546043396, + "rewards/rejected": -4.654234886169434, + "step": 8670 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 27.62779711802808, + "learning_rate": 9.080857233699624e-09, + "logits/chosen": -3.4015936851501465, + "logits/rejected": -3.396472454071045, + "logps/chosen": -1.9991798400878906, + "logps/rejected": -2.2181575298309326, + "loss": 1.582, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9983596801757812, + "rewards/margins": 0.4379548132419586, + "rewards/rejected": -4.436315059661865, + "step": 8680 + }, + { + "epoch": 1.4972432804962095, + "grad_norm": 25.280308892547275, + "learning_rate": 9.022961747401842e-09, + "logits/chosen": -3.407639265060425, + "logits/rejected": -3.3939144611358643, + "logps/chosen": -1.9851402044296265, + "logps/rejected": -2.13613224029541, + "loss": 1.7039, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.970280408859253, + "rewards/margins": 0.3019842505455017, + "rewards/rejected": -4.27226448059082, + "step": 8690 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 22.929463642749656, + "learning_rate": 8.96521074269117e-09, + "logits/chosen": -3.408776044845581, + "logits/rejected": -3.390958070755005, + "logps/chosen": -1.9846868515014648, + "logps/rejected": -2.240658760070801, + "loss": 1.5461, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9693737030029297, + "rewards/margins": 0.5119439363479614, + "rewards/rejected": -4.481317520141602, + "step": 8700 + }, + { + "epoch": 1.4989662301860784, + "eval_logits/chosen": -3.45639967918396, + "eval_logits/rejected": -3.452752113342285, + "eval_logps/chosen": -1.8334901332855225, + "eval_logps/rejected": -2.0118517875671387, + "eval_loss": 1.6176307201385498, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.666980266571045, + "eval_rewards/margins": 0.35672298073768616, + "eval_rewards/rejected": -4.023703575134277, + "eval_runtime": 157.4338, + "eval_samples_per_second": 27.338, + "eval_steps_per_second": 3.417, + "step": 8700 + }, + { + "epoch": 1.5006891798759476, + "grad_norm": 23.571241462241137, + "learning_rate": 8.907604741814404e-09, + "logits/chosen": -3.4024734497070312, + "logits/rejected": -3.394758701324463, + "logps/chosen": -1.9411218166351318, + "logps/rejected": -2.1511824131011963, + "loss": 1.6276, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8822436332702637, + "rewards/margins": 0.4201214909553528, + "rewards/rejected": -4.302364826202393, + "step": 8710 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 23.883452027797357, + "learning_rate": 8.850144265707039e-09, + "logits/chosen": -3.385542631149292, + "logits/rejected": -3.3734371662139893, + "logps/chosen": -1.9368032217025757, + "logps/rejected": -2.178506374359131, + "loss": 1.5414, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.8736064434051514, + "rewards/margins": 0.483405739068985, + "rewards/rejected": -4.357012748718262, + "step": 8720 + }, + { + "epoch": 1.5041350792556858, + "grad_norm": 29.321722200946986, + "learning_rate": 8.792829833988588e-09, + "logits/chosen": -3.3948593139648438, + "logits/rejected": -3.3793387413024902, + "logps/chosen": -1.9272693395614624, + "logps/rejected": -2.234009265899658, + "loss": 1.4992, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.854538679122925, + "rewards/margins": 0.6134799122810364, + "rewards/rejected": -4.468018531799316, + "step": 8730 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 24.514903935737344, + "learning_rate": 8.73566196495787e-09, + "logits/chosen": -3.3680412769317627, + "logits/rejected": -3.361706495285034, + "logps/chosen": -1.976222038269043, + "logps/rejected": -2.251128673553467, + "loss": 1.5564, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.952444076538086, + "rewards/margins": 0.5498131513595581, + "rewards/rejected": -4.502257347106934, + "step": 8740 + }, + { + "epoch": 1.5075809786354237, + "grad_norm": 28.54180902275943, + "learning_rate": 8.678641175588324e-09, + "logits/chosen": -3.4019246101379395, + "logits/rejected": -3.3866848945617676, + "logps/chosen": -1.9768030643463135, + "logps/rejected": -2.3179221153259277, + "loss": 1.5136, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.953606128692627, + "rewards/margins": 0.6822377443313599, + "rewards/rejected": -4.6358442306518555, + "step": 8750 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 24.288262025725373, + "learning_rate": 8.621767981523351e-09, + "logits/chosen": -3.3662846088409424, + "logits/rejected": -3.358330249786377, + "logps/chosen": -1.9504038095474243, + "logps/rejected": -2.2805440425872803, + "loss": 1.4771, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.9008076190948486, + "rewards/margins": 0.6602801084518433, + "rewards/rejected": -4.5610880851745605, + "step": 8760 + }, + { + "epoch": 1.5110268780151621, + "grad_norm": 27.24603971533629, + "learning_rate": 8.565042897071607e-09, + "logits/chosen": -3.400085926055908, + "logits/rejected": -3.384584426879883, + "logps/chosen": -1.989044427871704, + "logps/rejected": -2.2306816577911377, + "loss": 1.5791, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.978088855743408, + "rewards/margins": 0.48327502608299255, + "rewards/rejected": -4.461363315582275, + "step": 8770 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 26.700752880971578, + "learning_rate": 8.508466435202402e-09, + "logits/chosen": -3.409217119216919, + "logits/rejected": -3.409679889678955, + "logps/chosen": -2.014639377593994, + "logps/rejected": -2.2706656455993652, + "loss": 1.6105, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.029278755187988, + "rewards/margins": 0.5120527148246765, + "rewards/rejected": -4.5413312911987305, + "step": 8780 + }, + { + "epoch": 1.5144727773949, + "grad_norm": 24.61591275920267, + "learning_rate": 8.452039107541043e-09, + "logits/chosen": -3.4245705604553223, + "logits/rejected": -3.4097092151641846, + "logps/chosen": -2.0039587020874023, + "logps/rejected": -2.2955374717712402, + "loss": 1.5402, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.007917404174805, + "rewards/margins": 0.5831576585769653, + "rewards/rejected": -4.5910749435424805, + "step": 8790 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 26.26916529279563, + "learning_rate": 8.395761424364193e-09, + "logits/chosen": -3.347684860229492, + "logits/rejected": -3.3297207355499268, + "logps/chosen": -1.9706910848617554, + "logps/rejected": -2.3306450843811035, + "loss": 1.4625, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9413821697235107, + "rewards/margins": 0.719907283782959, + "rewards/rejected": -4.661290168762207, + "step": 8800 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -3.4559524059295654, + "eval_logits/rejected": -3.452305316925049, + "eval_logps/chosen": -1.8347411155700684, + "eval_logps/rejected": -2.013340473175049, + "eval_loss": 1.61762535572052, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.6694822311401367, + "eval_rewards/margins": 0.3571985363960266, + "eval_rewards/rejected": -4.026680946350098, + "eval_runtime": 157.3905, + "eval_samples_per_second": 27.346, + "eval_steps_per_second": 3.418, + "step": 8800 + }, + { + "epoch": 1.5179186767746382, + "grad_norm": 28.062135052424527, + "learning_rate": 8.33963389459528e-09, + "logits/chosen": -3.44390869140625, + "logits/rejected": -3.4325485229492188, + "logps/chosen": -1.93198561668396, + "logps/rejected": -2.2114360332489014, + "loss": 1.4958, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.86397123336792, + "rewards/margins": 0.5589008331298828, + "rewards/rejected": -4.422872066497803, + "step": 8810 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 29.403538244533795, + "learning_rate": 8.283657025799872e-09, + "logits/chosen": -3.3957228660583496, + "logits/rejected": -3.3860347270965576, + "logps/chosen": -2.0003128051757812, + "logps/rejected": -2.2199902534484863, + "loss": 1.5973, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.0006256103515625, + "rewards/margins": 0.4393545687198639, + "rewards/rejected": -4.439980506896973, + "step": 8820 + }, + { + "epoch": 1.5213645761543764, + "grad_norm": 28.411575562250786, + "learning_rate": 8.227831324181108e-09, + "logits/chosen": -3.3160910606384277, + "logits/rejected": -3.3057663440704346, + "logps/chosen": -1.9990060329437256, + "logps/rejected": -2.234621524810791, + "loss": 1.5986, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.998012065887451, + "rewards/margins": 0.4712304472923279, + "rewards/rejected": -4.469243049621582, + "step": 8830 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 27.770787583664454, + "learning_rate": 8.172157294575107e-09, + "logits/chosen": -3.337235689163208, + "logits/rejected": -3.330984592437744, + "logps/chosen": -1.9760258197784424, + "logps/rejected": -2.2644717693328857, + "loss": 1.5096, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9520516395568848, + "rewards/margins": 0.5768924951553345, + "rewards/rejected": -4.5289435386657715, + "step": 8840 + }, + { + "epoch": 1.5248104755341143, + "grad_norm": 23.34466613552977, + "learning_rate": 8.116635440446401e-09, + "logits/chosen": -3.456653594970703, + "logits/rejected": -3.446545124053955, + "logps/chosen": -1.8929256200790405, + "logps/rejected": -2.1735408306121826, + "loss": 1.503, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.785851240158081, + "rewards/margins": 0.5612308979034424, + "rewards/rejected": -4.347081661224365, + "step": 8850 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 27.50412392424376, + "learning_rate": 8.061266263883404e-09, + "logits/chosen": -3.3937485218048096, + "logits/rejected": -3.381727695465088, + "logps/chosen": -1.9125213623046875, + "logps/rejected": -2.1820521354675293, + "loss": 1.4913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.825042724609375, + "rewards/margins": 0.5390619039535522, + "rewards/rejected": -4.364104270935059, + "step": 8860 + }, + { + "epoch": 1.5282563749138525, + "grad_norm": 22.30701243133975, + "learning_rate": 8.006050265593815e-09, + "logits/chosen": -3.4785289764404297, + "logits/rejected": -3.4605841636657715, + "logps/chosen": -1.9472230672836304, + "logps/rejected": -2.2302138805389404, + "loss": 1.5478, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.8944461345672607, + "rewards/margins": 0.5659812092781067, + "rewards/rejected": -4.460427761077881, + "step": 8870 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 25.958110648673614, + "learning_rate": 7.950987944900191e-09, + "logits/chosen": -3.3571701049804688, + "logits/rejected": -3.342996120452881, + "logps/chosen": -2.0334134101867676, + "logps/rejected": -2.2937817573547363, + "loss": 1.5792, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.066826820373535, + "rewards/margins": 0.5207368731498718, + "rewards/rejected": -4.587563514709473, + "step": 8880 + }, + { + "epoch": 1.5317022742935906, + "grad_norm": 27.83371019124868, + "learning_rate": 7.896079799735308e-09, + "logits/chosen": -3.3929152488708496, + "logits/rejected": -3.3789241313934326, + "logps/chosen": -1.9082889556884766, + "logps/rejected": -2.1398444175720215, + "loss": 1.5403, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.816577911376953, + "rewards/margins": 0.46311086416244507, + "rewards/rejected": -4.279688835144043, + "step": 8890 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 24.734769742531, + "learning_rate": 7.841326326637782e-09, + "logits/chosen": -3.4019572734832764, + "logits/rejected": -3.385303497314453, + "logps/chosen": -1.961182951927185, + "logps/rejected": -2.2766194343566895, + "loss": 1.5134, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.92236590385437, + "rewards/margins": 0.6308724284172058, + "rewards/rejected": -4.553238868713379, + "step": 8900 + }, + { + "epoch": 1.5334252239834596, + "eval_logits/chosen": -3.45566725730896, + "eval_logits/rejected": -3.4520249366760254, + "eval_logps/chosen": -1.8381030559539795, + "eval_logps/rejected": -2.017106294631958, + "eval_loss": 1.6170810461044312, + "eval_rewards/accuracies": 0.6208178400993347, + "eval_rewards/chosen": -3.676206111907959, + "eval_rewards/margins": 0.35800644755363464, + "eval_rewards/rejected": -4.034212589263916, + "eval_runtime": 157.6756, + "eval_samples_per_second": 27.297, + "eval_steps_per_second": 3.412, + "step": 8900 + }, + { + "epoch": 1.5351481736733288, + "grad_norm": 22.645520895101082, + "learning_rate": 7.786728020747463e-09, + "logits/chosen": -3.3814163208007812, + "logits/rejected": -3.373598098754883, + "logps/chosen": -2.0018677711486816, + "logps/rejected": -2.2561798095703125, + "loss": 1.6578, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.003735542297363, + "rewards/margins": 0.5086239576339722, + "rewards/rejected": -4.512359619140625, + "step": 8910 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 24.558545634066668, + "learning_rate": 7.732285375801039e-09, + "logits/chosen": -3.4347946643829346, + "logits/rejected": -3.416597366333008, + "logps/chosen": -2.030313014984131, + "logps/rejected": -2.371870517730713, + "loss": 1.4574, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.060626029968262, + "rewards/margins": 0.6831148266792297, + "rewards/rejected": -4.743741035461426, + "step": 8920 + }, + { + "epoch": 1.538594073053067, + "grad_norm": 23.95264025682932, + "learning_rate": 7.677998884127543e-09, + "logits/chosen": -3.4050192832946777, + "logits/rejected": -3.387233018875122, + "logps/chosen": -1.952326774597168, + "logps/rejected": -2.226639747619629, + "loss": 1.5485, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.904653549194336, + "rewards/margins": 0.5486257672309875, + "rewards/rejected": -4.453279495239258, + "step": 8930 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 26.05883600740083, + "learning_rate": 7.623869036643902e-09, + "logits/chosen": -3.3941650390625, + "logits/rejected": -3.3853180408477783, + "logps/chosen": -2.0001511573791504, + "logps/rejected": -2.2974281311035156, + "loss": 1.5215, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.000302314758301, + "rewards/margins": 0.5945541262626648, + "rewards/rejected": -4.594856262207031, + "step": 8940 + }, + { + "epoch": 1.5420399724328049, + "grad_norm": 29.655529896615057, + "learning_rate": 7.569896322850488e-09, + "logits/chosen": -3.335721969604492, + "logits/rejected": -3.334639072418213, + "logps/chosen": -1.9350265264511108, + "logps/rejected": -2.1575846672058105, + "loss": 1.5806, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.8700530529022217, + "rewards/margins": 0.44511571526527405, + "rewards/rejected": -4.315169334411621, + "step": 8950 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 23.36180951286622, + "learning_rate": 7.516081230826716e-09, + "logits/chosen": -3.412951946258545, + "logits/rejected": -3.400667667388916, + "logps/chosen": -2.0302627086639404, + "logps/rejected": -2.3069934844970703, + "loss": 1.534, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.060525417327881, + "rewards/margins": 0.5534616708755493, + "rewards/rejected": -4.613986968994141, + "step": 8960 + }, + { + "epoch": 1.545485871812543, + "grad_norm": 27.251654527750453, + "learning_rate": 7.462424247226607e-09, + "logits/chosen": -3.3761744499206543, + "logits/rejected": -3.363429546356201, + "logps/chosen": -1.9866615533828735, + "logps/rejected": -2.1794304847717285, + "loss": 1.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.973323106765747, + "rewards/margins": 0.38553738594055176, + "rewards/rejected": -4.358860969543457, + "step": 8970 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 27.024772479115086, + "learning_rate": 7.408925857274373e-09, + "logits/chosen": -3.4327149391174316, + "logits/rejected": -3.4161429405212402, + "logps/chosen": -2.026679515838623, + "logps/rejected": -2.3088765144348145, + "loss": 1.5469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.053359031677246, + "rewards/margins": 0.5643936395645142, + "rewards/rejected": -4.617753028869629, + "step": 8980 + }, + { + "epoch": 1.5489317711922812, + "grad_norm": 23.81268113292815, + "learning_rate": 7.355586544760109e-09, + "logits/chosen": -3.388331174850464, + "logits/rejected": -3.378638505935669, + "logps/chosen": -1.9388223886489868, + "logps/rejected": -2.1868948936462402, + "loss": 1.5321, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8776447772979736, + "rewards/margins": 0.49614542722702026, + "rewards/rejected": -4.3737897872924805, + "step": 8990 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 28.193018390385642, + "learning_rate": 7.302406792035298e-09, + "logits/chosen": -3.3974368572235107, + "logits/rejected": -3.3802618980407715, + "logps/chosen": -2.0405921936035156, + "logps/rejected": -2.294504404067993, + "loss": 1.5758, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.081184387207031, + "rewards/margins": 0.5078238248825073, + "rewards/rejected": -4.589008808135986, + "step": 9000 + }, + { + "epoch": 1.5506547208821502, + "eval_logits/chosen": -3.459662437438965, + "eval_logits/rejected": -3.456043243408203, + "eval_logps/chosen": -1.8396071195602417, + "eval_logps/rejected": -2.0189428329467773, + "eval_loss": 1.6168333292007446, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.6792142391204834, + "eval_rewards/margins": 0.35867178440093994, + "eval_rewards/rejected": -4.037885665893555, + "eval_runtime": 157.5774, + "eval_samples_per_second": 27.314, + "eval_steps_per_second": 3.414, + "step": 9000 + }, + { + "epoch": 1.5523776705720191, + "grad_norm": 29.10595505446765, + "learning_rate": 7.249387080008551e-09, + "logits/chosen": -3.384018659591675, + "logits/rejected": -3.3711326122283936, + "logps/chosen": -1.97023606300354, + "logps/rejected": -2.228088855743408, + "loss": 1.5598, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.94047212600708, + "rewards/margins": 0.5157051682472229, + "rewards/rejected": -4.456177711486816, + "step": 9010 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 23.368622070117414, + "learning_rate": 7.196527888141199e-09, + "logits/chosen": -3.350865125656128, + "logits/rejected": -3.3418736457824707, + "logps/chosen": -2.0243988037109375, + "logps/rejected": -2.336397647857666, + "loss": 1.5183, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.048797607421875, + "rewards/margins": 0.623997151851654, + "rewards/rejected": -4.672795295715332, + "step": 9020 + }, + { + "epoch": 1.5558235699517575, + "grad_norm": 27.233884544152613, + "learning_rate": 7.14382969444299e-09, + "logits/chosen": -3.354069471359253, + "logits/rejected": -3.3562285900115967, + "logps/chosen": -2.0174779891967773, + "logps/rejected": -2.1930603981018066, + "loss": 1.6835, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.034955978393555, + "rewards/margins": 0.35116511583328247, + "rewards/rejected": -4.386120796203613, + "step": 9030 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 24.259057399277445, + "learning_rate": 7.091292975467744e-09, + "logits/chosen": -3.3651835918426514, + "logits/rejected": -3.3529579639434814, + "logps/chosen": -1.9068677425384521, + "logps/rejected": -2.148203134536743, + "loss": 1.5766, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.8137354850769043, + "rewards/margins": 0.48267069458961487, + "rewards/rejected": -4.296406269073486, + "step": 9040 + }, + { + "epoch": 1.5592694693314955, + "grad_norm": 28.231666208421885, + "learning_rate": 7.038918206309061e-09, + "logits/chosen": -3.3883309364318848, + "logits/rejected": -3.3790218830108643, + "logps/chosen": -2.062511444091797, + "logps/rejected": -2.3295905590057373, + "loss": 1.598, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.125022888183594, + "rewards/margins": 0.5341578722000122, + "rewards/rejected": -4.659181118011475, + "step": 9050 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 27.727286502168237, + "learning_rate": 6.986705860596004e-09, + "logits/chosen": -3.396008253097534, + "logits/rejected": -3.382503032684326, + "logps/chosen": -2.0013046264648438, + "logps/rejected": -2.2741687297821045, + "loss": 1.5383, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.0026092529296875, + "rewards/margins": 0.5457279086112976, + "rewards/rejected": -4.548337459564209, + "step": 9060 + }, + { + "epoch": 1.5627153687112336, + "grad_norm": 27.78760145119851, + "learning_rate": 6.934656410488848e-09, + "logits/chosen": -3.379351854324341, + "logits/rejected": -3.3690693378448486, + "logps/chosen": -1.9849498271942139, + "logps/rejected": -2.2790093421936035, + "loss": 1.507, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9698996543884277, + "rewards/margins": 0.5881190896034241, + "rewards/rejected": -4.558018684387207, + "step": 9070 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 24.06327454767885, + "learning_rate": 6.882770326674752e-09, + "logits/chosen": -3.3698318004608154, + "logits/rejected": -3.369764804840088, + "logps/chosen": -1.9418232440948486, + "logps/rejected": -2.2121336460113525, + "loss": 1.5506, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8836464881896973, + "rewards/margins": 0.5406206846237183, + "rewards/rejected": -4.424267292022705, + "step": 9080 + }, + { + "epoch": 1.5661612680909718, + "grad_norm": 26.12519607675819, + "learning_rate": 6.831048078363602e-09, + "logits/chosen": -3.3632705211639404, + "logits/rejected": -3.3456485271453857, + "logps/chosen": -1.997636079788208, + "logps/rejected": -2.2432701587677, + "loss": 1.5615, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.995272159576416, + "rewards/margins": 0.4912680983543396, + "rewards/rejected": -4.4865403175354, + "step": 9090 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 30.60057695261972, + "learning_rate": 6.779490133283638e-09, + "logits/chosen": -3.399038314819336, + "logits/rejected": -3.384644031524658, + "logps/chosen": -2.0249524116516113, + "logps/rejected": -2.216341018676758, + "loss": 1.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.049904823303223, + "rewards/margins": 0.38277676701545715, + "rewards/rejected": -4.432682037353516, + "step": 9100 + }, + { + "epoch": 1.5678842177808407, + "eval_logits/chosen": -3.4513652324676514, + "eval_logits/rejected": -3.4477076530456543, + "eval_logps/chosen": -1.8408371210098267, + "eval_logps/rejected": -2.020103931427002, + "eval_loss": 1.6168689727783203, + "eval_rewards/accuracies": 0.6226765513420105, + "eval_rewards/chosen": -3.6816742420196533, + "eval_rewards/margins": 0.3585336208343506, + "eval_rewards/rejected": -4.040207862854004, + "eval_runtime": 157.7423, + "eval_samples_per_second": 27.285, + "eval_steps_per_second": 3.411, + "step": 9100 + }, + { + "epoch": 1.5696071674707097, + "grad_norm": 23.057645488509806, + "learning_rate": 6.72809695767736e-09, + "logits/chosen": -3.4094815254211426, + "logits/rejected": -3.3946213722229004, + "logps/chosen": -1.9660415649414062, + "logps/rejected": -2.2856812477111816, + "loss": 1.5096, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9320831298828125, + "rewards/margins": 0.6392796635627747, + "rewards/rejected": -4.571362495422363, + "step": 9110 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 24.272184322875813, + "learning_rate": 6.676869016297179e-09, + "logits/chosen": -3.4148666858673096, + "logits/rejected": -3.3957276344299316, + "logps/chosen": -1.9689308404922485, + "logps/rejected": -2.2667717933654785, + "loss": 1.4919, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.937861680984497, + "rewards/margins": 0.5956820845603943, + "rewards/rejected": -4.533543586730957, + "step": 9120 + }, + { + "epoch": 1.573053066850448, + "grad_norm": 25.068779371540106, + "learning_rate": 6.625806772401346e-09, + "logits/chosen": -3.3600242137908936, + "logits/rejected": -3.347273588180542, + "logps/chosen": -1.9701874256134033, + "logps/rejected": -2.1995785236358643, + "loss": 1.587, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9403748512268066, + "rewards/margins": 0.458781898021698, + "rewards/rejected": -4.3991570472717285, + "step": 9130 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 22.64214567526691, + "learning_rate": 6.574910687749641e-09, + "logits/chosen": -3.409505844116211, + "logits/rejected": -3.380748748779297, + "logps/chosen": -1.880601167678833, + "logps/rejected": -2.231144666671753, + "loss": 1.4259, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.761202335357666, + "rewards/margins": 0.7010871171951294, + "rewards/rejected": -4.462289333343506, + "step": 9140 + }, + { + "epoch": 1.576498966230186, + "grad_norm": 31.123348905571255, + "learning_rate": 6.524181222599282e-09, + "logits/chosen": -3.4019031524658203, + "logits/rejected": -3.3878674507141113, + "logps/chosen": -2.0236306190490723, + "logps/rejected": -2.3806300163269043, + "loss": 1.5617, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.0472612380981445, + "rewards/margins": 0.713998556137085, + "rewards/rejected": -4.761260032653809, + "step": 9150 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 33.22803574664989, + "learning_rate": 6.473618835700731e-09, + "logits/chosen": -3.4080681800842285, + "logits/rejected": -3.405228853225708, + "logps/chosen": -2.0078952312469482, + "logps/rejected": -2.259218692779541, + "loss": 1.5622, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.0157904624938965, + "rewards/margins": 0.5026463270187378, + "rewards/rejected": -4.518437385559082, + "step": 9160 + }, + { + "epoch": 1.5799448656099242, + "grad_norm": 23.410733392851263, + "learning_rate": 6.4232239842935434e-09, + "logits/chosen": -3.3997905254364014, + "logits/rejected": -3.383671522140503, + "logps/chosen": -2.082437753677368, + "logps/rejected": -2.33508038520813, + "loss": 1.6225, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.164875507354736, + "rewards/margins": 0.5052854418754578, + "rewards/rejected": -4.67016077041626, + "step": 9170 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 24.816904361035135, + "learning_rate": 6.372997124102245e-09, + "logits/chosen": -3.3951804637908936, + "logits/rejected": -3.382948637008667, + "logps/chosen": -2.0069243907928467, + "logps/rejected": -2.294646978378296, + "loss": 1.5296, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.013848781585693, + "rewards/margins": 0.5754456520080566, + "rewards/rejected": -4.589293956756592, + "step": 9180 + }, + { + "epoch": 1.5833907649896624, + "grad_norm": 25.812398217355323, + "learning_rate": 6.3229387093321955e-09, + "logits/chosen": -3.4635708332061768, + "logits/rejected": -3.4590096473693848, + "logps/chosen": -1.9890754222869873, + "logps/rejected": -2.224609136581421, + "loss": 1.5876, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9781508445739746, + "rewards/margins": 0.4710671007633209, + "rewards/rejected": -4.449218273162842, + "step": 9190 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 28.102572665150653, + "learning_rate": 6.273049192665503e-09, + "logits/chosen": -3.410214900970459, + "logits/rejected": -3.398358106613159, + "logps/chosen": -2.0300934314727783, + "logps/rejected": -2.282491683959961, + "loss": 1.5434, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.060186862945557, + "rewards/margins": 0.5047963857650757, + "rewards/rejected": -4.564983367919922, + "step": 9200 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -3.4575252532958984, + "eval_logits/rejected": -3.4539077281951904, + "eval_logps/chosen": -1.8418513536453247, + "eval_logps/rejected": -2.0210683345794678, + "eval_loss": 1.617126703262329, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.6837027072906494, + "eval_rewards/margins": 0.35843366384506226, + "eval_rewards/rejected": -4.0421366691589355, + "eval_runtime": 157.7444, + "eval_samples_per_second": 27.285, + "eval_steps_per_second": 3.411, + "step": 9200 + }, + { + "epoch": 1.5868366643694003, + "grad_norm": 27.314875627670414, + "learning_rate": 6.223329025256896e-09, + "logits/chosen": -3.322859525680542, + "logits/rejected": -3.3110861778259277, + "logps/chosen": -2.0635874271392822, + "logps/rejected": -2.3346166610717773, + "loss": 1.595, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.1271748542785645, + "rewards/margins": 0.5420587658882141, + "rewards/rejected": -4.669233322143555, + "step": 9210 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 29.213225562712747, + "learning_rate": 6.173778656729678e-09, + "logits/chosen": -3.3913254737854004, + "logits/rejected": -3.3788421154022217, + "logps/chosen": -1.9684524536132812, + "logps/rejected": -2.246532440185547, + "loss": 1.5306, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9369049072265625, + "rewards/margins": 0.556160569190979, + "rewards/rejected": -4.493064880371094, + "step": 9220 + }, + { + "epoch": 1.5902825637491387, + "grad_norm": 26.828866845866724, + "learning_rate": 6.124398535171654e-09, + "logits/chosen": -3.3348209857940674, + "logits/rejected": -3.328462600708008, + "logps/chosen": -1.9279639720916748, + "logps/rejected": -2.211570978164673, + "loss": 1.4875, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8559279441833496, + "rewards/margins": 0.5672137141227722, + "rewards/rejected": -4.423141956329346, + "step": 9230 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 21.172188113329227, + "learning_rate": 6.075189107131059e-09, + "logits/chosen": -3.3528149127960205, + "logits/rejected": -3.3480567932128906, + "logps/chosen": -2.0422444343566895, + "logps/rejected": -2.2171826362609863, + "loss": 1.71, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.084488868713379, + "rewards/margins": 0.3498762249946594, + "rewards/rejected": -4.434365272521973, + "step": 9240 + }, + { + "epoch": 1.5937284631288766, + "grad_norm": 21.029164268489943, + "learning_rate": 6.026150817612544e-09, + "logits/chosen": -3.370058059692383, + "logits/rejected": -3.357508420944214, + "logps/chosen": -1.8985515832901, + "logps/rejected": -2.2947030067443848, + "loss": 1.4872, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.7971031665802, + "rewards/margins": 0.7923027873039246, + "rewards/rejected": -4.5894060134887695, + "step": 9250 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 29.274649460717477, + "learning_rate": 5.977284110073136e-09, + "logits/chosen": -3.357381820678711, + "logits/rejected": -3.3468868732452393, + "logps/chosen": -2.0351369380950928, + "logps/rejected": -2.354037046432495, + "loss": 1.4718, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.0702738761901855, + "rewards/margins": 0.6378002166748047, + "rewards/rejected": -4.70807409286499, + "step": 9260 + }, + { + "epoch": 1.5971743625086148, + "grad_norm": 25.250685662971957, + "learning_rate": 5.928589426418234e-09, + "logits/chosen": -3.451566219329834, + "logits/rejected": -3.433729648590088, + "logps/chosen": -2.0068719387054443, + "logps/rejected": -2.338134288787842, + "loss": 1.4725, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.013743877410889, + "rewards/margins": 0.6625255346298218, + "rewards/rejected": -4.676268577575684, + "step": 9270 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 25.65937609478678, + "learning_rate": 5.880067206997611e-09, + "logits/chosen": -3.3905422687530518, + "logits/rejected": -3.3812294006347656, + "logps/chosen": -1.9552417993545532, + "logps/rejected": -2.241483211517334, + "loss": 1.4952, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9104835987091064, + "rewards/margins": 0.5724822282791138, + "rewards/rejected": -4.482966423034668, + "step": 9280 + }, + { + "epoch": 1.600620261888353, + "grad_norm": 24.370798633841655, + "learning_rate": 5.831717890601434e-09, + "logits/chosen": -3.3270785808563232, + "logits/rejected": -3.315871000289917, + "logps/chosen": -2.0054831504821777, + "logps/rejected": -2.227461099624634, + "loss": 1.6002, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.0109663009643555, + "rewards/margins": 0.4439551830291748, + "rewards/rejected": -4.454922199249268, + "step": 9290 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 33.45612144445083, + "learning_rate": 5.7835419144563e-09, + "logits/chosen": -3.3847250938415527, + "logits/rejected": -3.3817343711853027, + "logps/chosen": -2.118396520614624, + "logps/rejected": -2.3421573638916016, + "loss": 1.6069, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.236793041229248, + "rewards/margins": 0.4475215971469879, + "rewards/rejected": -4.684314727783203, + "step": 9300 + }, + { + "epoch": 1.602343211578222, + "eval_logits/chosen": -3.4530177116394043, + "eval_logits/rejected": -3.4493794441223145, + "eval_logps/chosen": -1.8417394161224365, + "eval_logps/rejected": -2.0213191509246826, + "eval_loss": 1.6168341636657715, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.683478832244873, + "eval_rewards/margins": 0.3591594994068146, + "eval_rewards/rejected": -4.042638301849365, + "eval_runtime": 157.4154, + "eval_samples_per_second": 27.342, + "eval_steps_per_second": 3.418, + "step": 9300 + }, + { + "epoch": 1.6040661612680909, + "grad_norm": 28.88698994129908, + "learning_rate": 5.7355397142212495e-09, + "logits/chosen": -3.396108627319336, + "logits/rejected": -3.3789000511169434, + "logps/chosen": -2.0037143230438232, + "logps/rejected": -2.315995931625366, + "loss": 1.4808, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.0074286460876465, + "rewards/margins": 0.6245633959770203, + "rewards/rejected": -4.631991863250732, + "step": 9310 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 23.237751615130577, + "learning_rate": 5.687711723983907e-09, + "logits/chosen": -3.4338138103485107, + "logits/rejected": -3.423569440841675, + "logps/chosen": -2.0041418075561523, + "logps/rejected": -2.3365399837493896, + "loss": 1.4357, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.008283615112305, + "rewards/margins": 0.664795994758606, + "rewards/rejected": -4.673079967498779, + "step": 9320 + }, + { + "epoch": 1.607512060647829, + "grad_norm": 31.22086928585365, + "learning_rate": 5.640058376256437e-09, + "logits/chosen": -3.402078151702881, + "logits/rejected": -3.38782000541687, + "logps/chosen": -2.066725015640259, + "logps/rejected": -2.332266092300415, + "loss": 1.5619, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.133450031280518, + "rewards/margins": 0.5310825705528259, + "rewards/rejected": -4.66453218460083, + "step": 9330 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 22.394232750630472, + "learning_rate": 5.592580101971764e-09, + "logits/chosen": -3.3568739891052246, + "logits/rejected": -3.35017466545105, + "logps/chosen": -2.070354461669922, + "logps/rejected": -2.29188871383667, + "loss": 1.5995, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.140708923339844, + "rewards/margins": 0.443067729473114, + "rewards/rejected": -4.58377742767334, + "step": 9340 + }, + { + "epoch": 1.6109579600275672, + "grad_norm": 25.359215659271083, + "learning_rate": 5.545277330479558e-09, + "logits/chosen": -3.3939404487609863, + "logits/rejected": -3.378258228302002, + "logps/chosen": -2.0078938007354736, + "logps/rejected": -2.3154680728912354, + "loss": 1.4873, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.015787601470947, + "rewards/margins": 0.6151489615440369, + "rewards/rejected": -4.630936145782471, + "step": 9350 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 34.59469808984219, + "learning_rate": 5.498150489542428e-09, + "logits/chosen": -3.431382656097412, + "logits/rejected": -3.4124226570129395, + "logps/chosen": -1.944173812866211, + "logps/rejected": -2.2736570835113525, + "loss": 1.4824, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.888347625732422, + "rewards/margins": 0.6589663624763489, + "rewards/rejected": -4.547314167022705, + "step": 9360 + }, + { + "epoch": 1.6144038594073054, + "grad_norm": 19.619818264012785, + "learning_rate": 5.4512000053320264e-09, + "logits/chosen": -3.44238543510437, + "logits/rejected": -3.4260878562927246, + "logps/chosen": -1.961296796798706, + "logps/rejected": -2.2417988777160645, + "loss": 1.5262, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.922593593597412, + "rewards/margins": 0.5610038638114929, + "rewards/rejected": -4.483597755432129, + "step": 9370 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 28.676271718461553, + "learning_rate": 5.4044263024251994e-09, + "logits/chosen": -3.448455333709717, + "logits/rejected": -3.4393436908721924, + "logps/chosen": -1.9728612899780273, + "logps/rejected": -2.171426296234131, + "loss": 1.673, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.9457225799560547, + "rewards/margins": 0.39712995290756226, + "rewards/rejected": -4.342852592468262, + "step": 9380 + }, + { + "epoch": 1.6178497587870435, + "grad_norm": 28.71183845362896, + "learning_rate": 5.3578298038001375e-09, + "logits/chosen": -3.3019280433654785, + "logits/rejected": -3.2948784828186035, + "logps/chosen": -2.0483555793762207, + "logps/rejected": -2.313016176223755, + "loss": 1.5294, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.096711158752441, + "rewards/margins": 0.5293210744857788, + "rewards/rejected": -4.62603235244751, + "step": 9390 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 24.458952020443537, + "learning_rate": 5.311410930832574e-09, + "logits/chosen": -3.361856460571289, + "logits/rejected": -3.353131055831909, + "logps/chosen": -1.9678901433944702, + "logps/rejected": -2.1995654106140137, + "loss": 1.5762, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9357802867889404, + "rewards/margins": 0.46335020661354065, + "rewards/rejected": -4.399130821228027, + "step": 9400 + }, + { + "epoch": 1.6195727084769125, + "eval_logits/chosen": -3.447918653488159, + "eval_logits/rejected": -3.4442591667175293, + "eval_logps/chosen": -1.8428826332092285, + "eval_logps/rejected": -2.022775411605835, + "eval_loss": 1.616538166999817, + "eval_rewards/accuracies": 0.6226765513420105, + "eval_rewards/chosen": -3.685765266418457, + "eval_rewards/margins": 0.35978561639785767, + "eval_rewards/rejected": -4.04555082321167, + "eval_runtime": 157.2904, + "eval_samples_per_second": 27.363, + "eval_steps_per_second": 3.42, + "step": 9400 + }, + { + "epoch": 1.6212956581667815, + "grad_norm": 27.352654851355044, + "learning_rate": 5.265170103291952e-09, + "logits/chosen": -3.377066135406494, + "logits/rejected": -3.366779327392578, + "logps/chosen": -2.0785205364227295, + "logps/rejected": -2.336418390274048, + "loss": 1.6539, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.157041072845459, + "rewards/margins": 0.5157960057258606, + "rewards/rejected": -4.672836780548096, + "step": 9410 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 26.725861321741775, + "learning_rate": 5.219107739337616e-09, + "logits/chosen": -3.4047577381134033, + "logits/rejected": -3.3945765495300293, + "logps/chosen": -2.099597454071045, + "logps/rejected": -2.2650179862976074, + "loss": 1.6411, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.19919490814209, + "rewards/margins": 0.33084067702293396, + "rewards/rejected": -4.530035972595215, + "step": 9420 + }, + { + "epoch": 1.6247415575465196, + "grad_norm": 27.140255248344374, + "learning_rate": 5.173224255515099e-09, + "logits/chosen": -3.387317657470703, + "logits/rejected": -3.3822522163391113, + "logps/chosen": -2.0175862312316895, + "logps/rejected": -2.172361135482788, + "loss": 1.6981, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.035172462463379, + "rewards/margins": 0.30954989790916443, + "rewards/rejected": -4.344722270965576, + "step": 9430 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 30.064066409288404, + "learning_rate": 5.127520066752256e-09, + "logits/chosen": -3.3840835094451904, + "logits/rejected": -3.3801932334899902, + "logps/chosen": -2.0160694122314453, + "logps/rejected": -2.3217813968658447, + "loss": 1.5397, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.032138824462891, + "rewards/margins": 0.6114238500595093, + "rewards/rejected": -4.6435627937316895, + "step": 9440 + }, + { + "epoch": 1.6281874569262578, + "grad_norm": 25.74205540890287, + "learning_rate": 5.081995586355592e-09, + "logits/chosen": -3.4516549110412598, + "logits/rejected": -3.44805645942688, + "logps/chosen": -2.0959725379943848, + "logps/rejected": -2.2452540397644043, + "loss": 1.735, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.1919450759887695, + "rewards/margins": 0.29856228828430176, + "rewards/rejected": -4.490508079528809, + "step": 9450 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 29.603157863836643, + "learning_rate": 5.0366512260064885e-09, + "logits/chosen": -3.359147310256958, + "logits/rejected": -3.3550238609313965, + "logps/chosen": -2.004164695739746, + "logps/rejected": -2.3392276763916016, + "loss": 1.4731, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.008329391479492, + "rewards/margins": 0.6701260805130005, + "rewards/rejected": -4.678455352783203, + "step": 9460 + }, + { + "epoch": 1.6316333563059957, + "grad_norm": 28.47590048096695, + "learning_rate": 4.99148739575749e-09, + "logits/chosen": -3.2774970531463623, + "logits/rejected": -3.259338855743408, + "logps/chosen": -2.018129825592041, + "logps/rejected": -2.268707752227783, + "loss": 1.5827, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.036259651184082, + "rewards/margins": 0.5011555552482605, + "rewards/rejected": -4.537415504455566, + "step": 9470 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 31.008474620564034, + "learning_rate": 4.94650450402859e-09, + "logits/chosen": -3.363196611404419, + "logits/rejected": -3.345864772796631, + "logps/chosen": -2.0971293449401855, + "logps/rejected": -2.425625801086426, + "loss": 1.4543, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.194258689880371, + "rewards/margins": 0.6569935083389282, + "rewards/rejected": -4.851251602172852, + "step": 9480 + }, + { + "epoch": 1.635079255685734, + "grad_norm": 26.77415530114236, + "learning_rate": 4.90170295760354e-09, + "logits/chosen": -3.3716769218444824, + "logits/rejected": -3.3606808185577393, + "logps/chosen": -1.9748363494873047, + "logps/rejected": -2.247223377227783, + "loss": 1.5422, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9496726989746094, + "rewards/margins": 0.5447738766670227, + "rewards/rejected": -4.494446754455566, + "step": 9490 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 21.787458181516328, + "learning_rate": 4.857083161626174e-09, + "logits/chosen": -3.386723041534424, + "logits/rejected": -3.3780601024627686, + "logps/chosen": -2.008662223815918, + "logps/rejected": -2.2889654636383057, + "loss": 1.5365, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.017324447631836, + "rewards/margins": 0.5606063604354858, + "rewards/rejected": -4.577930927276611, + "step": 9500 + }, + { + "epoch": 1.636802205375603, + "eval_logits/chosen": -3.458066701889038, + "eval_logits/rejected": -3.4544572830200195, + "eval_logps/chosen": -1.8431885242462158, + "eval_logps/rejected": -2.022937774658203, + "eval_loss": 1.616588830947876, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.6863770484924316, + "eval_rewards/margins": 0.3594987094402313, + "eval_rewards/rejected": -4.045875549316406, + "eval_runtime": 157.3756, + "eval_samples_per_second": 27.349, + "eval_steps_per_second": 3.419, + "step": 9500 + }, + { + "epoch": 1.638525155065472, + "grad_norm": 29.812415546293302, + "learning_rate": 4.812645519596748e-09, + "logits/chosen": -3.334829807281494, + "logits/rejected": -3.3274903297424316, + "logps/chosen": -1.9621950387954712, + "logps/rejected": -2.2649292945861816, + "loss": 1.5088, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9243900775909424, + "rewards/margins": 0.6054679155349731, + "rewards/rejected": -4.529858589172363, + "step": 9510 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 26.533211617220594, + "learning_rate": 4.768390433368272e-09, + "logits/chosen": -3.457014560699463, + "logits/rejected": -3.450606107711792, + "logps/chosen": -1.999945044517517, + "logps/rejected": -2.3875722885131836, + "loss": 1.3998, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.999890089035034, + "rewards/margins": 0.7752548456192017, + "rewards/rejected": -4.775144577026367, + "step": 9520 + }, + { + "epoch": 1.6419710544452102, + "grad_norm": 25.627754013691185, + "learning_rate": 4.72431830314291e-09, + "logits/chosen": -3.401480197906494, + "logits/rejected": -3.390071392059326, + "logps/chosen": -1.9792699813842773, + "logps/rejected": -2.322640895843506, + "loss": 1.5235, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9585399627685547, + "rewards/margins": 0.6867424845695496, + "rewards/rejected": -4.645281791687012, + "step": 9530 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 24.067638926349087, + "learning_rate": 4.680429527468311e-09, + "logits/chosen": -3.368239641189575, + "logits/rejected": -3.3556246757507324, + "logps/chosen": -2.01406192779541, + "logps/rejected": -2.3475120067596436, + "loss": 1.4731, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.02812385559082, + "rewards/margins": 0.6669005155563354, + "rewards/rejected": -4.695024013519287, + "step": 9540 + }, + { + "epoch": 1.6454169538249483, + "grad_norm": 26.51855079473667, + "learning_rate": 4.636724503234074e-09, + "logits/chosen": -3.397850751876831, + "logits/rejected": -3.3986778259277344, + "logps/chosen": -2.028869867324829, + "logps/rejected": -2.2952098846435547, + "loss": 1.5615, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.057739734649658, + "rewards/margins": 0.5326803922653198, + "rewards/rejected": -4.590419769287109, + "step": 9550 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 25.208953911096994, + "learning_rate": 4.593203625668077e-09, + "logits/chosen": -3.4253182411193848, + "logits/rejected": -3.4193968772888184, + "logps/chosen": -1.9727824926376343, + "logps/rejected": -2.195401906967163, + "loss": 1.5999, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9455649852752686, + "rewards/margins": 0.4452383518218994, + "rewards/rejected": -4.390803813934326, + "step": 9560 + }, + { + "epoch": 1.6488628532046863, + "grad_norm": 23.687829642295572, + "learning_rate": 4.549867288332987e-09, + "logits/chosen": -3.382124423980713, + "logits/rejected": -3.373439073562622, + "logps/chosen": -2.007606267929077, + "logps/rejected": -2.274506092071533, + "loss": 1.5317, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.015212535858154, + "rewards/margins": 0.5337998270988464, + "rewards/rejected": -4.549012184143066, + "step": 9570 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 24.208646207645277, + "learning_rate": 4.506715883122628e-09, + "logits/chosen": -3.4240787029266357, + "logits/rejected": -3.4151813983917236, + "logps/chosen": -2.029395341873169, + "logps/rejected": -2.295823335647583, + "loss": 1.552, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.058790683746338, + "rewards/margins": 0.5328559279441833, + "rewards/rejected": -4.591646671295166, + "step": 9580 + }, + { + "epoch": 1.6523087525844247, + "grad_norm": 30.025224024886192, + "learning_rate": 4.463749800258479e-09, + "logits/chosen": -3.4658255577087402, + "logits/rejected": -3.455538272857666, + "logps/chosen": -2.0401244163513184, + "logps/rejected": -2.2941622734069824, + "loss": 1.5955, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.080248832702637, + "rewards/margins": 0.508075475692749, + "rewards/rejected": -4.588324546813965, + "step": 9590 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 23.842556650395704, + "learning_rate": 4.420969428286139e-09, + "logits/chosen": -3.3668007850646973, + "logits/rejected": -3.3484439849853516, + "logps/chosen": -1.9667737483978271, + "logps/rejected": -2.240955114364624, + "loss": 1.5801, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.9335474967956543, + "rewards/margins": 0.5483629107475281, + "rewards/rejected": -4.481910228729248, + "step": 9600 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -3.4533019065856934, + "eval_logits/rejected": -3.449674129486084, + "eval_logps/chosen": -1.8432886600494385, + "eval_logps/rejected": -2.0229451656341553, + "eval_loss": 1.6167532205581665, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.686577320098877, + "eval_rewards/margins": 0.3593129515647888, + "eval_rewards/rejected": -4.0458903312683105, + "eval_runtime": 157.3108, + "eval_samples_per_second": 27.36, + "eval_steps_per_second": 3.42, + "step": 9600 + }, + { + "epoch": 1.6557546519641626, + "grad_norm": 26.57511819516294, + "learning_rate": 4.3783751540718065e-09, + "logits/chosen": -3.3814780712127686, + "logits/rejected": -3.3690991401672363, + "logps/chosen": -1.974647879600525, + "logps/rejected": -2.2723212242126465, + "loss": 1.5453, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.94929575920105, + "rewards/margins": 0.5953468680381775, + "rewards/rejected": -4.544642448425293, + "step": 9610 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 24.620120031818757, + "learning_rate": 4.335967362798787e-09, + "logits/chosen": -3.4539589881896973, + "logits/rejected": -3.451262950897217, + "logps/chosen": -2.0212087631225586, + "logps/rejected": -2.210979461669922, + "loss": 1.6825, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.042417526245117, + "rewards/margins": 0.3795414865016937, + "rewards/rejected": -4.421958923339844, + "step": 9620 + }, + { + "epoch": 1.6592005513439008, + "grad_norm": 24.627988420718033, + "learning_rate": 4.2937464379639824e-09, + "logits/chosen": -3.3973236083984375, + "logits/rejected": -3.381181240081787, + "logps/chosen": -2.050283432006836, + "logps/rejected": -2.2766592502593994, + "loss": 1.5937, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.100566864013672, + "rewards/margins": 0.4527515470981598, + "rewards/rejected": -4.553318500518799, + "step": 9630 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 26.440369459445296, + "learning_rate": 4.251712761374499e-09, + "logits/chosen": -3.4433906078338623, + "logits/rejected": -3.4261956214904785, + "logps/chosen": -2.030791759490967, + "logps/rejected": -2.289353847503662, + "loss": 1.5837, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.061583518981934, + "rewards/margins": 0.5171231031417847, + "rewards/rejected": -4.578707695007324, + "step": 9640 + }, + { + "epoch": 1.662646450723639, + "grad_norm": 28.348872255095912, + "learning_rate": 4.209866713144078e-09, + "logits/chosen": -3.3535187244415283, + "logits/rejected": -3.3430256843566895, + "logps/chosen": -2.113848924636841, + "logps/rejected": -2.312478542327881, + "loss": 1.6375, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.227697849273682, + "rewards/margins": 0.39725902676582336, + "rewards/rejected": -4.624957084655762, + "step": 9650 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 29.415754014624635, + "learning_rate": 4.1682086716897824e-09, + "logits/chosen": -3.379497528076172, + "logits/rejected": -3.3754146099090576, + "logps/chosen": -1.9367421865463257, + "logps/rejected": -2.168555736541748, + "loss": 1.5921, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8734843730926514, + "rewards/margins": 0.4636271893978119, + "rewards/rejected": -4.337111473083496, + "step": 9660 + }, + { + "epoch": 1.6660923501033769, + "grad_norm": 25.247696097184534, + "learning_rate": 4.1267390137284725e-09, + "logits/chosen": -3.424443483352661, + "logits/rejected": -3.408032178878784, + "logps/chosen": -1.9714887142181396, + "logps/rejected": -2.249882698059082, + "loss": 1.5439, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9429774284362793, + "rewards/margins": 0.5567878484725952, + "rewards/rejected": -4.499765396118164, + "step": 9670 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 26.898293875845077, + "learning_rate": 4.0854581142734625e-09, + "logits/chosen": -3.4011292457580566, + "logits/rejected": -3.3918089866638184, + "logps/chosen": -2.069446325302124, + "logps/rejected": -2.1919503211975098, + "loss": 1.7393, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.138892650604248, + "rewards/margins": 0.24500791728496552, + "rewards/rejected": -4.3839006423950195, + "step": 9680 + }, + { + "epoch": 1.6695382494831152, + "grad_norm": 29.03894384501455, + "learning_rate": 4.044366346631107e-09, + "logits/chosen": -3.332477569580078, + "logits/rejected": -3.321429491043091, + "logps/chosen": -2.04760479927063, + "logps/rejected": -2.2741804122924805, + "loss": 1.6173, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.09520959854126, + "rewards/margins": 0.4531516432762146, + "rewards/rejected": -4.548360824584961, + "step": 9690 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 24.015809715839374, + "learning_rate": 4.003464082397421e-09, + "logits/chosen": -3.3757903575897217, + "logits/rejected": -3.358320713043213, + "logps/chosen": -1.9648540019989014, + "logps/rejected": -2.2921643257141113, + "loss": 1.4796, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9297080039978027, + "rewards/margins": 0.6546201705932617, + "rewards/rejected": -4.584328651428223, + "step": 9700 + }, + { + "epoch": 1.6712611991729842, + "eval_logits/chosen": -3.458920955657959, + "eval_logits/rejected": -3.455321788787842, + "eval_logps/chosen": -1.843773365020752, + "eval_logps/rejected": -2.0231881141662598, + "eval_loss": 1.6170227527618408, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.687546730041504, + "eval_rewards/margins": 0.3588295578956604, + "eval_rewards/rejected": -4.0463762283325195, + "eval_runtime": 157.4902, + "eval_samples_per_second": 27.329, + "eval_steps_per_second": 3.416, + "step": 9700 + }, + { + "epoch": 1.6729841488628532, + "grad_norm": 28.799901400790148, + "learning_rate": 3.9627516914547295e-09, + "logits/chosen": -3.2980926036834717, + "logits/rejected": -3.2875049114227295, + "logps/chosen": -2.0383944511413574, + "logps/rejected": -2.2527966499328613, + "loss": 1.6713, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.076788902282715, + "rewards/margins": 0.428804486989975, + "rewards/rejected": -4.505593299865723, + "step": 9710 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 28.387349479644534, + "learning_rate": 3.922229541968322e-09, + "logits/chosen": -3.4008171558380127, + "logits/rejected": -3.3930504322052, + "logps/chosen": -1.9901485443115234, + "logps/rejected": -2.1876189708709717, + "loss": 1.6228, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.980297088623047, + "rewards/margins": 0.39494091272354126, + "rewards/rejected": -4.375237941741943, + "step": 9720 + }, + { + "epoch": 1.6764300482425913, + "grad_norm": 25.464480245141136, + "learning_rate": 3.8818980003831155e-09, + "logits/chosen": -3.373661756515503, + "logits/rejected": -3.3642220497131348, + "logps/chosen": -2.00123929977417, + "logps/rejected": -2.2850382328033447, + "loss": 1.5182, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.00247859954834, + "rewards/margins": 0.5675975680351257, + "rewards/rejected": -4.5700764656066895, + "step": 9730 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 24.313437659973154, + "learning_rate": 3.841757431420351e-09, + "logits/chosen": -3.4022789001464844, + "logits/rejected": -3.393200635910034, + "logps/chosen": -2.024486780166626, + "logps/rejected": -2.2075483798980713, + "loss": 1.648, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.048973560333252, + "rewards/margins": 0.366123229265213, + "rewards/rejected": -4.415096759796143, + "step": 9740 + }, + { + "epoch": 1.6798759476223295, + "grad_norm": 25.917466096482467, + "learning_rate": 3.8018081980742664e-09, + "logits/chosen": -3.392129421234131, + "logits/rejected": -3.371306896209717, + "logps/chosen": -1.9980226755142212, + "logps/rejected": -2.277165174484253, + "loss": 1.5187, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9960453510284424, + "rewards/margins": 0.5582850575447083, + "rewards/rejected": -4.554330348968506, + "step": 9750 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 25.923571083940974, + "learning_rate": 3.7620506616088815e-09, + "logits/chosen": -3.408726215362549, + "logits/rejected": -3.3967883586883545, + "logps/chosen": -2.044023036956787, + "logps/rejected": -2.310995578765869, + "loss": 1.5776, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.088046073913574, + "rewards/margins": 0.5339447259902954, + "rewards/rejected": -4.621991157531738, + "step": 9760 + }, + { + "epoch": 1.6833218470020674, + "grad_norm": 24.176193083278594, + "learning_rate": 3.7224851815546298e-09, + "logits/chosen": -3.3582584857940674, + "logits/rejected": -3.3469951152801514, + "logps/chosen": -1.9432952404022217, + "logps/rejected": -2.211101531982422, + "loss": 1.5479, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8865904808044434, + "rewards/margins": 0.5356132388114929, + "rewards/rejected": -4.422203063964844, + "step": 9770 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 24.431098961588518, + "learning_rate": 3.6831121157052254e-09, + "logits/chosen": -3.4073097705841064, + "logits/rejected": -3.3828327655792236, + "logps/chosen": -1.9910447597503662, + "logps/rejected": -2.290560245513916, + "loss": 1.5106, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9820895195007324, + "rewards/margins": 0.5990304946899414, + "rewards/rejected": -4.581120491027832, + "step": 9780 + }, + { + "epoch": 1.6867677463818056, + "grad_norm": 27.313283233308354, + "learning_rate": 3.64393182011431e-09, + "logits/chosen": -3.3841776847839355, + "logits/rejected": -3.3826088905334473, + "logps/chosen": -1.942065954208374, + "logps/rejected": -2.3032023906707764, + "loss": 1.4661, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.884131908416748, + "rewards/margins": 0.7222728133201599, + "rewards/rejected": -4.606404781341553, + "step": 9790 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 25.90043539128856, + "learning_rate": 3.604944649092323e-09, + "logits/chosen": -3.395308256149292, + "logits/rejected": -3.3764114379882812, + "logps/chosen": -1.9286320209503174, + "logps/rejected": -2.301144599914551, + "loss": 1.384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.8572640419006348, + "rewards/margins": 0.7450253963470459, + "rewards/rejected": -4.602289199829102, + "step": 9800 + }, + { + "epoch": 1.6884906960716748, + "eval_logits/chosen": -3.4484305381774902, + "eval_logits/rejected": -3.444789171218872, + "eval_logps/chosen": -1.8443588018417358, + "eval_logps/rejected": -2.0240273475646973, + "eval_loss": 1.616855502128601, + "eval_rewards/accuracies": 0.6229089498519897, + "eval_rewards/chosen": -3.6887176036834717, + "eval_rewards/margins": 0.35933732986450195, + "eval_rewards/rejected": -4.0480546951293945, + "eval_runtime": 157.5615, + "eval_samples_per_second": 27.316, + "eval_steps_per_second": 3.415, + "step": 9800 + }, + { + "epoch": 1.6902136457615438, + "grad_norm": 23.579822419197335, + "learning_rate": 3.566150955203251e-09, + "logits/chosen": -3.353625774383545, + "logits/rejected": -3.3356621265411377, + "logps/chosen": -2.0019569396972656, + "logps/rejected": -2.4045376777648926, + "loss": 1.4082, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.003913879394531, + "rewards/margins": 0.8051609992980957, + "rewards/rejected": -4.809075355529785, + "step": 9810 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 30.597985650539822, + "learning_rate": 3.52755108926146e-09, + "logits/chosen": -3.399155378341675, + "logits/rejected": -3.3862757682800293, + "logps/chosen": -1.9372270107269287, + "logps/rejected": -2.2467448711395264, + "loss": 1.5081, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8744540214538574, + "rewards/margins": 0.6190361976623535, + "rewards/rejected": -4.493489742279053, + "step": 9820 + }, + { + "epoch": 1.693659545141282, + "grad_norm": 22.200761221819036, + "learning_rate": 3.489145400328511e-09, + "logits/chosen": -3.412567138671875, + "logits/rejected": -3.4088339805603027, + "logps/chosen": -2.037950038909912, + "logps/rejected": -2.2209019660949707, + "loss": 1.7065, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.075900077819824, + "rewards/margins": 0.36590367555618286, + "rewards/rejected": -4.441803932189941, + "step": 9830 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 21.948487922644176, + "learning_rate": 3.4509342357099904e-09, + "logits/chosen": -3.371875047683716, + "logits/rejected": -3.3546719551086426, + "logps/chosen": -1.9857927560806274, + "logps/rejected": -2.3077261447906494, + "loss": 1.4725, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.971585512161255, + "rewards/margins": 0.643866777420044, + "rewards/rejected": -4.615452289581299, + "step": 9840 + }, + { + "epoch": 1.69710544452102, + "grad_norm": 23.580327437320673, + "learning_rate": 3.412917940952423e-09, + "logits/chosen": -3.3865959644317627, + "logits/rejected": -3.3803296089172363, + "logps/chosen": -1.9591586589813232, + "logps/rejected": -2.1538960933685303, + "loss": 1.64, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.9183173179626465, + "rewards/margins": 0.3894748091697693, + "rewards/rejected": -4.3077921867370605, + "step": 9850 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 33.19735753728511, + "learning_rate": 3.375096859840071e-09, + "logits/chosen": -3.425288677215576, + "logits/rejected": -3.4195034503936768, + "logps/chosen": -2.1636297702789307, + "logps/rejected": -2.2502341270446777, + "loss": 1.7858, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.327259540557861, + "rewards/margins": 0.17320939898490906, + "rewards/rejected": -4.5004682540893555, + "step": 9860 + }, + { + "epoch": 1.700551343900758, + "grad_norm": 27.01684916074081, + "learning_rate": 3.337471334391903e-09, + "logits/chosen": -3.4214859008789062, + "logits/rejected": -3.4055123329162598, + "logps/chosen": -1.9767429828643799, + "logps/rejected": -2.284975528717041, + "loss": 1.5016, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9534859657287598, + "rewards/margins": 0.6164640188217163, + "rewards/rejected": -4.569951057434082, + "step": 9870 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 24.192542614429698, + "learning_rate": 3.300041704858425e-09, + "logits/chosen": -3.3477120399475098, + "logits/rejected": -3.3444302082061768, + "logps/chosen": -1.9840704202651978, + "logps/rejected": -2.224355459213257, + "loss": 1.5962, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.9681408405303955, + "rewards/margins": 0.48057013750076294, + "rewards/rejected": -4.448710918426514, + "step": 9880 + }, + { + "epoch": 1.7039972432804962, + "grad_norm": 27.065133161191227, + "learning_rate": 3.2628083097186675e-09, + "logits/chosen": -3.32279896736145, + "logits/rejected": -3.3181262016296387, + "logps/chosen": -2.0749213695526123, + "logps/rejected": -2.271827459335327, + "loss": 1.6654, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.149842739105225, + "rewards/margins": 0.39381149411201477, + "rewards/rejected": -4.543654918670654, + "step": 9890 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 27.634417765324873, + "learning_rate": 3.2257714856770866e-09, + "logits/chosen": -3.3798203468322754, + "logits/rejected": -3.362905979156494, + "logps/chosen": -1.9234071969985962, + "logps/rejected": -2.302262544631958, + "loss": 1.4182, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.8468143939971924, + "rewards/margins": 0.7577108144760132, + "rewards/rejected": -4.604525089263916, + "step": 9900 + }, + { + "epoch": 1.7057201929703654, + "eval_logits/chosen": -3.4620373249053955, + "eval_logits/rejected": -3.4584641456604004, + "eval_logps/chosen": -1.8444513082504272, + "eval_logps/rejected": -2.02405047416687, + "eval_loss": 1.6171423196792603, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.6889026165008545, + "eval_rewards/margins": 0.35919860005378723, + "eval_rewards/rejected": -4.04810094833374, + "eval_runtime": 157.0421, + "eval_samples_per_second": 27.407, + "eval_steps_per_second": 3.426, + "step": 9900 + }, + { + "epoch": 1.7074431426602343, + "grad_norm": 27.389604842818315, + "learning_rate": 3.188931567660533e-09, + "logits/chosen": -3.4224448204040527, + "logits/rejected": -3.399517774581909, + "logps/chosen": -2.0451226234436035, + "logps/rejected": -2.3083033561706543, + "loss": 1.5782, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.090245246887207, + "rewards/margins": 0.5263611674308777, + "rewards/rejected": -4.616606712341309, + "step": 9910 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 26.061645190407305, + "learning_rate": 3.152288888815227e-09, + "logits/chosen": -3.4228084087371826, + "logits/rejected": -3.406785488128662, + "logps/chosen": -1.963828444480896, + "logps/rejected": -2.235150098800659, + "loss": 1.5327, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.927656888961792, + "rewards/margins": 0.5426429510116577, + "rewards/rejected": -4.470300197601318, + "step": 9920 + }, + { + "epoch": 1.7108890420399723, + "grad_norm": 22.894874452402952, + "learning_rate": 3.1158437805037296e-09, + "logits/chosen": -3.394484281539917, + "logits/rejected": -3.390847682952881, + "logps/chosen": -1.9526355266571045, + "logps/rejected": -2.1933956146240234, + "loss": 1.5406, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.905271053314209, + "rewards/margins": 0.48152002692222595, + "rewards/rejected": -4.386791229248047, + "step": 9930 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 22.108549122948293, + "learning_rate": 3.0795965723019653e-09, + "logits/chosen": -3.4399852752685547, + "logits/rejected": -3.4374115467071533, + "logps/chosen": -1.97061288356781, + "logps/rejected": -2.191434383392334, + "loss": 1.5876, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.94122576713562, + "rewards/margins": 0.4416431486606598, + "rewards/rejected": -4.382868766784668, + "step": 9940 + }, + { + "epoch": 1.7143349414197107, + "grad_norm": 31.66644009052055, + "learning_rate": 3.043547591996226e-09, + "logits/chosen": -3.402891159057617, + "logits/rejected": -3.384230136871338, + "logps/chosen": -1.9416204690933228, + "logps/rejected": -2.2737221717834473, + "loss": 1.4596, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.8832409381866455, + "rewards/margins": 0.6642034649848938, + "rewards/rejected": -4.5474443435668945, + "step": 9950 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 27.857828176865677, + "learning_rate": 3.0076971655802196e-09, + "logits/chosen": -3.421611785888672, + "logits/rejected": -3.4116172790527344, + "logps/chosen": -2.0732438564300537, + "logps/rejected": -2.276869058609009, + "loss": 1.6573, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.146487712860107, + "rewards/margins": 0.4072498381137848, + "rewards/rejected": -4.553738117218018, + "step": 9960 + }, + { + "epoch": 1.7177808407994486, + "grad_norm": 25.079716328754415, + "learning_rate": 2.972045617252114e-09, + "logits/chosen": -3.383039951324463, + "logits/rejected": -3.3715012073516846, + "logps/chosen": -2.039543628692627, + "logps/rejected": -2.367424488067627, + "loss": 1.5241, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.079087257385254, + "rewards/margins": 0.6557615995407104, + "rewards/rejected": -4.734848976135254, + "step": 9970 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 26.102896562677333, + "learning_rate": 2.9365932694115913e-09, + "logits/chosen": -3.3656280040740967, + "logits/rejected": -3.3586764335632324, + "logps/chosen": -2.017770290374756, + "logps/rejected": -2.250100612640381, + "loss": 1.5921, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.035540580749512, + "rewards/margins": 0.46466073393821716, + "rewards/rejected": -4.500201225280762, + "step": 9980 + }, + { + "epoch": 1.7212267401791868, + "grad_norm": 27.723388119662225, + "learning_rate": 2.9013404426569853e-09, + "logits/chosen": -3.412670135498047, + "logits/rejected": -3.3857197761535645, + "logps/chosen": -2.012162923812866, + "logps/rejected": -2.413954257965088, + "loss": 1.4428, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.024325847625732, + "rewards/margins": 0.8035832643508911, + "rewards/rejected": -4.827908515930176, + "step": 9990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 24.917661327153247, + "learning_rate": 2.8662874557823015e-09, + "logits/chosen": -3.4202284812927246, + "logits/rejected": -3.4076600074768066, + "logps/chosen": -1.936103105545044, + "logps/rejected": -2.273078680038452, + "loss": 1.4467, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.872206211090088, + "rewards/margins": 0.6739513874053955, + "rewards/rejected": -4.546157360076904, + "step": 10000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -3.4524741172790527, + "eval_logits/rejected": -3.4488494396209717, + "eval_logps/chosen": -1.8448247909545898, + "eval_logps/rejected": -2.0249435901641846, + "eval_loss": 1.616273283958435, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.6896495819091797, + "eval_rewards/margins": 0.36023786664009094, + "eval_rewards/rejected": -4.049887180328369, + "eval_runtime": 157.7563, + "eval_samples_per_second": 27.283, + "eval_steps_per_second": 3.41, + "step": 10000 + }, + { + "epoch": 1.724672639558925, + "grad_norm": 27.83690526808807, + "learning_rate": 2.8314346257744175e-09, + "logits/chosen": -3.402029514312744, + "logits/rejected": -3.3907599449157715, + "logps/chosen": -2.0048117637634277, + "logps/rejected": -2.32271146774292, + "loss": 1.4845, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.0096235275268555, + "rewards/margins": 0.6357995867729187, + "rewards/rejected": -4.64542293548584, + "step": 10010 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 35.06914617936652, + "learning_rate": 2.7967822678101468e-09, + "logits/chosen": -3.3755085468292236, + "logits/rejected": -3.3585753440856934, + "logps/chosen": -2.0493006706237793, + "logps/rejected": -2.3235561847686768, + "loss": 1.5393, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.098601341247559, + "rewards/margins": 0.5485110878944397, + "rewards/rejected": -4.6471123695373535, + "step": 10020 + }, + { + "epoch": 1.7281185389386629, + "grad_norm": 31.55934865653278, + "learning_rate": 2.7623306952534314e-09, + "logits/chosen": -3.403115749359131, + "logits/rejected": -3.3850009441375732, + "logps/chosen": -2.0171444416046143, + "logps/rejected": -2.2472317218780518, + "loss": 1.5811, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.0342888832092285, + "rewards/margins": 0.46017417311668396, + "rewards/rejected": -4.4944634437561035, + "step": 10030 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 25.185417849013394, + "learning_rate": 2.7280802196525036e-09, + "logits/chosen": -3.4772956371307373, + "logits/rejected": -3.4693024158477783, + "logps/chosen": -1.9739303588867188, + "logps/rejected": -2.263120174407959, + "loss": 1.5364, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9478607177734375, + "rewards/margins": 0.5783795714378357, + "rewards/rejected": -4.526240348815918, + "step": 10040 + }, + { + "epoch": 1.7315644383184012, + "grad_norm": 31.29401202169722, + "learning_rate": 2.694031150737036e-09, + "logits/chosen": -3.372457504272461, + "logits/rejected": -3.369093418121338, + "logps/chosen": -2.09028959274292, + "logps/rejected": -2.1841254234313965, + "loss": 1.764, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.18057918548584, + "rewards/margins": 0.18767204880714417, + "rewards/rejected": -4.368250846862793, + "step": 10050 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 27.642881160861926, + "learning_rate": 2.6601837964153994e-09, + "logits/chosen": -3.3511931896209717, + "logits/rejected": -3.3483853340148926, + "logps/chosen": -2.021761417388916, + "logps/rejected": -2.287548542022705, + "loss": 1.5992, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.043522834777832, + "rewards/margins": 0.5315744280815125, + "rewards/rejected": -4.57509708404541, + "step": 10060 + }, + { + "epoch": 1.7350103376981392, + "grad_norm": 27.15155636750693, + "learning_rate": 2.6265384627718046e-09, + "logits/chosen": -3.3405463695526123, + "logits/rejected": -3.3328394889831543, + "logps/chosen": -1.9514484405517578, + "logps/rejected": -2.2058067321777344, + "loss": 1.5384, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9028968811035156, + "rewards/margins": 0.5087161660194397, + "rewards/rejected": -4.411613464355469, + "step": 10070 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 33.64085879538794, + "learning_rate": 2.593095454063615e-09, + "logits/chosen": -3.42987060546875, + "logits/rejected": -3.4221389293670654, + "logps/chosen": -2.0721659660339355, + "logps/rejected": -2.2082300186157227, + "loss": 1.705, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.144331932067871, + "rewards/margins": 0.27212804555892944, + "rewards/rejected": -4.416460037231445, + "step": 10080 + }, + { + "epoch": 1.7384562370778773, + "grad_norm": 26.91216719572685, + "learning_rate": 2.5598550727185142e-09, + "logits/chosen": -3.4298949241638184, + "logits/rejected": -3.4195475578308105, + "logps/chosen": -1.9169880151748657, + "logps/rejected": -2.191281318664551, + "loss": 1.4813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.8339760303497314, + "rewards/margins": 0.5485867261886597, + "rewards/rejected": -4.382562637329102, + "step": 10090 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 28.942256632926878, + "learning_rate": 2.5268176193318473e-09, + "logits/chosen": -3.402538776397705, + "logits/rejected": -3.3968594074249268, + "logps/chosen": -2.0303781032562256, + "logps/rejected": -2.2684473991394043, + "loss": 1.5786, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.060756206512451, + "rewards/margins": 0.47613826394081116, + "rewards/rejected": -4.536894798278809, + "step": 10100 + }, + { + "epoch": 1.7401791867677465, + "eval_logits/chosen": -3.449225425720215, + "eval_logits/rejected": -3.4455862045288086, + "eval_logps/chosen": -1.84548020362854, + "eval_logps/rejected": -2.0255517959594727, + "eval_loss": 1.6163413524627686, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.69096040725708, + "eval_rewards/margins": 0.3601427972316742, + "eval_rewards/rejected": -4.051103591918945, + "eval_runtime": 157.4632, + "eval_samples_per_second": 27.333, + "eval_steps_per_second": 3.417, + "step": 10100 + }, + { + "epoch": 1.7419021364576155, + "grad_norm": 26.91792180831608, + "learning_rate": 2.4939833926638397e-09, + "logits/chosen": -3.430773973464966, + "logits/rejected": -3.433886766433716, + "logps/chosen": -2.139800548553467, + "logps/rejected": -2.3567564487457275, + "loss": 1.7204, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.279601097106934, + "rewards/margins": 0.4339119791984558, + "rewards/rejected": -4.713512897491455, + "step": 10110 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 29.159520288334143, + "learning_rate": 2.4613526896369308e-09, + "logits/chosen": -3.399869203567505, + "logits/rejected": -3.3889966011047363, + "logps/chosen": -2.0565710067749023, + "logps/rejected": -2.333401918411255, + "loss": 1.5594, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.113142013549805, + "rewards/margins": 0.5536612272262573, + "rewards/rejected": -4.66680383682251, + "step": 10120 + }, + { + "epoch": 1.7453480358373534, + "grad_norm": 27.578592292700968, + "learning_rate": 2.428925805333082e-09, + "logits/chosen": -3.42749285697937, + "logits/rejected": -3.4149670600891113, + "logps/chosen": -1.9169721603393555, + "logps/rejected": -2.2252702713012695, + "loss": 1.4867, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.833944320678711, + "rewards/margins": 0.6165964007377625, + "rewards/rejected": -4.450540542602539, + "step": 10130 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 32.396838211330966, + "learning_rate": 2.396703032991107e-09, + "logits/chosen": -3.3989977836608887, + "logits/rejected": -3.3766727447509766, + "logps/chosen": -1.9514436721801758, + "logps/rejected": -2.2066168785095215, + "loss": 1.5462, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9028873443603516, + "rewards/margins": 0.5103455781936646, + "rewards/rejected": -4.413233757019043, + "step": 10140 + }, + { + "epoch": 1.7487939352170918, + "grad_norm": 23.983253047079614, + "learning_rate": 2.364684664004016e-09, + "logits/chosen": -3.372251033782959, + "logits/rejected": -3.3619492053985596, + "logps/chosen": -2.038382053375244, + "logps/rejected": -2.3031179904937744, + "loss": 1.5712, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.076764106750488, + "rewards/margins": 0.529471755027771, + "rewards/rejected": -4.606235980987549, + "step": 10150 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 30.37732802706253, + "learning_rate": 2.3328709879163826e-09, + "logits/chosen": -3.3815879821777344, + "logits/rejected": -3.373589277267456, + "logps/chosen": -2.0754475593566895, + "logps/rejected": -2.319063186645508, + "loss": 1.6147, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.150895118713379, + "rewards/margins": 0.48723095655441284, + "rewards/rejected": -4.638126373291016, + "step": 10160 + }, + { + "epoch": 1.7522398345968297, + "grad_norm": 21.36487806188162, + "learning_rate": 2.301262292421732e-09, + "logits/chosen": -3.394362688064575, + "logits/rejected": -3.3879902362823486, + "logps/chosen": -1.9925777912139893, + "logps/rejected": -2.215636968612671, + "loss": 1.6544, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.9851555824279785, + "rewards/margins": 0.44611844420433044, + "rewards/rejected": -4.431273937225342, + "step": 10170 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 22.59530080025638, + "learning_rate": 2.269858863359936e-09, + "logits/chosen": -3.3547675609588623, + "logits/rejected": -3.3394851684570312, + "logps/chosen": -1.9032920598983765, + "logps/rejected": -2.2436602115631104, + "loss": 1.4502, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.806584119796753, + "rewards/margins": 0.680736243724823, + "rewards/rejected": -4.487320423126221, + "step": 10180 + }, + { + "epoch": 1.755685733976568, + "grad_norm": 28.95057292526965, + "learning_rate": 2.2386609847146077e-09, + "logits/chosen": -3.3782577514648438, + "logits/rejected": -3.363912582397461, + "logps/chosen": -1.9338315725326538, + "logps/rejected": -2.1901416778564453, + "loss": 1.5539, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.8676631450653076, + "rewards/margins": 0.5126203894615173, + "rewards/rejected": -4.380283355712891, + "step": 10190 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 25.976155489365432, + "learning_rate": 2.207668938610582e-09, + "logits/chosen": -3.410379409790039, + "logits/rejected": -3.400247097015381, + "logps/chosen": -2.0425350666046143, + "logps/rejected": -2.310239315032959, + "loss": 1.5566, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.0850701332092285, + "rewards/margins": 0.5354089736938477, + "rewards/rejected": -4.620478630065918, + "step": 10200 + }, + { + "epoch": 1.757408683666437, + "eval_logits/chosen": -3.4634041786193848, + "eval_logits/rejected": -3.459841251373291, + "eval_logps/chosen": -1.8467559814453125, + "eval_logps/rejected": -2.026546001434326, + "eval_loss": 1.6168062686920166, + "eval_rewards/accuracies": 0.6215148568153381, + "eval_rewards/chosen": -3.693511962890625, + "eval_rewards/margins": 0.35958027839660645, + "eval_rewards/rejected": -4.053092002868652, + "eval_runtime": 157.9638, + "eval_samples_per_second": 27.247, + "eval_steps_per_second": 3.406, + "step": 10200 + }, + { + "epoch": 1.759131633356306, + "grad_norm": 30.691300251531644, + "learning_rate": 2.176883005311303e-09, + "logits/chosen": -3.4175727367401123, + "logits/rejected": -3.415929079055786, + "logps/chosen": -2.0461859703063965, + "logps/rejected": -2.392319917678833, + "loss": 1.4782, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.092371940612793, + "rewards/margins": 0.6922675371170044, + "rewards/rejected": -4.784639835357666, + "step": 10210 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 24.921015721447304, + "learning_rate": 2.1463034632163533e-09, + "logits/chosen": -3.391049861907959, + "logits/rejected": -3.3896865844726562, + "logps/chosen": -1.946466088294983, + "logps/rejected": -2.260529041290283, + "loss": 1.5106, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.892932176589966, + "rewards/margins": 0.6281254291534424, + "rewards/rejected": -4.521058082580566, + "step": 10220 + }, + { + "epoch": 1.762577532736044, + "grad_norm": 28.01882881215195, + "learning_rate": 2.1159305888588664e-09, + "logits/chosen": -3.337859630584717, + "logits/rejected": -3.3212833404541016, + "logps/chosen": -2.0040853023529053, + "logps/rejected": -2.329961061477661, + "loss": 1.4789, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.0081706047058105, + "rewards/margins": 0.6517510414123535, + "rewards/rejected": -4.659922122955322, + "step": 10230 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 26.238540121008892, + "learning_rate": 2.085764656903105e-09, + "logits/chosen": -3.369973659515381, + "logits/rejected": -3.3581345081329346, + "logps/chosen": -1.979588508605957, + "logps/rejected": -2.2576193809509277, + "loss": 1.5293, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.959177017211914, + "rewards/margins": 0.556061327457428, + "rewards/rejected": -4.5152387619018555, + "step": 10240 + }, + { + "epoch": 1.7660234321157822, + "grad_norm": 27.57843193311513, + "learning_rate": 2.055805940141897e-09, + "logits/chosen": -3.420767307281494, + "logits/rejected": -3.399543046951294, + "logps/chosen": -1.9594180583953857, + "logps/rejected": -2.298522710800171, + "loss": 1.4609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9188361167907715, + "rewards/margins": 0.6782091856002808, + "rewards/rejected": -4.597045421600342, + "step": 10250 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 24.257055523707752, + "learning_rate": 2.026054709494235e-09, + "logits/chosen": -3.372257947921753, + "logits/rejected": -3.3724048137664795, + "logps/chosen": -2.0588574409484863, + "logps/rejected": -2.30212140083313, + "loss": 1.5746, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.117714881896973, + "rewards/margins": 0.48652735352516174, + "rewards/rejected": -4.60424280166626, + "step": 10260 + }, + { + "epoch": 1.7694693314955203, + "grad_norm": 25.17870790535535, + "learning_rate": 1.9965112340027874e-09, + "logits/chosen": -3.3797316551208496, + "logits/rejected": -3.3742434978485107, + "logps/chosen": -2.0292623043060303, + "logps/rejected": -2.2535901069641113, + "loss": 1.6069, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.0585246086120605, + "rewards/margins": 0.44865554571151733, + "rewards/rejected": -4.507180213928223, + "step": 10270 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 27.07884069327652, + "learning_rate": 1.9671757808314675e-09, + "logits/chosen": -3.3692023754119873, + "logits/rejected": -3.358720064163208, + "logps/chosen": -2.103325366973877, + "logps/rejected": -2.316772222518921, + "loss": 1.6105, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.206650733947754, + "rewards/margins": 0.4268937110900879, + "rewards/rejected": -4.633544445037842, + "step": 10280 + }, + { + "epoch": 1.7729152308752585, + "grad_norm": 25.58942124268339, + "learning_rate": 1.9380486152630547e-09, + "logits/chosen": -3.3406243324279785, + "logits/rejected": -3.341259002685547, + "logps/chosen": -2.0119788646698, + "logps/rejected": -2.2350258827209473, + "loss": 1.5998, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.0239577293396, + "rewards/margins": 0.4460935592651367, + "rewards/rejected": -4.4700517654418945, + "step": 10290 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 26.094164922133285, + "learning_rate": 1.909130000696732e-09, + "logits/chosen": -3.3790981769561768, + "logits/rejected": -3.370288372039795, + "logps/chosen": -2.017609119415283, + "logps/rejected": -2.234529495239258, + "loss": 1.6336, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.035218238830566, + "rewards/margins": 0.4338412284851074, + "rewards/rejected": -4.469058990478516, + "step": 10300 + }, + { + "epoch": 1.7746381805651275, + "eval_logits/chosen": -3.4472544193267822, + "eval_logits/rejected": -3.4436070919036865, + "eval_logps/chosen": -1.8466594219207764, + "eval_logps/rejected": -2.026883602142334, + "eval_loss": 1.6164582967758179, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.6933188438415527, + "eval_rewards/margins": 0.36044859886169434, + "eval_rewards/rejected": -4.053767204284668, + "eval_runtime": 157.4658, + "eval_samples_per_second": 27.333, + "eval_steps_per_second": 3.417, + "step": 10300 + }, + { + "epoch": 1.7763611302549966, + "grad_norm": 23.867028161064592, + "learning_rate": 1.880420198645774e-09, + "logits/chosen": -3.338766098022461, + "logits/rejected": -3.324528217315674, + "logps/chosen": -2.002267837524414, + "logps/rejected": -2.3285584449768066, + "loss": 1.4475, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.004535675048828, + "rewards/margins": 0.6525813341140747, + "rewards/rejected": -4.657116889953613, + "step": 10310 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 23.48918641658649, + "learning_rate": 1.8519194687351191e-09, + "logits/chosen": -3.391566753387451, + "logits/rejected": -3.376260757446289, + "logps/chosen": -2.054238796234131, + "logps/rejected": -2.3049066066741943, + "loss": 1.5853, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.108477592468262, + "rewards/margins": 0.5013359189033508, + "rewards/rejected": -4.609813213348389, + "step": 10320 + }, + { + "epoch": 1.7798070296347346, + "grad_norm": 24.891300707253066, + "learning_rate": 1.8236280686990653e-09, + "logits/chosen": -3.402294158935547, + "logits/rejected": -3.394604444503784, + "logps/chosen": -1.9756720066070557, + "logps/rejected": -2.298142671585083, + "loss": 1.4642, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9513440132141113, + "rewards/margins": 0.6449408531188965, + "rewards/rejected": -4.596285343170166, + "step": 10330 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 30.657920419294104, + "learning_rate": 1.7955462543789268e-09, + "logits/chosen": -3.40057373046875, + "logits/rejected": -3.390129804611206, + "logps/chosen": -2.022609233856201, + "logps/rejected": -2.2765769958496094, + "loss": 1.5435, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.045218467712402, + "rewards/margins": 0.5079357028007507, + "rewards/rejected": -4.553153991699219, + "step": 10340 + }, + { + "epoch": 1.7832529290144727, + "grad_norm": 26.42747059563329, + "learning_rate": 1.7676742797207045e-09, + "logits/chosen": -3.4635109901428223, + "logits/rejected": -3.4491753578186035, + "logps/chosen": -1.9680490493774414, + "logps/rejected": -2.2178990840911865, + "loss": 1.5661, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.936098098754883, + "rewards/margins": 0.4997001588344574, + "rewards/rejected": -4.435798168182373, + "step": 10350 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 29.180745220056384, + "learning_rate": 1.7400123967728192e-09, + "logits/chosen": -3.358567714691162, + "logits/rejected": -3.340315341949463, + "logps/chosen": -1.957014799118042, + "logps/rejected": -2.227858781814575, + "loss": 1.5406, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.914029598236084, + "rewards/margins": 0.5416873693466187, + "rewards/rejected": -4.45571756362915, + "step": 10360 + }, + { + "epoch": 1.786698828394211, + "grad_norm": 24.002466400149814, + "learning_rate": 1.7125608556838034e-09, + "logits/chosen": -3.2896666526794434, + "logits/rejected": -3.272705078125, + "logps/chosen": -1.9320682287216187, + "logps/rejected": -2.3524723052978516, + "loss": 1.3379, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8641364574432373, + "rewards/margins": 0.8408076167106628, + "rewards/rejected": -4.704944610595703, + "step": 10370 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 27.789106629012554, + "learning_rate": 1.6853199047000583e-09, + "logits/chosen": -3.3901572227478027, + "logits/rejected": -3.383671522140503, + "logps/chosen": -2.1301345825195312, + "logps/rejected": -2.2762646675109863, + "loss": 1.7656, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.2602691650390625, + "rewards/margins": 0.2922601103782654, + "rewards/rejected": -4.552529335021973, + "step": 10380 + }, + { + "epoch": 1.7901447277739488, + "grad_norm": 24.618941925377477, + "learning_rate": 1.6582897901636028e-09, + "logits/chosen": -3.401486873626709, + "logits/rejected": -3.387925624847412, + "logps/chosen": -2.024334192276001, + "logps/rejected": -2.347886085510254, + "loss": 1.4713, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.048668384552002, + "rewards/margins": 0.6471039056777954, + "rewards/rejected": -4.695772171020508, + "step": 10390 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 24.674897691698433, + "learning_rate": 1.6314707565098396e-09, + "logits/chosen": -3.369704484939575, + "logits/rejected": -3.3580455780029297, + "logps/chosen": -1.9531681537628174, + "logps/rejected": -2.338794708251953, + "loss": 1.3869, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.9063363075256348, + "rewards/margins": 0.7712531685829163, + "rewards/rejected": -4.677589416503906, + "step": 10400 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -3.4512581825256348, + "eval_logits/rejected": -3.44763445854187, + "eval_logps/chosen": -1.8468294143676758, + "eval_logps/rejected": -2.026984930038452, + "eval_loss": 1.6163454055786133, + "eval_rewards/accuracies": 0.6217471957206726, + "eval_rewards/chosen": -3.6936588287353516, + "eval_rewards/margins": 0.36031123995780945, + "eval_rewards/rejected": -4.053969860076904, + "eval_runtime": 156.812, + "eval_samples_per_second": 27.447, + "eval_steps_per_second": 3.431, + "step": 10400 + }, + { + "epoch": 1.7935906271536872, + "grad_norm": 24.28334495471964, + "learning_rate": 1.6048630462653618e-09, + "logits/chosen": -3.381967544555664, + "logits/rejected": -3.360619068145752, + "logps/chosen": -2.0169501304626465, + "logps/rejected": -2.342252254486084, + "loss": 1.4752, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.033900260925293, + "rewards/margins": 0.6506044864654541, + "rewards/rejected": -4.684504508972168, + "step": 10410 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 26.275026703925363, + "learning_rate": 1.5784669000457328e-09, + "logits/chosen": -3.3932032585144043, + "logits/rejected": -3.377817153930664, + "logps/chosen": -2.0148704051971436, + "logps/rejected": -2.278454065322876, + "loss": 1.5524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.029740810394287, + "rewards/margins": 0.5271669626235962, + "rewards/rejected": -4.556908130645752, + "step": 10420 + }, + { + "epoch": 1.7970365265334252, + "grad_norm": 30.922923855656204, + "learning_rate": 1.5522825565533443e-09, + "logits/chosen": -3.425365447998047, + "logits/rejected": -3.4170615673065186, + "logps/chosen": -2.0239665508270264, + "logps/rejected": -2.209071397781372, + "loss": 1.6698, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.047933101654053, + "rewards/margins": 0.37020939588546753, + "rewards/rejected": -4.418142795562744, + "step": 10430 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 24.40339753719164, + "learning_rate": 1.5263102525752219e-09, + "logits/chosen": -3.432948589324951, + "logits/rejected": -3.4264023303985596, + "logps/chosen": -2.068981170654297, + "logps/rejected": -2.2332615852355957, + "loss": 1.6569, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.137962341308594, + "rewards/margins": 0.3285607397556305, + "rewards/rejected": -4.466523170471191, + "step": 10440 + }, + { + "epoch": 1.8004824259131633, + "grad_norm": 25.078420841601332, + "learning_rate": 1.500550222980923e-09, + "logits/chosen": -3.397326946258545, + "logits/rejected": -3.394031047821045, + "logps/chosen": -2.069031238555908, + "logps/rejected": -2.3183205127716064, + "loss": 1.5706, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.138062477111816, + "rewards/margins": 0.4985784590244293, + "rewards/rejected": -4.636641025543213, + "step": 10450 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 32.20395667705715, + "learning_rate": 1.4750027007203653e-09, + "logits/chosen": -3.398843288421631, + "logits/rejected": -3.386193037033081, + "logps/chosen": -1.9356803894042969, + "logps/rejected": -2.181884765625, + "loss": 1.5609, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.8713607788085938, + "rewards/margins": 0.4924088418483734, + "rewards/rejected": -4.36376953125, + "step": 10460 + }, + { + "epoch": 1.8039283252929015, + "grad_norm": 26.834313964424137, + "learning_rate": 1.4496679168217645e-09, + "logits/chosen": -3.3025131225585938, + "logits/rejected": -3.290679454803467, + "logps/chosen": -1.9691206216812134, + "logps/rejected": -2.2298367023468018, + "loss": 1.5753, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9382412433624268, + "rewards/margins": 0.5214322209358215, + "rewards/rejected": -4.4596734046936035, + "step": 10470 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 27.622887113300063, + "learning_rate": 1.424546100389523e-09, + "logits/chosen": -3.3977298736572266, + "logits/rejected": -3.3787741661071777, + "logps/chosen": -1.966896653175354, + "logps/rejected": -2.284470558166504, + "loss": 1.5289, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.933793306350708, + "rewards/margins": 0.6351473927497864, + "rewards/rejected": -4.568941116333008, + "step": 10480 + }, + { + "epoch": 1.8073742246726394, + "grad_norm": 26.171618365117457, + "learning_rate": 1.3996374786021641e-09, + "logits/chosen": -3.397204637527466, + "logits/rejected": -3.3808207511901855, + "logps/chosen": -1.927760362625122, + "logps/rejected": -2.2958831787109375, + "loss": 1.441, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.855520725250244, + "rewards/margins": 0.7362455129623413, + "rewards/rejected": -4.591766357421875, + "step": 10490 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 26.95498139495041, + "learning_rate": 1.3749422767102697e-09, + "logits/chosen": -3.3866755962371826, + "logits/rejected": -3.381289005279541, + "logps/chosen": -1.9732935428619385, + "logps/rejected": -2.243061065673828, + "loss": 1.5501, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.946587085723877, + "rewards/margins": 0.5395349264144897, + "rewards/rejected": -4.486122131347656, + "step": 10500 + }, + { + "epoch": 1.8090971743625086, + "eval_logits/chosen": -3.4528300762176514, + "eval_logits/rejected": -3.4492132663726807, + "eval_logps/chosen": -1.8473472595214844, + "eval_logps/rejected": -2.027251720428467, + "eval_loss": 1.6167519092559814, + "eval_rewards/accuracies": 0.6231412887573242, + "eval_rewards/chosen": -3.6946945190429688, + "eval_rewards/margins": 0.359809011220932, + "eval_rewards/rejected": -4.054503440856934, + "eval_runtime": 157.3756, + "eval_samples_per_second": 27.349, + "eval_steps_per_second": 3.419, + "step": 10500 + }, + { + "epoch": 1.8108201240523778, + "grad_norm": 28.28741606720961, + "learning_rate": 1.3504607180344462e-09, + "logits/chosen": -3.378485918045044, + "logits/rejected": -3.366244077682495, + "logps/chosen": -2.0741066932678223, + "logps/rejected": -2.2906525135040283, + "loss": 1.6114, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.1482133865356445, + "rewards/margins": 0.4330918788909912, + "rewards/rejected": -4.581305027008057, + "step": 10510 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 23.952654812112154, + "learning_rate": 1.3261930239633263e-09, + "logits/chosen": -3.4293341636657715, + "logits/rejected": -3.431368350982666, + "logps/chosen": -2.048293352127075, + "logps/rejected": -2.301866054534912, + "loss": 1.5776, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.09658670425415, + "rewards/margins": 0.50714510679245, + "rewards/rejected": -4.603732109069824, + "step": 10520 + }, + { + "epoch": 1.8142660234321157, + "grad_norm": 31.560271783998758, + "learning_rate": 1.3021394139515196e-09, + "logits/chosen": -3.3743948936462402, + "logits/rejected": -3.363818407058716, + "logps/chosen": -2.0341897010803223, + "logps/rejected": -2.208510398864746, + "loss": 1.6874, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.0683794021606445, + "rewards/margins": 0.3486413061618805, + "rewards/rejected": -4.417020797729492, + "step": 10530 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 26.35510719231006, + "learning_rate": 1.2783001055176905e-09, + "logits/chosen": -3.36950945854187, + "logits/rejected": -3.355470657348633, + "logps/chosen": -1.9890272617340088, + "logps/rejected": -2.2732205390930176, + "loss": 1.5402, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.9780545234680176, + "rewards/margins": 0.5683860182762146, + "rewards/rejected": -4.546441078186035, + "step": 10540 + }, + { + "epoch": 1.817711922811854, + "grad_norm": 30.53423832086018, + "learning_rate": 1.2546753142425314e-09, + "logits/chosen": -3.449842929840088, + "logits/rejected": -3.4498214721679688, + "logps/chosen": -2.0826101303100586, + "logps/rejected": -2.282869577407837, + "loss": 1.6773, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.165220260620117, + "rewards/margins": 0.4005189836025238, + "rewards/rejected": -4.565739154815674, + "step": 10550 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 26.4591837153588, + "learning_rate": 1.23126525376685e-09, + "logits/chosen": -3.3651070594787598, + "logits/rejected": -3.3563663959503174, + "logps/chosen": -2.015317440032959, + "logps/rejected": -2.3080027103424072, + "loss": 1.5155, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.030634880065918, + "rewards/margins": 0.5853704214096069, + "rewards/rejected": -4.6160054206848145, + "step": 10560 + }, + { + "epoch": 1.821157822191592, + "grad_norm": 22.051310304840822, + "learning_rate": 1.2080701357896266e-09, + "logits/chosen": -3.410693407058716, + "logits/rejected": -3.4026222229003906, + "logps/chosen": -2.0012669563293457, + "logps/rejected": -2.2908103466033936, + "loss": 1.5234, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.002533912658691, + "rewards/margins": 0.5790874361991882, + "rewards/rejected": -4.581620693206787, + "step": 10570 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 24.284363992070322, + "learning_rate": 1.185090170066097e-09, + "logits/chosen": -3.4124627113342285, + "logits/rejected": -3.403453826904297, + "logps/chosen": -2.06648325920105, + "logps/rejected": -2.281487464904785, + "loss": 1.6328, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.1329665184021, + "rewards/margins": 0.4300087094306946, + "rewards/rejected": -4.56297492980957, + "step": 10580 + }, + { + "epoch": 1.82460372157133, + "grad_norm": 23.753510985180796, + "learning_rate": 1.1623255644058637e-09, + "logits/chosen": -3.374683380126953, + "logits/rejected": -3.3535304069519043, + "logps/chosen": -2.0431432723999023, + "logps/rejected": -2.3552169799804688, + "loss": 1.458, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.086286544799805, + "rewards/margins": 0.6241474747657776, + "rewards/rejected": -4.7104339599609375, + "step": 10590 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 23.220504143448945, + "learning_rate": 1.1397765246710072e-09, + "logits/chosen": -3.438599109649658, + "logits/rejected": -3.4326069355010986, + "logps/chosen": -1.9828017950057983, + "logps/rejected": -2.199333906173706, + "loss": 1.6115, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9656035900115967, + "rewards/margins": 0.43306416273117065, + "rewards/rejected": -4.398667812347412, + "step": 10600 + }, + { + "epoch": 1.8263266712611992, + "eval_logits/chosen": -3.4488871097564697, + "eval_logits/rejected": -3.4452497959136963, + "eval_logps/chosen": -1.8465189933776855, + "eval_logps/rejected": -2.0269534587860107, + "eval_loss": 1.6158052682876587, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.693037986755371, + "eval_rewards/margins": 0.3608691692352295, + "eval_rewards/rejected": -4.0539069175720215, + "eval_runtime": 157.1549, + "eval_samples_per_second": 27.387, + "eval_steps_per_second": 3.423, + "step": 10600 + }, + { + "epoch": 1.8280496209510684, + "grad_norm": 24.585159772059775, + "learning_rate": 1.1174432547742307e-09, + "logits/chosen": -3.3746566772460938, + "logits/rejected": -3.3680107593536377, + "logps/chosen": -1.980501413345337, + "logps/rejected": -2.243885040283203, + "loss": 1.5556, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.961002826690674, + "rewards/margins": 0.526767373085022, + "rewards/rejected": -4.487770080566406, + "step": 10610 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 28.9224978257502, + "learning_rate": 1.095325956677015e-09, + "logits/chosen": -3.3592841625213623, + "logits/rejected": -3.3458988666534424, + "logps/chosen": -1.9858665466308594, + "logps/rejected": -2.206908941268921, + "loss": 1.6392, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9717330932617188, + "rewards/margins": 0.4420841336250305, + "rewards/rejected": -4.413817882537842, + "step": 10620 + }, + { + "epoch": 1.8314955203308063, + "grad_norm": 29.637640041852062, + "learning_rate": 1.0734248303877812e-09, + "logits/chosen": -3.4070637226104736, + "logits/rejected": -3.3985595703125, + "logps/chosen": -2.0170860290527344, + "logps/rejected": -2.2444097995758057, + "loss": 1.582, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.034172058105469, + "rewards/margins": 0.4546467661857605, + "rewards/rejected": -4.488819599151611, + "step": 10630 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 36.71923010318626, + "learning_rate": 1.051740073960114e-09, + "logits/chosen": -3.3674988746643066, + "logits/rejected": -3.3544132709503174, + "logps/chosen": -2.0536468029022217, + "logps/rejected": -2.360785961151123, + "loss": 1.5649, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.107293605804443, + "rewards/margins": 0.6142787337303162, + "rewards/rejected": -4.721571922302246, + "step": 10640 + }, + { + "epoch": 1.8349414197105445, + "grad_norm": 23.88029340226108, + "learning_rate": 1.0302718834909213e-09, + "logits/chosen": -3.4211647510528564, + "logits/rejected": -3.4095406532287598, + "logps/chosen": -1.9759200811386108, + "logps/rejected": -2.276256561279297, + "loss": 1.4618, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.9518401622772217, + "rewards/margins": 0.6006731390953064, + "rewards/rejected": -4.552513122558594, + "step": 10650 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 27.207875391162798, + "learning_rate": 1.0090204531187168e-09, + "logits/chosen": -3.374750852584839, + "logits/rejected": -3.3688838481903076, + "logps/chosen": -2.076523542404175, + "logps/rejected": -2.3039824962615967, + "loss": 1.595, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.15304708480835, + "rewards/margins": 0.4549180567264557, + "rewards/rejected": -4.607964992523193, + "step": 10660 + }, + { + "epoch": 1.8383873190902826, + "grad_norm": 29.84912784370513, + "learning_rate": 9.8798597502181e-10, + "logits/chosen": -3.3824145793914795, + "logits/rejected": -3.374300479888916, + "logps/chosen": -2.060993194580078, + "logps/rejected": -2.3516900539398193, + "loss": 1.576, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.121986389160156, + "rewards/margins": 0.5813939571380615, + "rewards/rejected": -4.703380107879639, + "step": 10670 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 26.868404186835335, + "learning_rate": 9.671686394166156e-10, + "logits/chosen": -3.3935294151306152, + "logits/rejected": -3.3688817024230957, + "logps/chosen": -2.0074622631073, + "logps/rejected": -2.3132314682006836, + "loss": 1.5312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.0149245262146, + "rewards/margins": 0.6115384101867676, + "rewards/rejected": -4.626462936401367, + "step": 10680 + }, + { + "epoch": 1.8418332184700206, + "grad_norm": 26.860109682131725, + "learning_rate": 9.465686345558944e-10, + "logits/chosen": -3.391305923461914, + "logits/rejected": -3.387481212615967, + "logps/chosen": -1.9894644021987915, + "logps/rejected": -2.212947368621826, + "loss": 1.61, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.978928804397583, + "rewards/margins": 0.44696635007858276, + "rewards/rejected": -4.425894737243652, + "step": 10690 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 24.33853610460194, + "learning_rate": 9.261861467270788e-10, + "logits/chosen": -3.424182415008545, + "logits/rejected": -3.405421733856201, + "logps/chosen": -1.9788753986358643, + "logps/rejected": -2.268667221069336, + "loss": 1.5153, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.9577507972717285, + "rewards/margins": 0.579583466053009, + "rewards/rejected": -4.537334442138672, + "step": 10700 + }, + { + "epoch": 1.8435561681598898, + "eval_logits/chosen": -3.4457170963287354, + "eval_logits/rejected": -3.4420700073242188, + "eval_logps/chosen": -1.8469839096069336, + "eval_logps/rejected": -2.0271010398864746, + "eval_loss": 1.616729497909546, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.693967819213867, + "eval_rewards/margins": 0.36023351550102234, + "eval_rewards/rejected": -4.054202079772949, + "eval_runtime": 156.9812, + "eval_samples_per_second": 27.417, + "eval_steps_per_second": 3.427, + "step": 10700 + }, + { + "epoch": 1.8452791178497587, + "grad_norm": 24.778628803347726, + "learning_rate": 9.060213602505778e-10, + "logits/chosen": -3.374159336090088, + "logits/rejected": -3.3584251403808594, + "logps/chosen": -1.9941987991333008, + "logps/rejected": -2.2356715202331543, + "loss": 1.591, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.9883975982666016, + "rewards/margins": 0.48294591903686523, + "rewards/rejected": -4.471343040466309, + "step": 10710 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 26.41478502861514, + "learning_rate": 8.860744574781032e-10, + "logits/chosen": -3.38908052444458, + "logits/rejected": -3.372950315475464, + "logps/chosen": -2.029578447341919, + "logps/rejected": -2.3962135314941406, + "loss": 1.5256, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.059156894683838, + "rewards/margins": 0.7332701683044434, + "rewards/rejected": -4.792427062988281, + "step": 10720 + }, + { + "epoch": 1.848725017229497, + "grad_norm": 26.660594491516687, + "learning_rate": 8.663456187910423e-10, + "logits/chosen": -3.4262642860412598, + "logits/rejected": -3.40718412399292, + "logps/chosen": -2.0151467323303223, + "logps/rejected": -2.24993634223938, + "loss": 1.5984, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.0302934646606445, + "rewards/margins": 0.46957927942276, + "rewards/rejected": -4.49987268447876, + "step": 10730 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 29.579429142656203, + "learning_rate": 8.468350225987909e-10, + "logits/chosen": -3.3597683906555176, + "logits/rejected": -3.355727434158325, + "logps/chosen": -2.078266143798828, + "logps/rejected": -2.1835720539093018, + "loss": 1.7517, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.156532287597656, + "rewards/margins": 0.21061138808727264, + "rewards/rejected": -4.3671441078186035, + "step": 10740 + }, + { + "epoch": 1.852170916609235, + "grad_norm": 22.678700804795913, + "learning_rate": 8.275428453371813e-10, + "logits/chosen": -3.369760513305664, + "logits/rejected": -3.3537421226501465, + "logps/chosen": -1.9459880590438843, + "logps/rejected": -2.312242269515991, + "loss": 1.4091, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8919761180877686, + "rewards/margins": 0.7325085401535034, + "rewards/rejected": -4.624484539031982, + "step": 10750 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 28.4708640380067, + "learning_rate": 8.084692614668543e-10, + "logits/chosen": -3.3657565116882324, + "logits/rejected": -3.3550362586975098, + "logps/chosen": -1.9698045253753662, + "logps/rejected": -2.2087206840515137, + "loss": 1.5749, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.9396090507507324, + "rewards/margins": 0.47783222794532776, + "rewards/rejected": -4.417441368103027, + "step": 10760 + }, + { + "epoch": 1.8556168159889732, + "grad_norm": 22.73481013091019, + "learning_rate": 7.896144434716951e-10, + "logits/chosen": -3.3628830909729004, + "logits/rejected": -3.3544068336486816, + "logps/chosen": -1.9884296655654907, + "logps/rejected": -2.2667458057403564, + "loss": 1.5873, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.9768593311309814, + "rewards/margins": 0.5566323399543762, + "rewards/rejected": -4.533491611480713, + "step": 10770 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 25.916673490567877, + "learning_rate": 7.709785618572801e-10, + "logits/chosen": -3.439791202545166, + "logits/rejected": -3.4297046661376953, + "logps/chosen": -1.9773257970809937, + "logps/rejected": -2.2534193992614746, + "loss": 1.5212, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.9546515941619873, + "rewards/margins": 0.5521878600120544, + "rewards/rejected": -4.506838798522949, + "step": 10780 + }, + { + "epoch": 1.8590627153687111, + "grad_norm": 28.11036744280473, + "learning_rate": 7.525617851493166e-10, + "logits/chosen": -3.4316086769104004, + "logits/rejected": -3.417210102081299, + "logps/chosen": -1.9320377111434937, + "logps/rejected": -2.333613872528076, + "loss": 1.4528, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.8640754222869873, + "rewards/margins": 0.8031523823738098, + "rewards/rejected": -4.667227745056152, + "step": 10790 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 27.380746593017555, + "learning_rate": 7.343642798921384e-10, + "logits/chosen": -3.441600799560547, + "logits/rejected": -3.4314606189727783, + "logps/chosen": -1.9913040399551392, + "logps/rejected": -2.3773794174194336, + "loss": 1.4252, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.9826080799102783, + "rewards/margins": 0.7721507549285889, + "rewards/rejected": -4.754758834838867, + "step": 10800 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -3.4485909938812256, + "eval_logits/rejected": -3.4449594020843506, + "eval_logps/chosen": -1.846598744392395, + "eval_logps/rejected": -2.0272164344787598, + "eval_loss": 1.6157639026641846, + "eval_rewards/accuracies": 0.6215148568153381, + "eval_rewards/chosen": -3.69319748878479, + "eval_rewards/margins": 0.3612358272075653, + "eval_rewards/rejected": -4.0544328689575195, + "eval_runtime": 157.1608, + "eval_samples_per_second": 27.386, + "eval_steps_per_second": 3.423, + "step": 10800 + }, + { + "epoch": 1.8625086147484493, + "grad_norm": 23.13171955742743, + "learning_rate": 7.163862106471852e-10, + "logits/chosen": -3.391110897064209, + "logits/rejected": -3.3694920539855957, + "logps/chosen": -2.0106093883514404, + "logps/rejected": -2.2952871322631836, + "loss": 1.5456, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.021218776702881, + "rewards/margins": 0.5693555474281311, + "rewards/rejected": -4.590574264526367, + "step": 10810 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 29.065477075548763, + "learning_rate": 6.986277399915197e-10, + "logits/chosen": -3.3638012409210205, + "logits/rejected": -3.3515002727508545, + "logps/chosen": -1.955622673034668, + "logps/rejected": -2.2531790733337402, + "loss": 1.5695, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.911245346069336, + "rewards/margins": 0.595112681388855, + "rewards/rejected": -4.5063581466674805, + "step": 10820 + }, + { + "epoch": 1.8659545141281875, + "grad_norm": 32.715270073875686, + "learning_rate": 6.810890285163628e-10, + "logits/chosen": -3.3777356147766113, + "logits/rejected": -3.3626091480255127, + "logps/chosen": -1.9976733922958374, + "logps/rejected": -2.3324923515319824, + "loss": 1.4524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.995346784591675, + "rewards/margins": 0.6696378588676453, + "rewards/rejected": -4.664984703063965, + "step": 10830 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 26.243672055093388, + "learning_rate": 6.637702348256307e-10, + "logits/chosen": -3.4121055603027344, + "logits/rejected": -3.3951950073242188, + "logps/chosen": -1.9369113445281982, + "logps/rejected": -2.2816243171691895, + "loss": 1.5493, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.8738226890563965, + "rewards/margins": 0.6894260048866272, + "rewards/rejected": -4.563248634338379, + "step": 10840 + }, + { + "epoch": 1.8694004135079254, + "grad_norm": 24.999654024454227, + "learning_rate": 6.466715155345109e-10, + "logits/chosen": -3.3505184650421143, + "logits/rejected": -3.345618486404419, + "logps/chosen": -1.937790870666504, + "logps/rejected": -2.2204010486602783, + "loss": 1.5281, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.875581741333008, + "rewards/margins": 0.5652204751968384, + "rewards/rejected": -4.440802097320557, + "step": 10850 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 26.117567200174392, + "learning_rate": 6.2979302526803e-10, + "logits/chosen": -3.440147876739502, + "logits/rejected": -3.42299222946167, + "logps/chosen": -1.9944636821746826, + "logps/rejected": -2.2180466651916504, + "loss": 1.5798, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9889273643493652, + "rewards/margins": 0.44716590642929077, + "rewards/rejected": -4.436093330383301, + "step": 10860 + }, + { + "epoch": 1.8728463128876638, + "grad_norm": 25.581571418721794, + "learning_rate": 6.131349166596883e-10, + "logits/chosen": -3.3393867015838623, + "logits/rejected": -3.337136745452881, + "logps/chosen": -1.9832773208618164, + "logps/rejected": -2.3098785877227783, + "loss": 1.4485, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.966554641723633, + "rewards/margins": 0.6532031297683716, + "rewards/rejected": -4.619757175445557, + "step": 10870 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 27.719099565389534, + "learning_rate": 5.966973403500303e-10, + "logits/chosen": -3.3893535137176514, + "logits/rejected": -3.378805637359619, + "logps/chosen": -2.0204780101776123, + "logps/rejected": -2.232448101043701, + "loss": 1.6141, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.040956020355225, + "rewards/margins": 0.4239395558834076, + "rewards/rejected": -4.464896202087402, + "step": 10880 + }, + { + "epoch": 1.8762922122674017, + "grad_norm": 23.330208015162235, + "learning_rate": 5.804804449853401e-10, + "logits/chosen": -3.420135974884033, + "logits/rejected": -3.4115073680877686, + "logps/chosen": -1.9878511428833008, + "logps/rejected": -2.301091432571411, + "loss": 1.4702, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.9757022857666016, + "rewards/margins": 0.6264804601669312, + "rewards/rejected": -4.602182865142822, + "step": 10890 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 23.51920284109079, + "learning_rate": 5.644843772162373e-10, + "logits/chosen": -3.4475085735321045, + "logits/rejected": -3.430647611618042, + "logps/chosen": -1.9232308864593506, + "logps/rejected": -2.160459280014038, + "loss": 1.5627, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.846461772918701, + "rewards/margins": 0.4744564890861511, + "rewards/rejected": -4.320918560028076, + "step": 10900 + }, + { + "epoch": 1.8780151619572707, + "eval_logits/chosen": -3.4612090587615967, + "eval_logits/rejected": -3.457638740539551, + "eval_logps/chosen": -1.847074270248413, + "eval_logps/rejected": -2.027289867401123, + "eval_loss": 1.6163222789764404, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.694148540496826, + "eval_rewards/margins": 0.3604317307472229, + "eval_rewards/rejected": -4.054579734802246, + "eval_runtime": 157.1697, + "eval_samples_per_second": 27.384, + "eval_steps_per_second": 3.423, + "step": 10900 + }, + { + "epoch": 1.8797381116471399, + "grad_norm": 23.485628577660794, + "learning_rate": 5.487092816963995e-10, + "logits/chosen": -3.393101930618286, + "logits/rejected": -3.3763985633850098, + "logps/chosen": -1.9684337377548218, + "logps/rejected": -2.347898483276367, + "loss": 1.4327, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9368674755096436, + "rewards/margins": 0.7589290738105774, + "rewards/rejected": -4.695796966552734, + "step": 10910 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 24.370554013031125, + "learning_rate": 5.331553010812311e-10, + "logits/chosen": -3.3652336597442627, + "logits/rejected": -3.353187084197998, + "logps/chosen": -2.0598347187042236, + "logps/rejected": -2.2855281829833984, + "loss": 1.5782, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.119669437408447, + "rewards/margins": 0.4513870179653168, + "rewards/rejected": -4.571056365966797, + "step": 10920 + }, + { + "epoch": 1.883184011026878, + "grad_norm": 29.968291245666087, + "learning_rate": 5.178225760265775e-10, + "logits/chosen": -3.369079113006592, + "logits/rejected": -3.3526718616485596, + "logps/chosen": -1.9959361553192139, + "logps/rejected": -2.324263095855713, + "loss": 1.4845, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9918723106384277, + "rewards/margins": 0.6566535830497742, + "rewards/rejected": -4.648526191711426, + "step": 10930 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 30.13128242995009, + "learning_rate": 5.027112451874482e-10, + "logits/chosen": -3.3268673419952393, + "logits/rejected": -3.3162410259246826, + "logps/chosen": -2.0601000785827637, + "logps/rejected": -2.310098171234131, + "loss": 1.5738, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.120200157165527, + "rewards/margins": 0.49999600648880005, + "rewards/rejected": -4.620196342468262, + "step": 10940 + }, + { + "epoch": 1.886629910406616, + "grad_norm": 24.68121232096063, + "learning_rate": 4.87821445216774e-10, + "logits/chosen": -3.3560338020324707, + "logits/rejected": -3.3459270000457764, + "logps/chosen": -2.025421619415283, + "logps/rejected": -2.2899863719940186, + "loss": 1.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.050843238830566, + "rewards/margins": 0.5291293263435364, + "rewards/rejected": -4.579972743988037, + "step": 10950 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 28.175853815022254, + "learning_rate": 4.731533107641627e-10, + "logits/chosen": -3.407466173171997, + "logits/rejected": -3.3960678577423096, + "logps/chosen": -1.982832908630371, + "logps/rejected": -2.2596242427825928, + "loss": 1.5319, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.965665817260742, + "rewards/margins": 0.5535832047462463, + "rewards/rejected": -4.5192484855651855, + "step": 10960 + }, + { + "epoch": 1.8900758097863544, + "grad_norm": 25.18814875907745, + "learning_rate": 4.587069744746791e-10, + "logits/chosen": -3.4128692150115967, + "logits/rejected": -3.3966903686523438, + "logps/chosen": -1.9241507053375244, + "logps/rejected": -2.2077901363372803, + "loss": 1.4886, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.848301410675049, + "rewards/margins": 0.5672794580459595, + "rewards/rejected": -4.4155802726745605, + "step": 10970 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 23.096292951199924, + "learning_rate": 4.4448256698766393e-10, + "logits/chosen": -3.403791904449463, + "logits/rejected": -3.3973846435546875, + "logps/chosen": -2.011340618133545, + "logps/rejected": -2.3063220977783203, + "loss": 1.5203, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.02268123626709, + "rewards/margins": 0.5899624228477478, + "rewards/rejected": -4.612644195556641, + "step": 10980 + }, + { + "epoch": 1.8935217091660923, + "grad_norm": 20.174572119819228, + "learning_rate": 4.3048021693552206e-10, + "logits/chosen": -3.3539624214172363, + "logits/rejected": -3.3476696014404297, + "logps/chosen": -2.041971445083618, + "logps/rejected": -2.2923545837402344, + "loss": 1.5779, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.083942890167236, + "rewards/margins": 0.5007668137550354, + "rewards/rejected": -4.584709167480469, + "step": 10990 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 28.57938113590763, + "learning_rate": 4.167000509425811e-10, + "logits/chosen": -3.461742401123047, + "logits/rejected": -3.456514835357666, + "logps/chosen": -2.071506977081299, + "logps/rejected": -2.3209681510925293, + "loss": 1.6357, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.143013954162598, + "rewards/margins": 0.49892324209213257, + "rewards/rejected": -4.641936302185059, + "step": 11000 + }, + { + "epoch": 1.8952446588559613, + "eval_logits/chosen": -3.4536523818969727, + "eval_logits/rejected": -3.450037956237793, + "eval_logps/chosen": -1.847220778465271, + "eval_logps/rejected": -2.0274834632873535, + "eval_loss": 1.6163759231567383, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.694441556930542, + "eval_rewards/margins": 0.3605256974697113, + "eval_rewards/rejected": -4.054966926574707, + "eval_runtime": 157.2684, + "eval_samples_per_second": 27.367, + "eval_steps_per_second": 3.421, + "step": 11000 + }, + { + "epoch": 1.8969676085458305, + "grad_norm": 26.839663557623652, + "learning_rate": 4.0314219362395095e-10, + "logits/chosen": -3.4240424633026123, + "logits/rejected": -3.406816005706787, + "logps/chosen": -1.9249967336654663, + "logps/rejected": -2.308694362640381, + "loss": 1.4025, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8499934673309326, + "rewards/margins": 0.7673959732055664, + "rewards/rejected": -4.617388725280762, + "step": 11010 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 30.24488545249009, + "learning_rate": 3.898067675843747e-10, + "logits/chosen": -3.4310882091522217, + "logits/rejected": -3.4220104217529297, + "logps/chosen": -1.9558305740356445, + "logps/rejected": -2.2041993141174316, + "loss": 1.5834, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.911661148071289, + "rewards/margins": 0.4967372417449951, + "rewards/rejected": -4.408398628234863, + "step": 11020 + }, + { + "epoch": 1.9004135079255686, + "grad_norm": 29.419904937277412, + "learning_rate": 3.766938934171349e-10, + "logits/chosen": -3.4053268432617188, + "logits/rejected": -3.4012553691864014, + "logps/chosen": -2.108567237854004, + "logps/rejected": -2.3711509704589844, + "loss": 1.5859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.217134475708008, + "rewards/margins": 0.5251680612564087, + "rewards/rejected": -4.742301940917969, + "step": 11030 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 32.302465890754505, + "learning_rate": 3.6380368970296836e-10, + "logits/chosen": -3.4159626960754395, + "logits/rejected": -3.4097740650177, + "logps/chosen": -2.1416077613830566, + "logps/rejected": -2.3337085247039795, + "loss": 1.6618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.283215522766113, + "rewards/margins": 0.3842013478279114, + "rewards/rejected": -4.667417049407959, + "step": 11040 + }, + { + "epoch": 1.9038594073053066, + "grad_norm": 25.325972100734237, + "learning_rate": 3.5113627300897284e-10, + "logits/chosen": -3.3827853202819824, + "logits/rejected": -3.3713066577911377, + "logps/chosen": -1.9343725442886353, + "logps/rejected": -2.271419048309326, + "loss": 1.4584, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.8687450885772705, + "rewards/margins": 0.6740939617156982, + "rewards/rejected": -4.542838096618652, + "step": 11050 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 26.28254041794441, + "learning_rate": 3.38691757887577e-10, + "logits/chosen": -3.4117984771728516, + "logits/rejected": -3.3903260231018066, + "logps/chosen": -1.9647823572158813, + "logps/rejected": -2.288883686065674, + "loss": 1.4704, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9295647144317627, + "rewards/margins": 0.6482027173042297, + "rewards/rejected": -4.577767372131348, + "step": 11060 + }, + { + "epoch": 1.907305306685045, + "grad_norm": 22.38743369193518, + "learning_rate": 3.264702568754912e-10, + "logits/chosen": -3.4108879566192627, + "logits/rejected": -3.3858184814453125, + "logps/chosen": -1.887737512588501, + "logps/rejected": -2.351466655731201, + "loss": 1.342, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.775475025177002, + "rewards/margins": 0.9274588823318481, + "rewards/rejected": -4.702933311462402, + "step": 11070 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 24.127042175477722, + "learning_rate": 3.1447188049268656e-10, + "logits/chosen": -3.4160218238830566, + "logits/rejected": -3.402169704437256, + "logps/chosen": -2.0151193141937256, + "logps/rejected": -2.2987477779388428, + "loss": 1.4906, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.030238628387451, + "rewards/margins": 0.567257285118103, + "rewards/rejected": -4.5974955558776855, + "step": 11080 + }, + { + "epoch": 1.9107512060647829, + "grad_norm": 30.57253492539503, + "learning_rate": 3.0269673724140353e-10, + "logits/chosen": -3.3982276916503906, + "logits/rejected": -3.388754367828369, + "logps/chosen": -2.064014196395874, + "logps/rejected": -2.1896982192993164, + "loss": 1.7817, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.128028392791748, + "rewards/margins": 0.2513675093650818, + "rewards/rejected": -4.379396438598633, + "step": 11090 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 23.532924992540753, + "learning_rate": 2.9114493360517243e-10, + "logits/chosen": -3.325058698654175, + "logits/rejected": -3.316911220550537, + "logps/chosen": -1.9759376049041748, + "logps/rejected": -2.2502129077911377, + "loss": 1.5558, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9518752098083496, + "rewards/margins": 0.5485507249832153, + "rewards/rejected": -4.500425815582275, + "step": 11100 + }, + { + "epoch": 1.9124741557546519, + "eval_logits/chosen": -3.448634147644043, + "eval_logits/rejected": -3.4449892044067383, + "eval_logps/chosen": -1.8470559120178223, + "eval_logps/rejected": -2.026993989944458, + "eval_loss": 1.6165279150009155, + "eval_rewards/accuracies": 0.6212825179100037, + "eval_rewards/chosen": -3.6941118240356445, + "eval_rewards/margins": 0.3598756194114685, + "eval_rewards/rejected": -4.053987979888916, + "eval_runtime": 156.9103, + "eval_samples_per_second": 27.43, + "eval_steps_per_second": 3.429, + "step": 11100 + }, + { + "epoch": 1.914197105444521, + "grad_norm": 24.068500924862295, + "learning_rate": 2.79816574047842e-10, + "logits/chosen": -3.419996738433838, + "logits/rejected": -3.4088845252990723, + "logps/chosen": -2.0631823539733887, + "logps/rejected": -2.392012119293213, + "loss": 1.5055, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.126364707946777, + "rewards/margins": 0.6576594710350037, + "rewards/rejected": -4.784024238586426, + "step": 11110 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 22.901094487559398, + "learning_rate": 2.6871176101263826e-10, + "logits/chosen": -3.468132495880127, + "logits/rejected": -3.451923370361328, + "logps/chosen": -2.0699965953826904, + "logps/rejected": -2.327219009399414, + "loss": 1.563, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.139993190765381, + "rewards/margins": 0.5144453048706055, + "rewards/rejected": -4.654438018798828, + "step": 11120 + }, + { + "epoch": 1.9176430048242592, + "grad_norm": 29.51132391917758, + "learning_rate": 2.5783059492124335e-10, + "logits/chosen": -3.3659400939941406, + "logits/rejected": -3.3547091484069824, + "logps/chosen": -2.092963695526123, + "logps/rejected": -2.3193814754486084, + "loss": 1.6055, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.185927391052246, + "rewards/margins": 0.4528353810310364, + "rewards/rejected": -4.638762950897217, + "step": 11130 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 24.785121484585442, + "learning_rate": 2.471731741728794e-10, + "logits/chosen": -3.3456077575683594, + "logits/rejected": -3.334561586380005, + "logps/chosen": -1.9468122720718384, + "logps/rejected": -2.260291814804077, + "loss": 1.52, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.8936245441436768, + "rewards/margins": 0.6269592642784119, + "rewards/rejected": -4.520583629608154, + "step": 11140 + }, + { + "epoch": 1.9210889042039971, + "grad_norm": 25.36003277923625, + "learning_rate": 2.367395951434231e-10, + "logits/chosen": -3.387727737426758, + "logits/rejected": -3.379549503326416, + "logps/chosen": -2.0501720905303955, + "logps/rejected": -2.3050007820129395, + "loss": 1.5793, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.100344181060791, + "rewards/margins": 0.5096569061279297, + "rewards/rejected": -4.610001564025879, + "step": 11150 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 23.80810105223262, + "learning_rate": 2.2652995218452876e-10, + "logits/chosen": -3.443822145462036, + "logits/rejected": -3.43287992477417, + "logps/chosen": -1.9143426418304443, + "logps/rejected": -2.1211400032043457, + "loss": 1.6413, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.8286852836608887, + "rewards/margins": 0.41359448432922363, + "rewards/rejected": -4.242280006408691, + "step": 11160 + }, + { + "epoch": 1.9245348035837355, + "grad_norm": 25.331918965250612, + "learning_rate": 2.1654433762278713e-10, + "logits/chosen": -3.341073513031006, + "logits/rejected": -3.325766086578369, + "logps/chosen": -2.0386483669281006, + "logps/rejected": -2.1694233417510986, + "loss": 1.7265, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.077296733856201, + "rewards/margins": 0.2615496814250946, + "rewards/rejected": -4.338846683502197, + "step": 11170 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 27.212084454651748, + "learning_rate": 2.0678284175887906e-10, + "logits/chosen": -3.4242770671844482, + "logits/rejected": -3.4113476276397705, + "logps/chosen": -1.9908275604248047, + "logps/rejected": -2.2803826332092285, + "loss": 1.5472, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9816551208496094, + "rewards/margins": 0.579109787940979, + "rewards/rejected": -4.560765266418457, + "step": 11180 + }, + { + "epoch": 1.9279807029634735, + "grad_norm": 34.085799158124175, + "learning_rate": 1.972455528667677e-10, + "logits/chosen": -3.4184272289276123, + "logits/rejected": -3.401930570602417, + "logps/chosen": -2.0033936500549316, + "logps/rejected": -2.320781707763672, + "loss": 1.5091, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.006787300109863, + "rewards/margins": 0.6347756385803223, + "rewards/rejected": -4.641563415527344, + "step": 11190 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 28.607013358394077, + "learning_rate": 1.8793255719288248e-10, + "logits/chosen": -3.4229025840759277, + "logits/rejected": -3.4111130237579346, + "logps/chosen": -1.9279229640960693, + "logps/rejected": -2.2258658409118652, + "loss": 1.4591, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.8558459281921387, + "rewards/margins": 0.5958852767944336, + "rewards/rejected": -4.4517316818237305, + "step": 11200 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -3.4505960941314697, + "eval_logits/rejected": -3.446967840194702, + "eval_logps/chosen": -1.8468133211135864, + "eval_logps/rejected": -2.026925563812256, + "eval_loss": 1.6164582967758179, + "eval_rewards/accuracies": 0.6226765513420105, + "eval_rewards/chosen": -3.693626642227173, + "eval_rewards/margins": 0.3602244555950165, + "eval_rewards/rejected": -4.053851127624512, + "eval_runtime": 156.9226, + "eval_samples_per_second": 27.428, + "eval_steps_per_second": 3.428, + "step": 11200 + }, + { + "epoch": 1.9314266023432116, + "grad_norm": 25.28882939631575, + "learning_rate": 1.7884393895536697e-10, + "logits/chosen": -3.3341434001922607, + "logits/rejected": -3.325577974319458, + "logps/chosen": -2.047227382659912, + "logps/rejected": -2.3589389324188232, + "loss": 1.5053, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.094454765319824, + "rewards/margins": 0.6234235167503357, + "rewards/rejected": -4.7178778648376465, + "step": 11210 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 24.07292984530674, + "learning_rate": 1.6997978034329342e-10, + "logits/chosen": -3.399669647216797, + "logits/rejected": -3.3929238319396973, + "logps/chosen": -1.9279512166976929, + "logps/rejected": -2.239488124847412, + "loss": 1.5249, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.8559024333953857, + "rewards/margins": 0.6230736374855042, + "rewards/rejected": -4.478976249694824, + "step": 11220 + }, + { + "epoch": 1.9348725017229498, + "grad_norm": 25.374120680774084, + "learning_rate": 1.6134016151592988e-10, + "logits/chosen": -3.3835625648498535, + "logits/rejected": -3.3770880699157715, + "logps/chosen": -2.101574182510376, + "logps/rejected": -2.305133581161499, + "loss": 1.7501, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.203148365020752, + "rewards/margins": 0.4071192741394043, + "rewards/rejected": -4.610267162322998, + "step": 11230 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 31.278456122573452, + "learning_rate": 1.5292516060201598e-10, + "logits/chosen": -3.371049404144287, + "logits/rejected": -3.3635432720184326, + "logps/chosen": -2.0738697052001953, + "logps/rejected": -2.293327808380127, + "loss": 1.7263, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.147739410400391, + "rewards/margins": 0.4389162063598633, + "rewards/rejected": -4.586655616760254, + "step": 11240 + }, + { + "epoch": 1.9383184011026877, + "grad_norm": 31.55571078456244, + "learning_rate": 1.4473485369905225e-10, + "logits/chosen": -3.3949081897735596, + "logits/rejected": -3.381544589996338, + "logps/chosen": -2.021014928817749, + "logps/rejected": -2.2920193672180176, + "loss": 1.5225, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.042029857635498, + "rewards/margins": 0.5420087575912476, + "rewards/rejected": -4.584038734436035, + "step": 11250 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 28.974919821390785, + "learning_rate": 1.3676931487261456e-10, + "logits/chosen": -3.353949785232544, + "logits/rejected": -3.3361446857452393, + "logps/chosen": -2.0332696437835693, + "logps/rejected": -2.3275554180145264, + "loss": 1.5878, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.066539287567139, + "rewards/margins": 0.5885715484619141, + "rewards/rejected": -4.655110836029053, + "step": 11260 + }, + { + "epoch": 1.9417643004824259, + "grad_norm": 27.1654535815397, + "learning_rate": 1.2902861615568529e-10, + "logits/chosen": -3.3954601287841797, + "logits/rejected": -3.378387928009033, + "logps/chosen": -2.0470194816589355, + "logps/rejected": -2.263256549835205, + "loss": 1.6083, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.094038963317871, + "rewards/margins": 0.43247395753860474, + "rewards/rejected": -4.52651309967041, + "step": 11270 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 25.346749331224736, + "learning_rate": 1.215128275479954e-10, + "logits/chosen": -3.426262617111206, + "logits/rejected": -3.4118473529815674, + "logps/chosen": -2.0194125175476074, + "logps/rejected": -2.1804115772247314, + "loss": 1.6512, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.038825035095215, + "rewards/margins": 0.3219974935054779, + "rewards/rejected": -4.360823154449463, + "step": 11280 + }, + { + "epoch": 1.945210199862164, + "grad_norm": 27.486061350855827, + "learning_rate": 1.1422201701540569e-10, + "logits/chosen": -3.394376277923584, + "logits/rejected": -3.384406566619873, + "logps/chosen": -2.042423963546753, + "logps/rejected": -2.2977445125579834, + "loss": 1.6092, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.084847927093506, + "rewards/margins": 0.5106416940689087, + "rewards/rejected": -4.595489025115967, + "step": 11290 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 25.5771889104583, + "learning_rate": 1.0715625048927091e-10, + "logits/chosen": -3.3836891651153564, + "logits/rejected": -3.370030641555786, + "logps/chosen": -2.046790599822998, + "logps/rejected": -2.2022275924682617, + "loss": 1.6996, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.093581199645996, + "rewards/margins": 0.31087392568588257, + "rewards/rejected": -4.404455184936523, + "step": 11300 + }, + { + "epoch": 1.946933149552033, + "eval_logits/chosen": -3.4613680839538574, + "eval_logits/rejected": -3.4577908515930176, + "eval_logps/chosen": -1.8468897342681885, + "eval_logps/rejected": -2.026904821395874, + "eval_loss": 1.6165390014648438, + "eval_rewards/accuracies": 0.6219795346260071, + "eval_rewards/chosen": -3.693779468536377, + "eval_rewards/margins": 0.36003032326698303, + "eval_rewards/rejected": -4.053809642791748, + "eval_runtime": 157.0631, + "eval_samples_per_second": 27.403, + "eval_steps_per_second": 3.425, + "step": 11300 + }, + { + "epoch": 1.948656099241902, + "grad_norm": 22.499065318138452, + "learning_rate": 1.0031559186586824e-10, + "logits/chosen": -3.455717086791992, + "logits/rejected": -3.451998472213745, + "logps/chosen": -1.9763959646224976, + "logps/rejected": -2.2574944496154785, + "loss": 1.5287, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.952791929244995, + "rewards/margins": 0.562197208404541, + "rewards/rejected": -4.514988899230957, + "step": 11310 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 24.833421336756707, + "learning_rate": 9.370010300579212e-11, + "logits/chosen": -3.395156145095825, + "logits/rejected": -3.378815174102783, + "logps/chosen": -1.9510142803192139, + "logps/rejected": -2.287487745285034, + "loss": 1.4676, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9020285606384277, + "rewards/margins": 0.6729470491409302, + "rewards/rejected": -4.574975490570068, + "step": 11320 + }, + { + "epoch": 1.9521019986216404, + "grad_norm": 28.96324461401707, + "learning_rate": 8.73098437334241e-11, + "logits/chosen": -3.4059181213378906, + "logits/rejected": -3.384840726852417, + "logps/chosen": -1.959080457687378, + "logps/rejected": -2.3184664249420166, + "loss": 1.4073, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.918160915374756, + "rewards/margins": 0.7187713980674744, + "rewards/rejected": -4.636932849884033, + "step": 11330 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 26.911101832635165, + "learning_rate": 8.114487183636942e-11, + "logits/chosen": -3.32439923286438, + "logits/rejected": -3.3143794536590576, + "logps/chosen": -2.0331687927246094, + "logps/rejected": -2.252995252609253, + "loss": 1.6235, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.066337585449219, + "rewards/margins": 0.4396524429321289, + "rewards/rejected": -4.505990505218506, + "step": 11340 + }, + { + "epoch": 1.9555478980013783, + "grad_norm": 27.95516329099853, + "learning_rate": 7.520524306494358e-11, + "logits/chosen": -3.4238200187683105, + "logits/rejected": -3.418254852294922, + "logps/chosen": -2.0828404426574707, + "logps/rejected": -2.2630558013916016, + "loss": 1.6352, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.165680885314941, + "rewards/margins": 0.36043089628219604, + "rewards/rejected": -4.526111602783203, + "step": 11350 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 28.427253683896392, + "learning_rate": 6.949101113166711e-11, + "logits/chosen": -3.3972134590148926, + "logits/rejected": -3.3817343711853027, + "logps/chosen": -2.0165886878967285, + "logps/rejected": -2.375457286834717, + "loss": 1.5286, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.033177375793457, + "rewards/margins": 0.7177375555038452, + "rewards/rejected": -4.750914573669434, + "step": 11360 + }, + { + "epoch": 1.9589937973811165, + "grad_norm": 29.45814514907188, + "learning_rate": 6.40022277107799e-11, + "logits/chosen": -3.376837968826294, + "logits/rejected": -3.3655834197998047, + "logps/chosen": -2.022885799407959, + "logps/rejected": -2.2897229194641113, + "loss": 1.6007, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.045771598815918, + "rewards/margins": 0.5336742401123047, + "rewards/rejected": -4.579445838928223, + "step": 11370 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 29.072359734259642, + "learning_rate": 5.873894243776933e-11, + "logits/chosen": -3.3590691089630127, + "logits/rejected": -3.3496429920196533, + "logps/chosen": -2.0713436603546143, + "logps/rejected": -2.301238775253296, + "loss": 1.6478, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.1426873207092285, + "rewards/margins": 0.4597903788089752, + "rewards/rejected": -4.602477550506592, + "step": 11380 + }, + { + "epoch": 1.9624396967608546, + "grad_norm": 26.381600452182138, + "learning_rate": 5.3701202908931766e-11, + "logits/chosen": -3.425877809524536, + "logits/rejected": -3.4150230884552, + "logps/chosen": -1.9828720092773438, + "logps/rejected": -2.3024253845214844, + "loss": 1.4617, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.9657440185546875, + "rewards/margins": 0.6391069889068604, + "rewards/rejected": -4.604850769042969, + "step": 11390 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 26.352186062701353, + "learning_rate": 4.8889054680936736e-11, + "logits/chosen": -3.400228977203369, + "logits/rejected": -3.381631374359131, + "logps/chosen": -1.95590341091156, + "logps/rejected": -2.2450382709503174, + "loss": 1.5312, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.91180682182312, + "rewards/margins": 0.5782698392868042, + "rewards/rejected": -4.490076541900635, + "step": 11400 + }, + { + "epoch": 1.9641626464507236, + "eval_logits/chosen": -3.45371675491333, + "eval_logits/rejected": -3.4501090049743652, + "eval_logps/chosen": -1.8470561504364014, + "eval_logps/rejected": -2.0268359184265137, + "eval_loss": 1.6168980598449707, + "eval_rewards/accuracies": 0.6210501790046692, + "eval_rewards/chosen": -3.6941123008728027, + "eval_rewards/margins": 0.35955992341041565, + "eval_rewards/rejected": -4.053671836853027, + "eval_runtime": 156.7298, + "eval_samples_per_second": 27.461, + "eval_steps_per_second": 3.433, + "step": 11400 + }, + { + "epoch": 1.9658855961405926, + "grad_norm": 23.569994190006422, + "learning_rate": 4.4302541270407887e-11, + "logits/chosen": -3.403521776199341, + "logits/rejected": -3.389683246612549, + "logps/chosen": -1.9634898900985718, + "logps/rejected": -2.2698521614074707, + "loss": 1.4917, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9269797801971436, + "rewards/margins": 0.6127251386642456, + "rewards/rejected": -4.539704322814941, + "step": 11410 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 26.064198889060894, + "learning_rate": 3.994170415353715e-11, + "logits/chosen": -3.3980674743652344, + "logits/rejected": -3.3808493614196777, + "logps/chosen": -1.973003625869751, + "logps/rejected": -2.2682454586029053, + "loss": 1.5713, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.946007251739502, + "rewards/margins": 0.5904842019081116, + "rewards/rejected": -4.5364909172058105, + "step": 11420 + }, + { + "epoch": 1.969331495520331, + "grad_norm": 26.301815866749525, + "learning_rate": 3.5806582765715576e-11, + "logits/chosen": -3.3450355529785156, + "logits/rejected": -3.3306972980499268, + "logps/chosen": -2.021624803543091, + "logps/rejected": -2.2888169288635254, + "loss": 1.5151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.043249607086182, + "rewards/margins": 0.5343844294548035, + "rewards/rejected": -4.577633857727051, + "step": 11430 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 27.893312154855394, + "learning_rate": 3.189721450116145e-11, + "logits/chosen": -3.3915677070617676, + "logits/rejected": -3.3853259086608887, + "logps/chosen": -2.0288314819335938, + "logps/rejected": -2.2169597148895264, + "loss": 1.6687, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.0576629638671875, + "rewards/margins": 0.37625688314437866, + "rewards/rejected": -4.433919429779053, + "step": 11440 + }, + { + "epoch": 1.9727773949000689, + "grad_norm": 29.253807519234208, + "learning_rate": 2.821363471259275e-11, + "logits/chosen": -3.3416316509246826, + "logits/rejected": -3.328852891921997, + "logps/chosen": -1.950411081314087, + "logps/rejected": -2.2286744117736816, + "loss": 1.5211, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.900822162628174, + "rewards/margins": 0.556526780128479, + "rewards/rejected": -4.457348823547363, + "step": 11450 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 28.50884149045529, + "learning_rate": 2.4755876710905176e-11, + "logits/chosen": -3.3827414512634277, + "logits/rejected": -3.3734962940216064, + "logps/chosen": -2.060555934906006, + "logps/rejected": -2.335235118865967, + "loss": 1.5713, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.121111869812012, + "rewards/margins": 0.5493584871292114, + "rewards/rejected": -4.670470237731934, + "step": 11460 + }, + { + "epoch": 1.976223294279807, + "grad_norm": 25.86088697159483, + "learning_rate": 2.1523971764869642e-11, + "logits/chosen": -3.411350965499878, + "logits/rejected": -3.3967528343200684, + "logps/chosen": -2.10691499710083, + "logps/rejected": -2.31646466255188, + "loss": 1.654, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.21382999420166, + "rewards/margins": 0.41909924149513245, + "rewards/rejected": -4.63292932510376, + "step": 11470 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 21.66934202174706, + "learning_rate": 1.851794910085469e-11, + "logits/chosen": -3.4262359142303467, + "logits/rejected": -3.409626007080078, + "logps/chosen": -1.9301522970199585, + "logps/rejected": -2.208036184310913, + "loss": 1.5289, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.860304594039917, + "rewards/margins": 0.5557678937911987, + "rewards/rejected": -4.416072368621826, + "step": 11480 + }, + { + "epoch": 1.9796691936595452, + "grad_norm": 20.69052189168406, + "learning_rate": 1.5737835902551733e-11, + "logits/chosen": -3.368183135986328, + "logits/rejected": -3.35334849357605, + "logps/chosen": -1.9831539392471313, + "logps/rejected": -2.190124034881592, + "loss": 1.598, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.9663078784942627, + "rewards/margins": 0.41394057869911194, + "rewards/rejected": -4.380248069763184, + "step": 11490 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 23.92327838272183, + "learning_rate": 1.3183657310741891e-11, + "logits/chosen": -3.4008121490478516, + "logits/rejected": -3.389730453491211, + "logps/chosen": -2.0140676498413086, + "logps/rejected": -2.248105525970459, + "loss": 1.6372, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.028135299682617, + "rewards/margins": 0.4680759310722351, + "rewards/rejected": -4.496211051940918, + "step": 11500 + }, + { + "epoch": 1.9813921433494142, + "eval_logits/chosen": -3.445789337158203, + "eval_logits/rejected": -3.442142963409424, + "eval_logps/chosen": -1.8475120067596436, + "eval_logps/rejected": -2.027435302734375, + "eval_loss": 1.616647481918335, + "eval_rewards/accuracies": 0.6222118735313416, + "eval_rewards/chosen": -3.695024013519287, + "eval_rewards/margins": 0.3598465919494629, + "eval_rewards/rejected": -4.05487060546875, + "eval_runtime": 156.7981, + "eval_samples_per_second": 27.449, + "eval_steps_per_second": 3.431, + "step": 11500 + }, + { + "epoch": 1.9831150930392831, + "grad_norm": 28.723897823566663, + "learning_rate": 1.0855436423054531e-11, + "logits/chosen": -3.338587999343872, + "logits/rejected": -3.3330318927764893, + "logps/chosen": -2.0461111068725586, + "logps/rejected": -2.234325408935547, + "loss": 1.6759, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.092222213745117, + "rewards/margins": 0.37642863392829895, + "rewards/rejected": -4.468650817871094, + "step": 11510 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 24.48850772242874, + "learning_rate": 8.753194293770194e-12, + "logits/chosen": -3.4132132530212402, + "logits/rejected": -3.3848025798797607, + "logps/chosen": -1.9694095849990845, + "logps/rejected": -2.262876033782959, + "loss": 1.4767, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.938819169998169, + "rewards/margins": 0.5869328379631042, + "rewards/rejected": -4.525752067565918, + "step": 11520 + }, + { + "epoch": 1.9865609924190215, + "grad_norm": 22.24040123264218, + "learning_rate": 6.876949933631859e-12, + "logits/chosen": -3.3948769569396973, + "logits/rejected": -3.375946521759033, + "logps/chosen": -2.0476231575012207, + "logps/rejected": -2.3237972259521484, + "loss": 1.5832, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.095246315002441, + "rewards/margins": 0.5523483157157898, + "rewards/rejected": -4.647594451904297, + "step": 11530 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 25.9805197424792, + "learning_rate": 5.226720309656207e-12, + "logits/chosen": -3.4152884483337402, + "logits/rejected": -3.408207654953003, + "logps/chosen": -1.9097973108291626, + "logps/rejected": -2.2843213081359863, + "loss": 1.4078, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.819594621658325, + "rewards/margins": 0.7490483522415161, + "rewards/rejected": -4.568642616271973, + "step": 11540 + }, + { + "epoch": 1.9900068917987594, + "grad_norm": 21.885680324554908, + "learning_rate": 3.802520345000393e-12, + "logits/chosen": -3.373927354812622, + "logits/rejected": -3.3622632026672363, + "logps/chosen": -1.9245151281356812, + "logps/rejected": -2.288876533508301, + "loss": 1.4385, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.8490302562713623, + "rewards/margins": 0.7287222743034363, + "rewards/rejected": -4.577753067016602, + "step": 11550 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 25.641820880209643, + "learning_rate": 2.604362918812164e-12, + "logits/chosen": -3.3903298377990723, + "logits/rejected": -3.375819444656372, + "logps/chosen": -2.0744354724884033, + "logps/rejected": -2.308337450027466, + "loss": 1.5696, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.148870944976807, + "rewards/margins": 0.4678036570549011, + "rewards/rejected": -4.616674900054932, + "step": 11560 + }, + { + "epoch": 1.9934527911784976, + "grad_norm": 25.313996234247206, + "learning_rate": 1.6322588661216163e-12, + "logits/chosen": -3.389694929122925, + "logits/rejected": -3.3794150352478027, + "logps/chosen": -2.076744556427002, + "logps/rejected": -2.344348669052124, + "loss": 1.5647, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.153489112854004, + "rewards/margins": 0.5352082848548889, + "rewards/rejected": -4.688697338104248, + "step": 11570 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 28.35227576393564, + "learning_rate": 8.862169777440476e-13, + "logits/chosen": -3.4078574180603027, + "logits/rejected": -3.396538496017456, + "logps/chosen": -2.0853512287139893, + "logps/rejected": -2.338848829269409, + "loss": 1.5709, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.1707024574279785, + "rewards/margins": 0.5069957971572876, + "rewards/rejected": -4.677697658538818, + "step": 11580 + }, + { + "epoch": 1.9968986905582358, + "grad_norm": 27.308532562051173, + "learning_rate": 3.662440001883649e-13, + "logits/chosen": -3.3452811241149902, + "logits/rejected": -3.326258897781372, + "logps/chosen": -1.998936653137207, + "logps/rejected": -2.2670681476593018, + "loss": 1.6033, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.997873306274414, + "rewards/margins": 0.5362626314163208, + "rewards/rejected": -4.5341362953186035, + "step": 11590 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 28.641741272326293, + "learning_rate": 7.234463561267556e-14, + "logits/chosen": -3.364387035369873, + "logits/rejected": -3.3585898876190186, + "logps/chosen": -1.9972450733184814, + "logps/rejected": -2.3113553524017334, + "loss": 1.4693, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.994490146636963, + "rewards/margins": 0.6282204985618591, + "rewards/rejected": -4.622710704803467, + "step": 11600 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -3.445793628692627, + "eval_logits/rejected": -3.4421546459198, + "eval_logps/chosen": -1.8473639488220215, + "eval_logps/rejected": -2.0278360843658447, + "eval_loss": 1.6161540746688843, + "eval_rewards/accuracies": 0.622444212436676, + "eval_rewards/chosen": -3.694727897644043, + "eval_rewards/margins": 0.3609439730644226, + "eval_rewards/rejected": -4.0556721687316895, + "eval_runtime": 156.6802, + "eval_samples_per_second": 27.47, + "eval_steps_per_second": 3.434, + "step": 11600 + }, + { + "epoch": 2.0, + "step": 11608, + "total_flos": 0.0, + "train_loss": 1.6182483464356212, + "train_runtime": 54021.947, + "train_samples_per_second": 3.438, + "train_steps_per_second": 0.215 + } + ], + "logging_steps": 10, + "max_steps": 11608, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}