{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 4168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002399232245681382, "grad_norm": 4.7795046499064915, "learning_rate": 1.199040767386091e-09, "logits/chosen": -0.7570170760154724, "logits/rejected": -0.7606267929077148, "logps/chosen": -147.62075805664062, "logps/rejected": -139.63986206054688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0023992322456813818, "grad_norm": 5.0729607226586175, "learning_rate": 1.199040767386091e-08, "logits/chosen": -0.7337759137153625, "logits/rejected": -0.8291671872138977, "logps/chosen": -372.46026611328125, "logps/rejected": -298.1966247558594, "loss": 0.693, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0005176405538804829, "rewards/margins": 0.00119220616761595, "rewards/rejected": -0.0006745656137354672, "step": 10 }, { "epoch": 0.0047984644913627635, "grad_norm": 5.162819171123915, "learning_rate": 2.398081534772182e-08, "logits/chosen": -0.7522455453872681, "logits/rejected": -0.7984375953674316, "logps/chosen": -240.97720336914062, "logps/rejected": -211.13278198242188, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00016315083485096693, "rewards/margins": 0.0002667726075742394, "rewards/rejected": -0.00010362181637901813, "step": 20 }, { "epoch": 0.007197696737044146, "grad_norm": 4.74337539097734, "learning_rate": 3.597122302158273e-08, "logits/chosen": -0.7967968583106995, "logits/rejected": -0.8497036099433899, "logps/chosen": -252.3729705810547, "logps/rejected": -261.5249328613281, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0017296618316322565, "rewards/margins": -0.0007607643492519855, "rewards/rejected": -0.0009688973659649491, "step": 30 }, { "epoch": 0.009596928982725527, "grad_norm": 4.907565292084559, "learning_rate": 4.796163069544364e-08, "logits/chosen": -0.8299921154975891, "logits/rejected": -0.883353054523468, "logps/chosen": -268.02789306640625, "logps/rejected": -251.27548217773438, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 6.413871597032994e-05, "rewards/margins": 0.0001428989926353097, "rewards/rejected": -7.876028394093737e-05, "step": 40 }, { "epoch": 0.01199616122840691, "grad_norm": 5.291199439758451, "learning_rate": 5.995203836930455e-08, "logits/chosen": -0.7905577421188354, "logits/rejected": -0.8132292032241821, "logps/chosen": -273.465087890625, "logps/rejected": -236.5275421142578, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": -0.00024220789782702923, "rewards/margins": -0.0008948832983151078, "rewards/rejected": 0.0006526754004880786, "step": 50 }, { "epoch": 0.014395393474088292, "grad_norm": 5.461587897395331, "learning_rate": 7.194244604316546e-08, "logits/chosen": -0.8055087924003601, "logits/rejected": -0.7774447202682495, "logps/chosen": -279.95806884765625, "logps/rejected": -260.2548828125, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": -0.0018558722222223878, "rewards/margins": -0.0015968760708346963, "rewards/rejected": -0.00025899597676470876, "step": 60 }, { "epoch": 0.016794625719769675, "grad_norm": 4.789491285565402, "learning_rate": 8.393285371702638e-08, "logits/chosen": -0.6775354743003845, "logits/rejected": -0.6865079998970032, "logps/chosen": -284.73492431640625, "logps/rejected": -268.7757263183594, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0004096252378076315, "rewards/margins": -0.001045037293806672, "rewards/rejected": 0.0006354121142067015, "step": 70 }, { "epoch": 0.019193857965451054, "grad_norm": 5.22044457856631, "learning_rate": 9.592326139088728e-08, "logits/chosen": -0.7918148040771484, "logits/rejected": -0.6770384907722473, "logps/chosen": -193.21511840820312, "logps/rejected": -248.8389892578125, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0001543355901958421, "rewards/margins": 0.0017128061736002564, "rewards/rejected": -0.0015584708889946342, "step": 80 }, { "epoch": 0.021593090211132437, "grad_norm": 5.0135506626659065, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -0.860626220703125, "logits/rejected": -0.9020501971244812, "logps/chosen": -332.2583312988281, "logps/rejected": -287.39312744140625, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.847834836458787e-05, "rewards/margins": 0.00035616609966382384, "rewards/rejected": -0.00041464445530436933, "step": 90 }, { "epoch": 0.02399232245681382, "grad_norm": 5.263212106273348, "learning_rate": 1.199040767386091e-07, "logits/chosen": -0.7230492234230042, "logits/rejected": -0.6537036895751953, "logps/chosen": -265.91143798828125, "logps/rejected": -282.36163330078125, "loss": 0.6927, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00011261176405241713, "rewards/margins": 0.00016119341307785362, "rewards/rejected": -0.00027380516985431314, "step": 100 }, { "epoch": 0.026391554702495202, "grad_norm": 4.559210233997684, "learning_rate": 1.3189448441247004e-07, "logits/chosen": -0.8084124326705933, "logits/rejected": -0.838187038898468, "logps/chosen": -228.7566375732422, "logps/rejected": -229.68017578125, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": -0.00025232377811335027, "rewards/margins": 0.0021261090878397226, "rewards/rejected": -0.002378433011472225, "step": 110 }, { "epoch": 0.028790786948176585, "grad_norm": 4.987801425382141, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -0.7631937265396118, "logits/rejected": -0.8265846967697144, "logps/chosen": -287.75518798828125, "logps/rejected": -274.0089111328125, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0009368563769385219, "rewards/margins": 0.0011595094110816717, "rewards/rejected": -0.0020963659044355154, "step": 120 }, { "epoch": 0.031190019193857964, "grad_norm": 4.539938061260808, "learning_rate": 1.5587529976019183e-07, "logits/chosen": -0.8044384717941284, "logits/rejected": -0.7853862643241882, "logps/chosen": -208.2334747314453, "logps/rejected": -308.89727783203125, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0008903613197617233, "rewards/margins": 0.003986647818237543, "rewards/rejected": -0.004877009429037571, "step": 130 }, { "epoch": 0.03358925143953935, "grad_norm": 4.86277843753179, "learning_rate": 1.6786570743405277e-07, "logits/chosen": -0.59493488073349, "logits/rejected": -0.6423755288124084, "logps/chosen": -296.8682861328125, "logps/rejected": -286.8326721191406, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.0003669736906886101, "rewards/margins": 0.006478472147136927, "rewards/rejected": -0.006845445372164249, "step": 140 }, { "epoch": 0.03598848368522073, "grad_norm": 5.140685164747867, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -0.7444754838943481, "logits/rejected": -0.7507014870643616, "logps/chosen": -225.19686889648438, "logps/rejected": -223.80783081054688, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0018180795013904572, "rewards/margins": 0.006333982106298208, "rewards/rejected": -0.008152060210704803, "step": 150 }, { "epoch": 0.03838771593090211, "grad_norm": 5.210882534550865, "learning_rate": 1.9184652278177456e-07, "logits/chosen": -0.6324438452720642, "logits/rejected": -0.6643397212028503, "logps/chosen": -304.49700927734375, "logps/rejected": -235.69424438476562, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": -0.0027639209292829037, "rewards/margins": 0.008128685876727104, "rewards/rejected": -0.010892605409026146, "step": 160 }, { "epoch": 0.040786948176583494, "grad_norm": 4.6747728490323635, "learning_rate": 2.038369304556355e-07, "logits/chosen": -0.6203088164329529, "logits/rejected": -0.6542561650276184, "logps/chosen": -337.5506591796875, "logps/rejected": -324.4564208984375, "loss": 0.6879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0021853535436093807, "rewards/margins": 0.010294707491993904, "rewards/rejected": -0.012480061501264572, "step": 170 }, { "epoch": 0.04318618042226487, "grad_norm": 5.220785522583115, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -0.8055013418197632, "logits/rejected": -0.8089167475700378, "logps/chosen": -238.187744140625, "logps/rejected": -231.9917449951172, "loss": 0.6886, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004614435136318207, "rewards/margins": 0.018629100173711777, "rewards/rejected": -0.023243537172675133, "step": 180 }, { "epoch": 0.04558541266794626, "grad_norm": 5.9389729833935325, "learning_rate": 2.278177458033573e-07, "logits/chosen": -0.6565154790878296, "logits/rejected": -0.7033632397651672, "logps/chosen": -318.5955505371094, "logps/rejected": -258.2650451660156, "loss": 0.6882, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005404843017458916, "rewards/margins": 0.0013396486174315214, "rewards/rejected": -0.0067444914020597935, "step": 190 }, { "epoch": 0.04798464491362764, "grad_norm": 4.680422689180021, "learning_rate": 2.398081534772182e-07, "logits/chosen": -0.7583560347557068, "logits/rejected": -0.7128076553344727, "logps/chosen": -315.15093994140625, "logps/rejected": -300.1588134765625, "loss": 0.685, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0010681712301447988, "rewards/margins": 0.02815566025674343, "rewards/rejected": -0.029223833233118057, "step": 200 }, { "epoch": 0.05038387715930902, "grad_norm": 4.9609467030055505, "learning_rate": 2.517985611510791e-07, "logits/chosen": -0.7415071725845337, "logits/rejected": -0.7684369683265686, "logps/chosen": -241.5952911376953, "logps/rejected": -265.6112976074219, "loss": 0.6852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006404706742614508, "rewards/margins": 0.019062474370002747, "rewards/rejected": -0.02546718157827854, "step": 210 }, { "epoch": 0.052783109404990404, "grad_norm": 4.801299794937751, "learning_rate": 2.637889688249401e-07, "logits/chosen": -0.6760513186454773, "logits/rejected": -0.6948543190956116, "logps/chosen": -311.57281494140625, "logps/rejected": -320.0372619628906, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.010622961446642876, "rewards/margins": 0.013131847605109215, "rewards/rejected": -0.02375480905175209, "step": 220 }, { "epoch": 0.05518234165067178, "grad_norm": 5.226626430555928, "learning_rate": 2.7577937649880093e-07, "logits/chosen": -0.6577489376068115, "logits/rejected": -0.5907109975814819, "logps/chosen": -237.6232147216797, "logps/rejected": -272.1260986328125, "loss": 0.6815, "rewards/accuracies": 0.5, "rewards/chosen": -0.017997443675994873, "rewards/margins": 0.015421544201672077, "rewards/rejected": -0.033418990671634674, "step": 230 }, { "epoch": 0.05758157389635317, "grad_norm": 5.781159666071322, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -0.6846515536308289, "logits/rejected": -0.7044352293014526, "logps/chosen": -290.9536437988281, "logps/rejected": -245.861083984375, "loss": 0.6786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00569694209843874, "rewards/margins": 0.03246745467185974, "rewards/rejected": -0.038164399564266205, "step": 240 }, { "epoch": 0.05998080614203455, "grad_norm": 5.24934898048967, "learning_rate": 2.997601918465228e-07, "logits/chosen": -0.6797415614128113, "logits/rejected": -0.6883940696716309, "logps/chosen": -233.06948852539062, "logps/rejected": -224.38671875, "loss": 0.6791, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0182084571570158, "rewards/margins": 0.017979206517338753, "rewards/rejected": -0.03618766739964485, "step": 250 }, { "epoch": 0.06238003838771593, "grad_norm": 4.992367199190442, "learning_rate": 3.1175059952038366e-07, "logits/chosen": -0.7312067747116089, "logits/rejected": -0.6521024703979492, "logps/chosen": -271.38824462890625, "logps/rejected": -275.63653564453125, "loss": 0.6747, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02519085630774498, "rewards/margins": 0.04188547283411026, "rewards/rejected": -0.06707633286714554, "step": 260 }, { "epoch": 0.0647792706333973, "grad_norm": 5.17440656592256, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -0.6548904180526733, "logits/rejected": -0.7951699495315552, "logps/chosen": -282.63873291015625, "logps/rejected": -225.5301971435547, "loss": 0.6735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02142253890633583, "rewards/margins": 0.012398405000567436, "rewards/rejected": -0.033820949494838715, "step": 270 }, { "epoch": 0.0671785028790787, "grad_norm": 5.277296067925045, "learning_rate": 3.3573141486810554e-07, "logits/chosen": -0.7172076106071472, "logits/rejected": -0.6975899934768677, "logps/chosen": -296.5246887207031, "logps/rejected": -288.5944519042969, "loss": 0.6668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.018556680530309677, "rewards/margins": 0.05493815615773201, "rewards/rejected": -0.07349482923746109, "step": 280 }, { "epoch": 0.06957773512476008, "grad_norm": 4.781163975921149, "learning_rate": 3.477218225419664e-07, "logits/chosen": -0.6228010654449463, "logits/rejected": -0.5732084512710571, "logps/chosen": -288.7183532714844, "logps/rejected": -263.90496826171875, "loss": 0.6668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.021407321095466614, "rewards/margins": 0.04037974029779434, "rewards/rejected": -0.061787061393260956, "step": 290 }, { "epoch": 0.07197696737044146, "grad_norm": 5.423349784612897, "learning_rate": 3.597122302158273e-07, "logits/chosen": -0.7558736801147461, "logits/rejected": -0.7680533528327942, "logps/chosen": -264.5168762207031, "logps/rejected": -289.5086669921875, "loss": 0.6685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04736015945672989, "rewards/margins": 0.04246506839990616, "rewards/rejected": -0.08982523530721664, "step": 300 }, { "epoch": 0.07437619961612284, "grad_norm": 5.058813842555934, "learning_rate": 3.7170263788968827e-07, "logits/chosen": -0.6308411359786987, "logits/rejected": -0.6718063950538635, "logps/chosen": -270.92779541015625, "logps/rejected": -247.31472778320312, "loss": 0.6721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.044375158846378326, "rewards/margins": 0.06773975491523743, "rewards/rejected": -0.11211492121219635, "step": 310 }, { "epoch": 0.07677543186180422, "grad_norm": 4.805850462678792, "learning_rate": 3.836930455635491e-07, "logits/chosen": -0.7107186913490295, "logits/rejected": -0.729761004447937, "logps/chosen": -273.9128723144531, "logps/rejected": -246.8804168701172, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": -0.05261250585317612, "rewards/margins": 0.05193439871072769, "rewards/rejected": -0.10454690456390381, "step": 320 }, { "epoch": 0.07917466410748561, "grad_norm": 4.995629742536248, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -0.6203581094741821, "logits/rejected": -0.5326763391494751, "logps/chosen": -261.4637756347656, "logps/rejected": -304.72015380859375, "loss": 0.6605, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09535319358110428, "rewards/margins": 0.07053264230489731, "rewards/rejected": -0.1658858358860016, "step": 330 }, { "epoch": 0.08157389635316699, "grad_norm": 4.930244515909295, "learning_rate": 4.07673860911271e-07, "logits/chosen": -0.5719800591468811, "logits/rejected": -0.576286792755127, "logps/chosen": -240.22964477539062, "logps/rejected": -272.12823486328125, "loss": 0.6597, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05066823214292526, "rewards/margins": 0.10919372737407684, "rewards/rejected": -0.1598619669675827, "step": 340 }, { "epoch": 0.08397312859884837, "grad_norm": 5.465735130601733, "learning_rate": 4.1966426858513185e-07, "logits/chosen": -0.7455052137374878, "logits/rejected": -0.7238417863845825, "logps/chosen": -306.32135009765625, "logps/rejected": -303.4272766113281, "loss": 0.661, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10594834387302399, "rewards/margins": 0.061454661190509796, "rewards/rejected": -0.16740299761295319, "step": 350 }, { "epoch": 0.08637236084452975, "grad_norm": 5.474448104756549, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -0.6257158517837524, "logits/rejected": -0.6943267583847046, "logps/chosen": -263.67681884765625, "logps/rejected": -227.45315551757812, "loss": 0.6617, "rewards/accuracies": 0.625, "rewards/chosen": -0.10281842947006226, "rewards/margins": 0.06660200655460358, "rewards/rejected": -0.16942045092582703, "step": 360 }, { "epoch": 0.08877159309021113, "grad_norm": 5.5768966423340265, "learning_rate": 4.436450839328537e-07, "logits/chosen": -0.6406761407852173, "logits/rejected": -0.6252005696296692, "logps/chosen": -254.68783569335938, "logps/rejected": -282.90228271484375, "loss": 0.6526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13191966712474823, "rewards/margins": 0.1186646968126297, "rewards/rejected": -0.25058436393737793, "step": 370 }, { "epoch": 0.09117082533589252, "grad_norm": 4.831375981831261, "learning_rate": 4.556354916067146e-07, "logits/chosen": -0.6866484880447388, "logits/rejected": -0.633335530757904, "logps/chosen": -240.5032196044922, "logps/rejected": -261.5648498535156, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": -0.10136518627405167, "rewards/margins": 0.14594808220863342, "rewards/rejected": -0.2473132610321045, "step": 380 }, { "epoch": 0.0935700575815739, "grad_norm": 5.472261214448905, "learning_rate": 4.676258992805755e-07, "logits/chosen": -0.564619243144989, "logits/rejected": -0.5628719329833984, "logps/chosen": -286.52899169921875, "logps/rejected": -262.25323486328125, "loss": 0.6426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15715694427490234, "rewards/margins": 0.07161404192447662, "rewards/rejected": -0.22877097129821777, "step": 390 }, { "epoch": 0.09596928982725528, "grad_norm": 5.347372222074903, "learning_rate": 4.796163069544364e-07, "logits/chosen": -0.6121346354484558, "logits/rejected": -0.6788171529769897, "logps/chosen": -266.82220458984375, "logps/rejected": -270.36724853515625, "loss": 0.6421, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1677761971950531, "rewards/margins": 0.17789766192436218, "rewards/rejected": -0.3456738591194153, "step": 400 }, { "epoch": 0.09836852207293666, "grad_norm": 5.340397422554379, "learning_rate": 4.916067146282974e-07, "logits/chosen": -0.6538274884223938, "logits/rejected": -0.6384181976318359, "logps/chosen": -270.4076843261719, "logps/rejected": -313.26544189453125, "loss": 0.6258, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1868169903755188, "rewards/margins": 0.1523607075214386, "rewards/rejected": -0.3391777276992798, "step": 410 }, { "epoch": 0.10076775431861804, "grad_norm": 5.46694192660825, "learning_rate": 4.999992108529978e-07, "logits/chosen": -0.585496723651886, "logits/rejected": -0.5861325263977051, "logps/chosen": -343.7078857421875, "logps/rejected": -326.8913269042969, "loss": 0.6332, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23270578682422638, "rewards/margins": 0.18326610326766968, "rewards/rejected": -0.41597190499305725, "step": 420 }, { "epoch": 0.10316698656429943, "grad_norm": 5.835218906444854, "learning_rate": 4.999851817115532e-07, "logits/chosen": -0.7467209100723267, "logits/rejected": -0.6660154461860657, "logps/chosen": -266.6215515136719, "logps/rejected": -292.622802734375, "loss": 0.6348, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.22090856730937958, "rewards/margins": 0.29071298241615295, "rewards/rejected": -0.5116215348243713, "step": 430 }, { "epoch": 0.10556621880998081, "grad_norm": 5.421070680650146, "learning_rate": 4.999536171027889e-07, "logits/chosen": -0.5800718069076538, "logits/rejected": -0.6239966154098511, "logps/chosen": -310.7662353515625, "logps/rejected": -315.01727294921875, "loss": 0.6241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.30067840218544006, "rewards/margins": 0.14732010662555695, "rewards/rejected": -0.4479985237121582, "step": 440 }, { "epoch": 0.10796545105566219, "grad_norm": 6.239774740521307, "learning_rate": 4.999045192408369e-07, "logits/chosen": -0.5860768556594849, "logits/rejected": -0.5695077180862427, "logps/chosen": -266.48162841796875, "logps/rejected": -264.7235107421875, "loss": 0.6248, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3346542716026306, "rewards/margins": 0.12011837959289551, "rewards/rejected": -0.4547726511955261, "step": 450 }, { "epoch": 0.11036468330134357, "grad_norm": 6.029927081598656, "learning_rate": 4.998378915697171e-07, "logits/chosen": -0.6232699155807495, "logits/rejected": -0.6357511878013611, "logps/chosen": -296.1357116699219, "logps/rejected": -316.9218444824219, "loss": 0.6016, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.28828713297843933, "rewards/margins": 0.2776426374912262, "rewards/rejected": -0.5659297704696655, "step": 460 }, { "epoch": 0.11276391554702495, "grad_norm": 5.6604945230319625, "learning_rate": 4.997537387630958e-07, "logits/chosen": -0.5975057482719421, "logits/rejected": -0.6152404546737671, "logps/chosen": -235.7269744873047, "logps/rejected": -265.6780090332031, "loss": 0.5969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2783138155937195, "rewards/margins": 0.27526405453681946, "rewards/rejected": -0.5535778403282166, "step": 470 }, { "epoch": 0.11516314779270634, "grad_norm": 6.693255407381944, "learning_rate": 4.996520667239582e-07, "logits/chosen": -0.7205396294593811, "logits/rejected": -0.6762118339538574, "logps/chosen": -265.1204833984375, "logps/rejected": -349.3818054199219, "loss": 0.6045, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3487080931663513, "rewards/margins": 0.32969897985458374, "rewards/rejected": -0.6784070730209351, "step": 480 }, { "epoch": 0.11756238003838772, "grad_norm": 6.713235956519299, "learning_rate": 4.995328825841939e-07, "logits/chosen": -0.5713664293289185, "logits/rejected": -0.553689181804657, "logps/chosen": -246.03598022460938, "logps/rejected": -300.48797607421875, "loss": 0.5929, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.35947948694229126, "rewards/margins": 0.5059111714363098, "rewards/rejected": -0.8653906583786011, "step": 490 }, { "epoch": 0.1199616122840691, "grad_norm": 6.608316940106426, "learning_rate": 4.993961947040967e-07, "logits/chosen": -0.5497556924819946, "logits/rejected": -0.5836997032165527, "logps/chosen": -330.4705505371094, "logps/rejected": -313.43511962890625, "loss": 0.6159, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5261253118515015, "rewards/margins": 0.1777980625629425, "rewards/rejected": -0.7039234042167664, "step": 500 }, { "epoch": 0.12236084452975048, "grad_norm": 6.046831260369165, "learning_rate": 4.992420126717784e-07, "logits/chosen": -0.5983260869979858, "logits/rejected": -0.5698710680007935, "logps/chosen": -275.0146789550781, "logps/rejected": -332.9678039550781, "loss": 0.5951, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3116165101528168, "rewards/margins": 0.5498673319816589, "rewards/rejected": -0.8614838719367981, "step": 510 }, { "epoch": 0.12476007677543186, "grad_norm": 7.348411835249425, "learning_rate": 4.990703473024958e-07, "logits/chosen": -0.5138384103775024, "logits/rejected": -0.5233681201934814, "logps/chosen": -332.6457214355469, "logps/rejected": -353.2841796875, "loss": 0.621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5376469492912292, "rewards/margins": 0.28140488266944885, "rewards/rejected": -0.8190518617630005, "step": 520 }, { "epoch": 0.12715930902111325, "grad_norm": 7.638245462015975, "learning_rate": 4.98881210637893e-07, "logits/chosen": -0.6331408619880676, "logits/rejected": -0.5843578577041626, "logps/chosen": -253.6781005859375, "logps/rejected": -326.880126953125, "loss": 0.6034, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.44268542528152466, "rewards/margins": 0.36151397228240967, "rewards/rejected": -0.8041993379592896, "step": 530 }, { "epoch": 0.1295585412667946, "grad_norm": 5.437673001258359, "learning_rate": 4.986746159451553e-07, "logits/chosen": -0.5594351887702942, "logits/rejected": -0.5495598912239075, "logps/chosen": -293.2762756347656, "logps/rejected": -318.1295166015625, "loss": 0.6058, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.37716802954673767, "rewards/margins": 0.341768354177475, "rewards/rejected": -0.7189363837242126, "step": 540 }, { "epoch": 0.131957773512476, "grad_norm": 5.462303995641746, "learning_rate": 4.984505777160795e-07, "logits/chosen": -0.499727725982666, "logits/rejected": -0.5212177038192749, "logps/chosen": -356.2312927246094, "logps/rejected": -389.5164794921875, "loss": 0.6123, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5117184519767761, "rewards/margins": 0.31264665722846985, "rewards/rejected": -0.8243652582168579, "step": 550 }, { "epoch": 0.1343570057581574, "grad_norm": 6.563474963984782, "learning_rate": 4.982091116660574e-07, "logits/chosen": -0.6962921023368835, "logits/rejected": -0.7188105583190918, "logps/chosen": -248.63955688476562, "logps/rejected": -241.68521118164062, "loss": 0.6252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5236629843711853, "rewards/margins": 0.1912154257297516, "rewards/rejected": -0.7148783802986145, "step": 560 }, { "epoch": 0.13675623800383876, "grad_norm": 7.806740806376691, "learning_rate": 4.979502347329732e-07, "logits/chosen": -0.531540036201477, "logits/rejected": -0.5084825754165649, "logps/chosen": -359.24871826171875, "logps/rejected": -422.90606689453125, "loss": 0.6116, "rewards/accuracies": 0.75, "rewards/chosen": -0.6582085490226746, "rewards/margins": 0.3848935067653656, "rewards/rejected": -1.0431021451950073, "step": 570 }, { "epoch": 0.13915547024952016, "grad_norm": 8.43363363786984, "learning_rate": 4.976739650760151e-07, "logits/chosen": -0.6741775274276733, "logits/rejected": -0.6754758358001709, "logps/chosen": -318.57708740234375, "logps/rejected": -327.61669921875, "loss": 0.5905, "rewards/accuracies": 0.625, "rewards/chosen": -0.5984118580818176, "rewards/margins": 0.26561444997787476, "rewards/rejected": -0.8640263676643372, "step": 580 }, { "epoch": 0.14155470249520152, "grad_norm": 8.305901855269768, "learning_rate": 4.97380322074402e-07, "logits/chosen": -0.5298658609390259, "logits/rejected": -0.5446540117263794, "logps/chosen": -281.94403076171875, "logps/rejected": -312.7386474609375, "loss": 0.6096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6375762820243835, "rewards/margins": 0.3014167249202728, "rewards/rejected": -0.9389930963516235, "step": 590 }, { "epoch": 0.14395393474088292, "grad_norm": 6.932480462761205, "learning_rate": 4.970693263260237e-07, "logits/chosen": -0.6071778535842896, "logits/rejected": -0.6448204517364502, "logps/chosen": -334.240478515625, "logps/rejected": -351.1219177246094, "loss": 0.6118, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5482883453369141, "rewards/margins": 0.4772118031978607, "rewards/rejected": -1.0255001783370972, "step": 600 }, { "epoch": 0.1463531669865643, "grad_norm": 7.988462606950694, "learning_rate": 4.967409996459966e-07, "logits/chosen": -0.6251802444458008, "logits/rejected": -0.6402324438095093, "logps/chosen": -337.7198486328125, "logps/rejected": -350.0640563964844, "loss": 0.5869, "rewards/accuracies": 0.75, "rewards/chosen": -0.5524300932884216, "rewards/margins": 0.395042359828949, "rewards/rejected": -0.9474723935127258, "step": 610 }, { "epoch": 0.14875239923224567, "grad_norm": 6.807256286481946, "learning_rate": 4.963953650651326e-07, "logits/chosen": -0.5530382394790649, "logits/rejected": -0.5686200857162476, "logps/chosen": -411.8724670410156, "logps/rejected": -351.87310791015625, "loss": 0.5886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6493805646896362, "rewards/margins": 0.2970190942287445, "rewards/rejected": -0.9463998079299927, "step": 620 }, { "epoch": 0.15115163147792707, "grad_norm": 6.66103506943935, "learning_rate": 4.960324468283248e-07, "logits/chosen": -0.7188149690628052, "logits/rejected": -0.7464720010757446, "logps/chosen": -291.5167541503906, "logps/rejected": -326.8276672363281, "loss": 0.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6833261251449585, "rewards/margins": 0.3538287281990051, "rewards/rejected": -1.0371549129486084, "step": 630 }, { "epoch": 0.15355086372360843, "grad_norm": 7.385295168977064, "learning_rate": 4.956522703928451e-07, "logits/chosen": -0.7066096663475037, "logits/rejected": -0.6415206789970398, "logps/chosen": -306.70269775390625, "logps/rejected": -343.36724853515625, "loss": 0.5626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7203932404518127, "rewards/margins": 0.3621978163719177, "rewards/rejected": -1.082590937614441, "step": 640 }, { "epoch": 0.15595009596928983, "grad_norm": 9.96633693590498, "learning_rate": 4.952548624265606e-07, "logits/chosen": -0.5960395336151123, "logits/rejected": -0.5949414372444153, "logps/chosen": -375.6698303222656, "logps/rejected": -379.85662841796875, "loss": 0.6024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9392830729484558, "rewards/margins": 0.2532385289669037, "rewards/rejected": -1.1925214529037476, "step": 650 }, { "epoch": 0.15834932821497122, "grad_norm": 7.433452911452113, "learning_rate": 4.948402508060607e-07, "logits/chosen": -0.6946985721588135, "logits/rejected": -0.714805006980896, "logps/chosen": -307.7847900390625, "logps/rejected": -352.8008117675781, "loss": 0.6016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7437388896942139, "rewards/margins": 0.5146031379699707, "rewards/rejected": -1.2583420276641846, "step": 660 }, { "epoch": 0.16074856046065258, "grad_norm": 8.260908471494544, "learning_rate": 4.944084646147038e-07, "logits/chosen": -0.6452184319496155, "logits/rejected": -0.6763893961906433, "logps/chosen": -397.63629150390625, "logps/rejected": -404.8137512207031, "loss": 0.6115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8227389454841614, "rewards/margins": 0.29742100834846497, "rewards/rejected": -1.1201599836349487, "step": 670 }, { "epoch": 0.16314779270633398, "grad_norm": 8.266084905632274, "learning_rate": 4.939595341405754e-07, "logits/chosen": -0.7573758363723755, "logits/rejected": -0.780553936958313, "logps/chosen": -331.1211853027344, "logps/rejected": -373.88775634765625, "loss": 0.5759, "rewards/accuracies": 0.75, "rewards/chosen": -0.7678017616271973, "rewards/margins": 0.4917237162590027, "rewards/rejected": -1.2595255374908447, "step": 680 }, { "epoch": 0.16554702495201534, "grad_norm": 7.625957912581509, "learning_rate": 4.93493490874365e-07, "logits/chosen": -0.6451135873794556, "logits/rejected": -0.6591531038284302, "logps/chosen": -325.8547058105469, "logps/rejected": -366.7559814453125, "loss": 0.5467, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.821052074432373, "rewards/margins": 0.33492714166641235, "rewards/rejected": -1.1559793949127197, "step": 690 }, { "epoch": 0.16794625719769674, "grad_norm": 10.17202506828973, "learning_rate": 4.93010367507156e-07, "logits/chosen": -0.7482548356056213, "logits/rejected": -0.7481337189674377, "logps/chosen": -276.33294677734375, "logps/rejected": -313.0743103027344, "loss": 0.5607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.673913300037384, "rewards/margins": 0.6004130244255066, "rewards/rejected": -1.2743263244628906, "step": 700 }, { "epoch": 0.17034548944337813, "grad_norm": 9.522996639673643, "learning_rate": 4.925101979281332e-07, "logits/chosen": -0.6719304919242859, "logits/rejected": -0.7497730851173401, "logps/chosen": -369.2864685058594, "logps/rejected": -377.1894226074219, "loss": 0.5774, "rewards/accuracies": 0.75, "rewards/chosen": -0.7069920301437378, "rewards/margins": 0.6167112588882446, "rewards/rejected": -1.3237032890319824, "step": 710 }, { "epoch": 0.1727447216890595, "grad_norm": 7.878026358158539, "learning_rate": 4.919930172222054e-07, "logits/chosen": -0.7336487174034119, "logits/rejected": -0.7846344709396362, "logps/chosen": -344.8207702636719, "logps/rejected": -387.56201171875, "loss": 0.5369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8977760076522827, "rewards/margins": 0.49707236886024475, "rewards/rejected": -1.3948484659194946, "step": 720 }, { "epoch": 0.1751439539347409, "grad_norm": 8.810480029842584, "learning_rate": 4.914588616675445e-07, "logits/chosen": -0.8329795598983765, "logits/rejected": -0.8481542468070984, "logps/chosen": -279.27081298828125, "logps/rejected": -337.36883544921875, "loss": 0.5906, "rewards/accuracies": 0.75, "rewards/chosen": -0.61821049451828, "rewards/margins": 0.4857397675514221, "rewards/rejected": -1.1039502620697021, "step": 730 }, { "epoch": 0.17754318618042225, "grad_norm": 9.762970453211091, "learning_rate": 4.909077687330404e-07, "logits/chosen": -0.6972378492355347, "logits/rejected": -0.7455834150314331, "logps/chosen": -354.2886657714844, "logps/rejected": -357.53558349609375, "loss": 0.553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7641565799713135, "rewards/margins": 0.3690626621246338, "rewards/rejected": -1.1332192420959473, "step": 740 }, { "epoch": 0.17994241842610365, "grad_norm": 9.234397345125467, "learning_rate": 4.903397770756729e-07, "logits/chosen": -0.7595505714416504, "logits/rejected": -0.7833656668663025, "logps/chosen": -351.7059020996094, "logps/rejected": -407.5093688964844, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8937844038009644, "rewards/margins": 0.57142174243927, "rewards/rejected": -1.4652063846588135, "step": 750 }, { "epoch": 0.18234165067178504, "grad_norm": 7.71688649402669, "learning_rate": 4.897549265378004e-07, "logits/chosen": -0.7180362939834595, "logits/rejected": -0.7230840921401978, "logps/chosen": -430.9496154785156, "logps/rejected": -468.16937255859375, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": -1.040961503982544, "rewards/margins": 0.4251536428928375, "rewards/rejected": -1.4661149978637695, "step": 760 }, { "epoch": 0.1847408829174664, "grad_norm": 8.153325048910528, "learning_rate": 4.891532581443643e-07, "logits/chosen": -0.8323251008987427, "logits/rejected": -0.8525883555412292, "logps/chosen": -381.8115234375, "logps/rejected": -471.56719970703125, "loss": 0.5333, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7869538068771362, "rewards/margins": 0.9414682388305664, "rewards/rejected": -1.7284221649169922, "step": 770 }, { "epoch": 0.1871401151631478, "grad_norm": 8.422916796673588, "learning_rate": 4.885348141000122e-07, "logits/chosen": -0.7381910085678101, "logits/rejected": -0.7251767516136169, "logps/chosen": -332.1246032714844, "logps/rejected": -410.90057373046875, "loss": 0.5641, "rewards/accuracies": 0.75, "rewards/chosen": -0.8256322741508484, "rewards/margins": 0.6119931936264038, "rewards/rejected": -1.4376256465911865, "step": 780 }, { "epoch": 0.18953934740882916, "grad_norm": 9.466411263035182, "learning_rate": 4.878996377861367e-07, "logits/chosen": -0.86748206615448, "logits/rejected": -0.8805437088012695, "logps/chosen": -308.0349426269531, "logps/rejected": -359.1808166503906, "loss": 0.5249, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.915812611579895, "rewards/margins": 0.4137847423553467, "rewards/rejected": -1.3295972347259521, "step": 790 }, { "epoch": 0.19193857965451055, "grad_norm": 9.035814907343086, "learning_rate": 4.872477737578327e-07, "logits/chosen": -0.830212414264679, "logits/rejected": -0.7413343787193298, "logps/chosen": -373.04217529296875, "logps/rejected": -468.3797302246094, "loss": 0.5248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.003941535949707, "rewards/margins": 0.9732707738876343, "rewards/rejected": -1.9772125482559204, "step": 800 }, { "epoch": 0.19433781190019195, "grad_norm": 11.703896273776289, "learning_rate": 4.865792677407718e-07, "logits/chosen": -0.8188837766647339, "logits/rejected": -0.8448683023452759, "logps/chosen": -368.94512939453125, "logps/rejected": -377.5952453613281, "loss": 0.5712, "rewards/accuracies": 0.625, "rewards/chosen": -1.1495884656906128, "rewards/margins": 0.362571656703949, "rewards/rejected": -1.512160062789917, "step": 810 }, { "epoch": 0.1967370441458733, "grad_norm": 10.81943352397323, "learning_rate": 4.858941666279955e-07, "logits/chosen": -0.8488418459892273, "logits/rejected": -0.880780816078186, "logps/chosen": -353.0042419433594, "logps/rejected": -379.8878479003906, "loss": 0.5742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.79103684425354, "rewards/margins": 0.4048077464103699, "rewards/rejected": -1.1958444118499756, "step": 820 }, { "epoch": 0.1991362763915547, "grad_norm": 8.925559856487725, "learning_rate": 4.851925184766247e-07, "logits/chosen": -0.8389931917190552, "logits/rejected": -0.8778663873672485, "logps/chosen": -348.4869079589844, "logps/rejected": -395.1742248535156, "loss": 0.5534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9398209452629089, "rewards/margins": 0.6886889934539795, "rewards/rejected": -1.6285101175308228, "step": 830 }, { "epoch": 0.20153550863723607, "grad_norm": 11.109685433240054, "learning_rate": 4.844743725044897e-07, "logits/chosen": -0.8412739038467407, "logits/rejected": -0.9312038421630859, "logps/chosen": -339.4559631347656, "logps/rejected": -370.5246887207031, "loss": 0.5445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9467164278030396, "rewards/margins": 0.5613173246383667, "rewards/rejected": -1.5080337524414062, "step": 840 }, { "epoch": 0.20393474088291746, "grad_norm": 10.72753800655601, "learning_rate": 4.837397790866774e-07, "logits/chosen": -0.8491243124008179, "logits/rejected": -0.8605045080184937, "logps/chosen": -366.4127502441406, "logps/rejected": -440.4471130371094, "loss": 0.5521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7613908052444458, "rewards/margins": 0.9519069790840149, "rewards/rejected": -1.7132980823516846, "step": 850 }, { "epoch": 0.20633397312859886, "grad_norm": 10.004722580523001, "learning_rate": 4.829887897519974e-07, "logits/chosen": -0.9373795390129089, "logits/rejected": -0.9096584320068359, "logps/chosen": -319.7113952636719, "logps/rejected": -403.3350524902344, "loss": 0.5559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8845510482788086, "rewards/margins": 0.5768999457359314, "rewards/rejected": -1.4614509344100952, "step": 860 }, { "epoch": 0.20873320537428022, "grad_norm": 9.07756678312938, "learning_rate": 4.82221457179368e-07, "logits/chosen": -0.869964599609375, "logits/rejected": -0.871699333190918, "logps/chosen": -354.28704833984375, "logps/rejected": -417.8169860839844, "loss": 0.5254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8736898303031921, "rewards/margins": 0.754965603351593, "rewards/rejected": -1.6286554336547852, "step": 870 }, { "epoch": 0.21113243761996162, "grad_norm": 11.276381279159365, "learning_rate": 4.814378351941206e-07, "logits/chosen": -0.8650039434432983, "logits/rejected": -0.8803671002388, "logps/chosen": -343.48822021484375, "logps/rejected": -381.94207763671875, "loss": 0.5619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9373790621757507, "rewards/margins": 0.47534435987472534, "rewards/rejected": -1.4127235412597656, "step": 880 }, { "epoch": 0.21353166986564298, "grad_norm": 10.076876257674526, "learning_rate": 4.806379787642241e-07, "logits/chosen": -0.8381707072257996, "logits/rejected": -0.8108996152877808, "logps/chosen": -331.9487609863281, "logps/rejected": -403.20867919921875, "loss": 0.5839, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8763322830200195, "rewards/margins": 0.624114990234375, "rewards/rejected": -1.500447392463684, "step": 890 }, { "epoch": 0.21593090211132437, "grad_norm": 10.034596459189128, "learning_rate": 4.798219439964293e-07, "logits/chosen": -0.8501941561698914, "logits/rejected": -0.8989803194999695, "logps/chosen": -343.5841979980469, "logps/rejected": -392.810791015625, "loss": 0.5347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0320571660995483, "rewards/margins": 0.36501601338386536, "rewards/rejected": -1.3970732688903809, "step": 900 }, { "epoch": 0.21833013435700577, "grad_norm": 10.27150503174172, "learning_rate": 4.78989788132333e-07, "logits/chosen": -0.902818500995636, "logits/rejected": -0.8824566006660461, "logps/chosen": -300.7974853515625, "logps/rejected": -388.55126953125, "loss": 0.5081, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8453769683837891, "rewards/margins": 0.8020466566085815, "rewards/rejected": -1.647423505783081, "step": 910 }, { "epoch": 0.22072936660268713, "grad_norm": 8.577531584564799, "learning_rate": 4.781415695443631e-07, "logits/chosen": -0.7745347619056702, "logits/rejected": -0.7863970398902893, "logps/chosen": -420.1585388183594, "logps/rejected": -460.1143493652344, "loss": 0.5703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2678101062774658, "rewards/margins": 0.29927223920822144, "rewards/rejected": -1.5670822858810425, "step": 920 }, { "epoch": 0.22312859884836853, "grad_norm": 7.712609609832512, "learning_rate": 4.772773477316836e-07, "logits/chosen": -0.7534626722335815, "logits/rejected": -0.7615676522254944, "logps/chosen": -395.2740478515625, "logps/rejected": -448.7425231933594, "loss": 0.5444, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.115117073059082, "rewards/margins": 0.5547926425933838, "rewards/rejected": -1.6699097156524658, "step": 930 }, { "epoch": 0.2255278310940499, "grad_norm": 14.001350284298912, "learning_rate": 4.7639718331602117e-07, "logits/chosen": -0.7357865571975708, "logits/rejected": -0.7330624461174011, "logps/chosen": -353.99200439453125, "logps/rejected": -431.35565185546875, "loss": 0.5372, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8342105150222778, "rewards/margins": 0.878664493560791, "rewards/rejected": -1.7128750085830688, "step": 940 }, { "epoch": 0.22792706333973128, "grad_norm": 10.221546212484748, "learning_rate": 4.7550113803741275e-07, "logits/chosen": -0.7402353882789612, "logits/rejected": -0.8165884017944336, "logps/chosen": -379.7347717285156, "logps/rejected": -362.58721923828125, "loss": 0.5637, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0031925439834595, "rewards/margins": 0.4585431218147278, "rewards/rejected": -1.461735486984253, "step": 950 }, { "epoch": 0.23032629558541268, "grad_norm": 10.091860887109807, "learning_rate": 4.7458927474987454e-07, "logits/chosen": -0.699491560459137, "logits/rejected": -0.7056708931922913, "logps/chosen": -411.66754150390625, "logps/rejected": -392.9963073730469, "loss": 0.5194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8979185223579407, "rewards/margins": 0.4219675064086914, "rewards/rejected": -1.3198859691619873, "step": 960 }, { "epoch": 0.23272552783109404, "grad_norm": 10.700825723854395, "learning_rate": 4.7366165741699347e-07, "logits/chosen": -0.7538058161735535, "logits/rejected": -0.7752776145935059, "logps/chosen": -426.29840087890625, "logps/rejected": -456.43865966796875, "loss": 0.5367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0425876379013062, "rewards/margins": 0.5545080900192261, "rewards/rejected": -1.5970958471298218, "step": 970 }, { "epoch": 0.23512476007677544, "grad_norm": 9.318743253411762, "learning_rate": 4.727183511074401e-07, "logits/chosen": -0.8688480257987976, "logits/rejected": -0.8663204908370972, "logps/chosen": -376.7490234375, "logps/rejected": -404.92816162109375, "loss": 0.5436, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.009129285812378, "rewards/margins": 0.382347047328949, "rewards/rejected": -1.3914763927459717, "step": 980 }, { "epoch": 0.2375239923224568, "grad_norm": 10.050271627816938, "learning_rate": 4.717594219904043e-07, "logits/chosen": -0.7565699815750122, "logits/rejected": -0.805103600025177, "logps/chosen": -377.40472412109375, "logps/rejected": -379.1291809082031, "loss": 0.5345, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0759207010269165, "rewards/margins": 0.45810168981552124, "rewards/rejected": -1.534022331237793, "step": 990 }, { "epoch": 0.2399232245681382, "grad_norm": 9.560726789203981, "learning_rate": 4.7078493733095393e-07, "logits/chosen": -0.9027126431465149, "logits/rejected": -0.9078402519226074, "logps/chosen": -370.6812744140625, "logps/rejected": -446.0091247558594, "loss": 0.5318, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.204393982887268, "rewards/margins": 0.6276249289512634, "rewards/rejected": -1.8320188522338867, "step": 1000 }, { "epoch": 0.2423224568138196, "grad_norm": 9.375367945145504, "learning_rate": 4.6979496548531614e-07, "logits/chosen": -0.8843992352485657, "logits/rejected": -0.8497310876846313, "logps/chosen": -399.2445373535156, "logps/rejected": -512.7918701171875, "loss": 0.5464, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4072428941726685, "rewards/margins": 0.6233394742012024, "rewards/rejected": -2.0305821895599365, "step": 1010 }, { "epoch": 0.24472168905950095, "grad_norm": 9.752181299642505, "learning_rate": 4.6878957589608293e-07, "logits/chosen": -0.8666139841079712, "logits/rejected": -0.8512627482414246, "logps/chosen": -390.6292724609375, "logps/rejected": -506.28228759765625, "loss": 0.5451, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2309002876281738, "rewards/margins": 0.7173486351966858, "rewards/rejected": -1.9482488632202148, "step": 1020 }, { "epoch": 0.24712092130518235, "grad_norm": 9.424652839992445, "learning_rate": 4.6776883908733956e-07, "logits/chosen": -0.9692492485046387, "logits/rejected": -1.0165684223175049, "logps/chosen": -394.48858642578125, "logps/rejected": -401.189697265625, "loss": 0.5154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0245649814605713, "rewards/margins": 0.7163118720054626, "rewards/rejected": -1.7408767938613892, "step": 1030 }, { "epoch": 0.2495201535508637, "grad_norm": 12.277316191566172, "learning_rate": 4.667328266597178e-07, "logits/chosen": -0.8959840536117554, "logits/rejected": -0.9235955476760864, "logps/chosen": -361.2985534667969, "logps/rejected": -423.58233642578125, "loss": 0.5001, "rewards/accuracies": 0.75, "rewards/chosen": -0.9520134925842285, "rewards/margins": 0.6992291808128357, "rewards/rejected": -1.6512426137924194, "step": 1040 }, { "epoch": 0.2519193857965451, "grad_norm": 8.94451963423609, "learning_rate": 4.6568161128537354e-07, "logits/chosen": -0.8420774340629578, "logits/rejected": -0.9481871724128723, "logps/chosen": -380.89874267578125, "logps/rejected": -393.58160400390625, "loss": 0.5246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2700251340866089, "rewards/margins": 0.634412407875061, "rewards/rejected": -1.9044376611709595, "step": 1050 }, { "epoch": 0.2543186180422265, "grad_norm": 14.034213989285222, "learning_rate": 4.6461526670288877e-07, "logits/chosen": -0.8429055213928223, "logits/rejected": -0.8352988958358765, "logps/chosen": -404.8660888671875, "logps/rejected": -438.3995056152344, "loss": 0.5668, "rewards/accuracies": 0.75, "rewards/chosen": -1.1988664865493774, "rewards/margins": 0.6478247046470642, "rewards/rejected": -1.8466911315917969, "step": 1060 }, { "epoch": 0.2567178502879079, "grad_norm": 13.373585055843934, "learning_rate": 4.635338677120994e-07, "logits/chosen": -0.9964144825935364, "logits/rejected": -0.9940506815910339, "logps/chosen": -377.0820617675781, "logps/rejected": -486.6973571777344, "loss": 0.4774, "rewards/accuracies": 0.75, "rewards/chosen": -1.1816045045852661, "rewards/margins": 0.8956181406974792, "rewards/rejected": -2.0772225856781006, "step": 1070 }, { "epoch": 0.2591170825335892, "grad_norm": 11.284998984427544, "learning_rate": 4.6243749016884835e-07, "logits/chosen": -0.7764107584953308, "logits/rejected": -0.8370550870895386, "logps/chosen": -418.0027770996094, "logps/rejected": -593.8638916015625, "loss": 0.5251, "rewards/accuracies": 0.875, "rewards/chosen": -1.493062138557434, "rewards/margins": 1.221855640411377, "rewards/rejected": -2.7149176597595215, "step": 1080 }, { "epoch": 0.2615163147792706, "grad_norm": 16.100264142741796, "learning_rate": 4.613262109796645e-07, "logits/chosen": -0.9073816537857056, "logits/rejected": -0.8367312550544739, "logps/chosen": -432.54095458984375, "logps/rejected": -573.1912841796875, "loss": 0.5319, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6614210605621338, "rewards/margins": 0.9968196749687195, "rewards/rejected": -2.658240556716919, "step": 1090 }, { "epoch": 0.263915547024952, "grad_norm": 12.598181058336907, "learning_rate": 4.602001080963678e-07, "logits/chosen": -0.8620105981826782, "logits/rejected": -0.8818934559822083, "logps/chosen": -452.8822326660156, "logps/rejected": -499.30694580078125, "loss": 0.5321, "rewards/accuracies": 0.75, "rewards/chosen": -1.6771061420440674, "rewards/margins": 0.7576761841773987, "rewards/rejected": -2.4347822666168213, "step": 1100 }, { "epoch": 0.2663147792706334, "grad_norm": 11.020616319405283, "learning_rate": 4.590592605106017e-07, "logits/chosen": -0.925918698310852, "logits/rejected": -0.9431027173995972, "logps/chosen": -427.30279541015625, "logps/rejected": -476.481201171875, "loss": 0.5515, "rewards/accuracies": 0.875, "rewards/chosen": -1.2021554708480835, "rewards/margins": 0.7562609910964966, "rewards/rejected": -1.9584165811538696, "step": 1110 }, { "epoch": 0.2687140115163148, "grad_norm": 11.43283460001136, "learning_rate": 4.5790374824829165e-07, "logits/chosen": -0.8499002456665039, "logits/rejected": -0.8679935336112976, "logps/chosen": -310.17327880859375, "logps/rejected": -387.49249267578125, "loss": 0.5063, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2178689241409302, "rewards/margins": 0.6209184527397156, "rewards/rejected": -1.8387874364852905, "step": 1120 }, { "epoch": 0.27111324376199614, "grad_norm": 10.510361605616744, "learning_rate": 4.5673365236403216e-07, "logits/chosen": -0.882469654083252, "logits/rejected": -0.9395925402641296, "logps/chosen": -300.57720947265625, "logps/rejected": -434.6649475097656, "loss": 0.5097, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0626921653747559, "rewards/margins": 1.0293641090393066, "rewards/rejected": -2.0920560359954834, "step": 1130 }, { "epoch": 0.27351247600767753, "grad_norm": 11.418966211240742, "learning_rate": 4.5554905493540075e-07, "logits/chosen": -0.9051049947738647, "logits/rejected": -0.880601704120636, "logps/chosen": -340.77911376953125, "logps/rejected": -454.90252685546875, "loss": 0.4885, "rewards/accuracies": 0.75, "rewards/chosen": -1.225037932395935, "rewards/margins": 1.0688270330429077, "rewards/rejected": -2.2938647270202637, "step": 1140 }, { "epoch": 0.2759117082533589, "grad_norm": 9.72460464054391, "learning_rate": 4.5435003905720074e-07, "logits/chosen": -0.8102267980575562, "logits/rejected": -0.8643985986709595, "logps/chosen": -390.0484619140625, "logps/rejected": -450.08868408203125, "loss": 0.5012, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0651195049285889, "rewards/margins": 0.8835436701774597, "rewards/rejected": -1.9486631155014038, "step": 1150 }, { "epoch": 0.2783109404990403, "grad_norm": 13.360087004422779, "learning_rate": 4.531366888356324e-07, "logits/chosen": -0.8684479594230652, "logits/rejected": -0.8134763836860657, "logps/chosen": -311.7840881347656, "logps/rejected": -458.7945861816406, "loss": 0.4901, "rewards/accuracies": 0.75, "rewards/chosen": -1.3148356676101685, "rewards/margins": 1.0154473781585693, "rewards/rejected": -2.3302829265594482, "step": 1160 }, { "epoch": 0.2807101727447217, "grad_norm": 17.680025948725728, "learning_rate": 4.519090893823931e-07, "logits/chosen": -0.830313503742218, "logits/rejected": -0.8483401536941528, "logps/chosen": -398.6609802246094, "logps/rejected": -483.01348876953125, "loss": 0.4966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.591963291168213, "rewards/margins": 0.8999356031417847, "rewards/rejected": -2.491899013519287, "step": 1170 }, { "epoch": 0.28310940499040305, "grad_norm": 10.953334824928836, "learning_rate": 4.5066732680870734e-07, "logits/chosen": -0.8117620348930359, "logits/rejected": -0.855734646320343, "logps/chosen": -401.9914245605469, "logps/rejected": -469.42218017578125, "loss": 0.5003, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4848546981811523, "rewards/margins": 1.069037914276123, "rewards/rejected": -2.5538926124572754, "step": 1180 }, { "epoch": 0.28550863723608444, "grad_norm": 12.957936678071963, "learning_rate": 4.494114882192862e-07, "logits/chosen": -0.8582413792610168, "logits/rejected": -0.8433802723884583, "logps/chosen": -400.0009460449219, "logps/rejected": -485.3779296875, "loss": 0.4816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.388290524482727, "rewards/margins": 1.1432268619537354, "rewards/rejected": -2.531517267227173, "step": 1190 }, { "epoch": 0.28790786948176583, "grad_norm": 12.79986874331055, "learning_rate": 4.4814166170621735e-07, "logits/chosen": -0.8589996099472046, "logits/rejected": -0.8760782480239868, "logps/chosen": -367.97552490234375, "logps/rejected": -463.1073303222656, "loss": 0.5092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2390401363372803, "rewards/margins": 1.1767470836639404, "rewards/rejected": -2.4157872200012207, "step": 1200 }, { "epoch": 0.2903071017274472, "grad_norm": 12.234631147028441, "learning_rate": 4.468579363427858e-07, "logits/chosen": -0.8185877799987793, "logits/rejected": -0.8393834829330444, "logps/chosen": -394.2176513671875, "logps/rejected": -454.900146484375, "loss": 0.5058, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4075071811676025, "rewards/margins": 0.8511323928833008, "rewards/rejected": -2.2586395740509033, "step": 1210 }, { "epoch": 0.2927063339731286, "grad_norm": 17.553407135322153, "learning_rate": 4.4556040217722555e-07, "logits/chosen": -0.8887416124343872, "logits/rejected": -0.860831618309021, "logps/chosen": -352.8416748046875, "logps/rejected": -510.03546142578125, "loss": 0.4867, "rewards/accuracies": 0.875, "rewards/chosen": -1.1481072902679443, "rewards/margins": 1.2108467817306519, "rewards/rejected": -2.3589539527893066, "step": 1220 }, { "epoch": 0.29510556621880996, "grad_norm": 11.868156097873015, "learning_rate": 4.442491502264033e-07, "logits/chosen": -0.8076246976852417, "logits/rejected": -0.8352873921394348, "logps/chosen": -364.78826904296875, "logps/rejected": -407.4501953125, "loss": 0.5063, "rewards/accuracies": 0.625, "rewards/chosen": -1.4375666379928589, "rewards/margins": 0.5911107659339905, "rewards/rejected": -2.028677463531494, "step": 1230 }, { "epoch": 0.29750479846449135, "grad_norm": 11.089930522236687, "learning_rate": 4.429242724694338e-07, "logits/chosen": -0.8699033856391907, "logits/rejected": -0.8417544364929199, "logps/chosen": -404.20159912109375, "logps/rejected": -516.6590576171875, "loss": 0.5064, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.583553433418274, "rewards/margins": 0.9518542289733887, "rewards/rejected": -2.535407543182373, "step": 1240 }, { "epoch": 0.29990403071017274, "grad_norm": 11.987389579927841, "learning_rate": 4.4158586184122817e-07, "logits/chosen": -0.8027983903884888, "logits/rejected": -0.8380182981491089, "logps/chosen": -424.4278259277344, "logps/rejected": -504.82476806640625, "loss": 0.4858, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.338322401046753, "rewards/margins": 1.1540087461471558, "rewards/rejected": -2.4923312664031982, "step": 1250 }, { "epoch": 0.30230326295585414, "grad_norm": 11.726927303441652, "learning_rate": 4.4023401222597443e-07, "logits/chosen": -0.7710140943527222, "logits/rejected": -0.8672100901603699, "logps/chosen": -440.61566162109375, "logps/rejected": -509.686767578125, "loss": 0.4782, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5568013191223145, "rewards/margins": 0.9064668416976929, "rewards/rejected": -2.463268518447876, "step": 1260 }, { "epoch": 0.30470249520153553, "grad_norm": 13.87871055626238, "learning_rate": 4.3886881845055235e-07, "logits/chosen": -0.8198641538619995, "logits/rejected": -0.8790807723999023, "logps/chosen": -392.24658203125, "logps/rejected": -504.7674255371094, "loss": 0.4534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4889254570007324, "rewards/margins": 1.196481466293335, "rewards/rejected": -2.6854069232940674, "step": 1270 }, { "epoch": 0.30710172744721687, "grad_norm": 10.998393295453823, "learning_rate": 4.374903762778814e-07, "logits/chosen": -0.8656896352767944, "logits/rejected": -0.8914599418640137, "logps/chosen": -444.3971252441406, "logps/rejected": -520.9356689453125, "loss": 0.4696, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8613601922988892, "rewards/margins": 1.0016160011291504, "rewards/rejected": -2.862975835800171, "step": 1280 }, { "epoch": 0.30950095969289826, "grad_norm": 12.50531025670152, "learning_rate": 4.3609878240020356e-07, "logits/chosen": -0.8085900545120239, "logits/rejected": -0.8679038882255554, "logps/chosen": -484.76251220703125, "logps/rejected": -533.459716796875, "loss": 0.4863, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8432880640029907, "rewards/margins": 1.1199102401733398, "rewards/rejected": -2.963197946548462, "step": 1290 }, { "epoch": 0.31190019193857965, "grad_norm": 11.511964627048417, "learning_rate": 4.346941344323005e-07, "logits/chosen": -0.8386822938919067, "logits/rejected": -0.9056866765022278, "logps/chosen": -432.3689880371094, "logps/rejected": -450.2909240722656, "loss": 0.5441, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9001528024673462, "rewards/margins": 0.6564325094223022, "rewards/rejected": -2.5565853118896484, "step": 1300 }, { "epoch": 0.31429942418426104, "grad_norm": 11.097528659174904, "learning_rate": 4.332765309046467e-07, "logits/chosen": -0.7923992276191711, "logits/rejected": -0.7886919379234314, "logps/chosen": -419.7078552246094, "logps/rejected": -483.94049072265625, "loss": 0.5005, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4908955097198486, "rewards/margins": 1.0034749507904053, "rewards/rejected": -2.494370460510254, "step": 1310 }, { "epoch": 0.31669865642994244, "grad_norm": 15.474416362716356, "learning_rate": 4.3184607125649754e-07, "logits/chosen": -0.8138014078140259, "logits/rejected": -0.8160893321037292, "logps/chosen": -396.61572265625, "logps/rejected": -518.4467163085938, "loss": 0.5069, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2408325672149658, "rewards/margins": 1.0694630146026611, "rewards/rejected": -2.310295581817627, "step": 1320 }, { "epoch": 0.3190978886756238, "grad_norm": 10.27191089173125, "learning_rate": 4.304028558289141e-07, "logits/chosen": -0.8450434803962708, "logits/rejected": -0.8716680407524109, "logps/chosen": -392.87164306640625, "logps/rejected": -477.4569396972656, "loss": 0.4715, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1086870431900024, "rewards/margins": 1.0651975870132446, "rewards/rejected": -2.173884630203247, "step": 1330 }, { "epoch": 0.32149712092130517, "grad_norm": 9.036616057089963, "learning_rate": 4.28946985857725e-07, "logits/chosen": -0.7608897686004639, "logits/rejected": -0.7740424871444702, "logps/chosen": -430.8748474121094, "logps/rejected": -552.3256225585938, "loss": 0.463, "rewards/accuracies": 0.875, "rewards/chosen": -1.5485337972640991, "rewards/margins": 1.306312084197998, "rewards/rejected": -2.8548457622528076, "step": 1340 }, { "epoch": 0.32389635316698656, "grad_norm": 11.634362526668829, "learning_rate": 4.2747856346642445e-07, "logits/chosen": -0.8262165188789368, "logits/rejected": -0.8555091023445129, "logps/chosen": -375.15472412109375, "logps/rejected": -471.82293701171875, "loss": 0.4726, "rewards/accuracies": 0.875, "rewards/chosen": -1.4888992309570312, "rewards/margins": 1.016867995262146, "rewards/rejected": -2.505767345428467, "step": 1350 }, { "epoch": 0.32629558541266795, "grad_norm": 13.398876290373195, "learning_rate": 4.2599769165900933e-07, "logits/chosen": -0.7887164950370789, "logits/rejected": -0.8400223851203918, "logps/chosen": -464.3109436035156, "logps/rejected": -523.8682861328125, "loss": 0.5048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0703892707824707, "rewards/margins": 0.833997905254364, "rewards/rejected": -2.9043872356414795, "step": 1360 }, { "epoch": 0.32869481765834935, "grad_norm": 10.664824987437187, "learning_rate": 4.245044743127535e-07, "logits/chosen": -0.9320866465568542, "logits/rejected": -0.9028736352920532, "logps/chosen": -407.45880126953125, "logps/rejected": -506.1048889160156, "loss": 0.4833, "rewards/accuracies": 0.75, "rewards/chosen": -1.5945441722869873, "rewards/margins": 0.8470728993415833, "rewards/rejected": -2.441617250442505, "step": 1370 }, { "epoch": 0.3310940499040307, "grad_norm": 14.46708847771426, "learning_rate": 4.229990161709214e-07, "logits/chosen": -0.7717675566673279, "logits/rejected": -0.7377297282218933, "logps/chosen": -367.9638977050781, "logps/rejected": -525.7390747070312, "loss": 0.5272, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.408815622329712, "rewards/margins": 1.3415130376815796, "rewards/rejected": -2.750328540802002, "step": 1380 }, { "epoch": 0.3334932821497121, "grad_norm": 8.175303654043583, "learning_rate": 4.214814228354204e-07, "logits/chosen": -0.7827272415161133, "logits/rejected": -0.7984440326690674, "logps/chosen": -451.130126953125, "logps/rejected": -563.998046875, "loss": 0.4767, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7949268817901611, "rewards/margins": 1.3159399032592773, "rewards/rejected": -3.1108667850494385, "step": 1390 }, { "epoch": 0.33589251439539347, "grad_norm": 12.456155197274827, "learning_rate": 4.1995180075939375e-07, "logits/chosen": -0.8594837188720703, "logits/rejected": -0.8565725088119507, "logps/chosen": -457.4839782714844, "logps/rejected": -527.7620239257812, "loss": 0.4705, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6926393508911133, "rewards/margins": 0.9544218182563782, "rewards/rejected": -2.647061347961426, "step": 1400 }, { "epoch": 0.33829174664107486, "grad_norm": 10.492615269241377, "learning_rate": 4.1841025723975297e-07, "logits/chosen": -0.8039811849594116, "logits/rejected": -0.809655487537384, "logps/chosen": -395.67999267578125, "logps/rejected": -496.8030700683594, "loss": 0.4625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0631120204925537, "rewards/margins": 1.2070002555847168, "rewards/rejected": -2.2701122760772705, "step": 1410 }, { "epoch": 0.34069097888675626, "grad_norm": 11.628088236483801, "learning_rate": 4.168569004096516e-07, "logits/chosen": -0.7779537439346313, "logits/rejected": -0.732982337474823, "logps/chosen": -365.4278259277344, "logps/rejected": -518.9385986328125, "loss": 0.4622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3053802251815796, "rewards/margins": 1.3282335996627808, "rewards/rejected": -2.6336138248443604, "step": 1420 }, { "epoch": 0.3430902111324376, "grad_norm": 9.810034255281902, "learning_rate": 4.152918392308997e-07, "logits/chosen": -0.9322064518928528, "logits/rejected": -0.9035415649414062, "logps/chosen": -429.0191345214844, "logps/rejected": -481.86846923828125, "loss": 0.4737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8905305862426758, "rewards/margins": 0.6541526317596436, "rewards/rejected": -2.5446829795837402, "step": 1430 }, { "epoch": 0.345489443378119, "grad_norm": 16.302876025790795, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.8372869491577148, "logits/rejected": -0.7931715250015259, "logps/chosen": -425.23651123046875, "logps/rejected": -571.8636474609375, "loss": 0.5414, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9143438339233398, "rewards/margins": 1.09660804271698, "rewards/rejected": -3.0109522342681885, "step": 1440 }, { "epoch": 0.3478886756238004, "grad_norm": 14.266445268878716, "learning_rate": 4.121270437720526e-07, "logits/chosen": -0.7531959414482117, "logits/rejected": -0.7015701532363892, "logps/chosen": -415.9129943847656, "logps/rejected": -544.4853515625, "loss": 0.4963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1056931018829346, "rewards/margins": 0.7997118234634399, "rewards/rejected": -2.905405044555664, "step": 1450 }, { "epoch": 0.3502879078694818, "grad_norm": 8.666123707311465, "learning_rate": 4.105275314897852e-07, "logits/chosen": -0.8094059228897095, "logits/rejected": -0.8042441606521606, "logps/chosen": -387.4685363769531, "logps/rejected": -575.4444580078125, "loss": 0.497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7935253381729126, "rewards/margins": 1.4926183223724365, "rewards/rejected": -3.2861435413360596, "step": 1460 }, { "epoch": 0.35268714011516317, "grad_norm": 10.639348735955451, "learning_rate": 4.089167588389508e-07, "logits/chosen": -0.7043929100036621, "logits/rejected": -0.7534819841384888, "logps/chosen": -515.1029663085938, "logps/rejected": -574.6419067382812, "loss": 0.4865, "rewards/accuracies": 0.75, "rewards/chosen": -1.7564353942871094, "rewards/margins": 1.080705165863037, "rewards/rejected": -2.8371405601501465, "step": 1470 }, { "epoch": 0.3550863723608445, "grad_norm": 13.106435290020489, "learning_rate": 4.072948388088515e-07, "logits/chosen": -0.6628540754318237, "logits/rejected": -0.6470414400100708, "logps/chosen": -432.12005615234375, "logps/rejected": -549.7205200195312, "loss": 0.4851, "rewards/accuracies": 0.75, "rewards/chosen": -1.650691270828247, "rewards/margins": 1.125057578086853, "rewards/rejected": -2.7757484912872314, "step": 1480 }, { "epoch": 0.3574856046065259, "grad_norm": 13.06259828621992, "learning_rate": 4.056618851707334e-07, "logits/chosen": -0.6907030344009399, "logits/rejected": -0.7093620300292969, "logps/chosen": -397.7687683105469, "logps/rejected": -526.349853515625, "loss": 0.4663, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3325411081314087, "rewards/margins": 1.221077561378479, "rewards/rejected": -2.553618907928467, "step": 1490 }, { "epoch": 0.3598848368522073, "grad_norm": 12.917502818953556, "learning_rate": 4.0401801246980675e-07, "logits/chosen": -0.8259037137031555, "logits/rejected": -0.8357691764831543, "logps/chosen": -384.6356506347656, "logps/rejected": -453.1322326660156, "loss": 0.4882, "rewards/accuracies": 0.75, "rewards/chosen": -1.7201900482177734, "rewards/margins": 0.8327716588973999, "rewards/rejected": -2.552961587905884, "step": 1500 }, { "epoch": 0.3622840690978887, "grad_norm": 12.433094406122816, "learning_rate": 4.0236333601721043e-07, "logits/chosen": -0.7449339628219604, "logits/rejected": -0.7388188242912292, "logps/chosen": -460.6793518066406, "logps/rejected": -544.5335693359375, "loss": 0.5044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6877784729003906, "rewards/margins": 0.7297952175140381, "rewards/rejected": -2.417573928833008, "step": 1510 }, { "epoch": 0.3646833013435701, "grad_norm": 13.800192452791908, "learning_rate": 4.0069797188192364e-07, "logits/chosen": -0.7582114934921265, "logits/rejected": -0.742210328578949, "logps/chosen": -439.11114501953125, "logps/rejected": -521.8330688476562, "loss": 0.4944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5508705377578735, "rewards/margins": 1.0782378911972046, "rewards/rejected": -2.6291086673736572, "step": 1520 }, { "epoch": 0.3670825335892514, "grad_norm": 13.571364318064191, "learning_rate": 3.9902203688262417e-07, "logits/chosen": -0.721932590007782, "logits/rejected": -0.7364694476127625, "logps/chosen": -405.0948791503906, "logps/rejected": -501.7353515625, "loss": 0.4661, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3923845291137695, "rewards/margins": 1.094702959060669, "rewards/rejected": -2.4870872497558594, "step": 1530 }, { "epoch": 0.3694817658349328, "grad_norm": 11.011970777974243, "learning_rate": 3.9733564857949365e-07, "logits/chosen": -0.6986292600631714, "logits/rejected": -0.6912825703620911, "logps/chosen": -500.9569396972656, "logps/rejected": -553.8840942382812, "loss": 0.4869, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.857325553894043, "rewards/margins": 0.9566340446472168, "rewards/rejected": -2.8139595985412598, "step": 1540 }, { "epoch": 0.3718809980806142, "grad_norm": 10.905693632178256, "learning_rate": 3.9563892526597177e-07, "logits/chosen": -0.7152280807495117, "logits/rejected": -0.6881515383720398, "logps/chosen": -388.0267333984375, "logps/rejected": -521.2376708984375, "loss": 0.4572, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6515610218048096, "rewards/margins": 0.8464974164962769, "rewards/rejected": -2.498058319091797, "step": 1550 }, { "epoch": 0.3742802303262956, "grad_norm": 10.08743827300553, "learning_rate": 3.9393198596045795e-07, "logits/chosen": -0.7806371450424194, "logits/rejected": -0.76411372423172, "logps/chosen": -399.4765930175781, "logps/rejected": -516.7181396484375, "loss": 0.5182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7115662097930908, "rewards/margins": 0.941506028175354, "rewards/rejected": -2.6530721187591553, "step": 1560 }, { "epoch": 0.376679462571977, "grad_norm": 8.868869870998214, "learning_rate": 3.922149503979628e-07, "logits/chosen": -0.6893107295036316, "logits/rejected": -0.7296844124794006, "logps/chosen": -438.3675231933594, "logps/rejected": -609.4227905273438, "loss": 0.4809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.628042221069336, "rewards/margins": 1.6386429071426392, "rewards/rejected": -3.2666850090026855, "step": 1570 }, { "epoch": 0.3790786948176583, "grad_norm": 13.078715942181777, "learning_rate": 3.904879390217095e-07, "logits/chosen": -0.799870491027832, "logits/rejected": -0.8303624987602234, "logps/chosen": -410.641845703125, "logps/rejected": -479.2076721191406, "loss": 0.458, "rewards/accuracies": 0.75, "rewards/chosen": -1.6100763082504272, "rewards/margins": 0.8601690530776978, "rewards/rejected": -2.470245361328125, "step": 1580 }, { "epoch": 0.3814779270633397, "grad_norm": 12.71197235549074, "learning_rate": 3.8875107297468463e-07, "logits/chosen": -0.7600913643836975, "logits/rejected": -0.7607609033584595, "logps/chosen": -388.62042236328125, "logps/rejected": -583.7432250976562, "loss": 0.4781, "rewards/accuracies": 0.875, "rewards/chosen": -1.4480386972427368, "rewards/margins": 1.5103285312652588, "rewards/rejected": -2.958367109298706, "step": 1590 }, { "epoch": 0.3838771593090211, "grad_norm": 13.466303880551026, "learning_rate": 3.87004474091141e-07, "logits/chosen": -0.6408634781837463, "logits/rejected": -0.6452223062515259, "logps/chosen": -373.9280700683594, "logps/rejected": -497.0530700683594, "loss": 0.4874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4634227752685547, "rewards/margins": 1.0529407262802124, "rewards/rejected": -2.5163636207580566, "step": 1600 }, { "epoch": 0.3862763915547025, "grad_norm": 12.405090587714351, "learning_rate": 3.8524826488805114e-07, "logits/chosen": -0.812592625617981, "logits/rejected": -0.774621844291687, "logps/chosen": -457.66961669921875, "logps/rejected": -521.51904296875, "loss": 0.5016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.722423791885376, "rewards/margins": 1.091522455215454, "rewards/rejected": -2.81394624710083, "step": 1610 }, { "epoch": 0.3886756238003839, "grad_norm": 10.910380188178692, "learning_rate": 3.834825685565133e-07, "logits/chosen": -0.8149593472480774, "logits/rejected": -0.8616162538528442, "logps/chosen": -388.2657165527344, "logps/rejected": -452.9178161621094, "loss": 0.4461, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5048949718475342, "rewards/margins": 1.0155197381973267, "rewards/rejected": -2.5204145908355713, "step": 1620 }, { "epoch": 0.39107485604606523, "grad_norm": 12.461150034624684, "learning_rate": 3.8170750895311007e-07, "logits/chosen": -0.7702925205230713, "logits/rejected": -0.7936859726905823, "logps/chosen": -419.08270263671875, "logps/rejected": -508.16571044921875, "loss": 0.4739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3406755924224854, "rewards/margins": 1.148717999458313, "rewards/rejected": -2.489393711090088, "step": 1630 }, { "epoch": 0.3934740882917466, "grad_norm": 10.482680213963254, "learning_rate": 3.7992321059122045e-07, "logits/chosen": -0.7163397073745728, "logits/rejected": -0.7596901059150696, "logps/chosen": -418.07147216796875, "logps/rejected": -498.05078125, "loss": 0.4946, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8541247844696045, "rewards/margins": 0.9185699224472046, "rewards/rejected": -2.7726948261260986, "step": 1640 }, { "epoch": 0.395873320537428, "grad_norm": 12.021180525729404, "learning_rate": 3.7812979863228576e-07, "logits/chosen": -0.8531166911125183, "logits/rejected": -0.8554477691650391, "logps/chosen": -381.1147155761719, "logps/rejected": -508.91668701171875, "loss": 0.4537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7566955089569092, "rewards/margins": 1.1585729122161865, "rewards/rejected": -2.9152684211730957, "step": 1650 }, { "epoch": 0.3982725527831094, "grad_norm": 12.979308903163362, "learning_rate": 3.763273988770296e-07, "logits/chosen": -0.7346752285957336, "logits/rejected": -0.7792466878890991, "logps/chosen": -394.4084167480469, "logps/rejected": -528.5557861328125, "loss": 0.455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4199392795562744, "rewards/margins": 1.3389031887054443, "rewards/rejected": -2.7588424682617188, "step": 1660 }, { "epoch": 0.4006717850287908, "grad_norm": 12.280364894112322, "learning_rate": 3.7451613775663405e-07, "logits/chosen": -0.7985413670539856, "logits/rejected": -0.7544962763786316, "logps/chosen": -410.6094665527344, "logps/rejected": -567.0599365234375, "loss": 0.4986, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7071869373321533, "rewards/margins": 1.444684624671936, "rewards/rejected": -3.1518714427948, "step": 1670 }, { "epoch": 0.40307101727447214, "grad_norm": 11.808414350786867, "learning_rate": 3.726961423238706e-07, "logits/chosen": -0.8868053555488586, "logits/rejected": -0.9050809741020203, "logps/chosen": -378.8218078613281, "logps/rejected": -536.3219604492188, "loss": 0.4686, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5447314977645874, "rewards/margins": 1.3333966732025146, "rewards/rejected": -2.8781278133392334, "step": 1680 }, { "epoch": 0.40547024952015354, "grad_norm": 12.787716674975018, "learning_rate": 3.708675402441882e-07, "logits/chosen": -0.7429116368293762, "logits/rejected": -0.7805765867233276, "logps/chosen": -458.7632751464844, "logps/rejected": -516.9869384765625, "loss": 0.4928, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7429218292236328, "rewards/margins": 0.9082300066947937, "rewards/rejected": -2.6511518955230713, "step": 1690 }, { "epoch": 0.40786948176583493, "grad_norm": 10.598422088340676, "learning_rate": 3.6903045978675775e-07, "logits/chosen": -0.7389672994613647, "logits/rejected": -0.7964872717857361, "logps/chosen": -411.5867614746094, "logps/rejected": -550.23876953125, "loss": 0.4801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7794177532196045, "rewards/margins": 1.554722785949707, "rewards/rejected": -3.3341403007507324, "step": 1700 }, { "epoch": 0.4102687140115163, "grad_norm": 10.761090409994697, "learning_rate": 3.6718502981547474e-07, "logits/chosen": -0.7715443968772888, "logits/rejected": -0.7918425798416138, "logps/chosen": -430.85003662109375, "logps/rejected": -561.6787109375, "loss": 0.4899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.676509141921997, "rewards/margins": 0.8913475871086121, "rewards/rejected": -2.567856788635254, "step": 1710 }, { "epoch": 0.4126679462571977, "grad_norm": 9.729723638899472, "learning_rate": 3.6533137977991986e-07, "logits/chosen": -0.755111575126648, "logits/rejected": -0.7594307065010071, "logps/chosen": -430.3711853027344, "logps/rejected": -534.5823974609375, "loss": 0.5115, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5598688125610352, "rewards/margins": 0.8007798194885254, "rewards/rejected": -2.3606486320495605, "step": 1720 }, { "epoch": 0.41506717850287905, "grad_norm": 9.372402487840775, "learning_rate": 3.6346963970627865e-07, "logits/chosen": -0.6953638195991516, "logits/rejected": -0.6692907214164734, "logps/chosen": -390.3316955566406, "logps/rejected": -514.3018798828125, "loss": 0.4468, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3828855752944946, "rewards/margins": 1.066463828086853, "rewards/rejected": -2.4493489265441895, "step": 1730 }, { "epoch": 0.41746641074856045, "grad_norm": 11.954691591503995, "learning_rate": 3.615999401882207e-07, "logits/chosen": -0.8805049657821655, "logits/rejected": -0.8518358469009399, "logps/chosen": -412.022216796875, "logps/rejected": -552.1925659179688, "loss": 0.4814, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9677274227142334, "rewards/margins": 1.194000005722046, "rewards/rejected": -3.1617274284362793, "step": 1740 }, { "epoch": 0.41986564299424184, "grad_norm": 11.021704456258432, "learning_rate": 3.597224123777389e-07, "logits/chosen": -0.7318686246871948, "logits/rejected": -0.7280600666999817, "logps/chosen": -430.3202209472656, "logps/rejected": -572.8101806640625, "loss": 0.4749, "rewards/accuracies": 0.75, "rewards/chosen": -1.8212677240371704, "rewards/margins": 1.2428979873657227, "rewards/rejected": -3.0641655921936035, "step": 1750 }, { "epoch": 0.42226487523992323, "grad_norm": 10.015819519130572, "learning_rate": 3.5783718797595e-07, "logits/chosen": -0.8521868586540222, "logits/rejected": -0.8556682467460632, "logps/chosen": -473.8487243652344, "logps/rejected": -544.9535522460938, "loss": 0.4797, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8864997625350952, "rewards/margins": 1.0899362564086914, "rewards/rejected": -2.976435661315918, "step": 1760 }, { "epoch": 0.4246641074856046, "grad_norm": 11.86534189148174, "learning_rate": 3.559443992238558e-07, "logits/chosen": -0.7805435061454773, "logits/rejected": -0.8231045007705688, "logps/chosen": -400.96697998046875, "logps/rejected": -577.6618041992188, "loss": 0.4997, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5335006713867188, "rewards/margins": 1.4942717552185059, "rewards/rejected": -3.0277724266052246, "step": 1770 }, { "epoch": 0.42706333973128596, "grad_norm": 10.141163144169631, "learning_rate": 3.540441788930673e-07, "logits/chosen": -0.7176542282104492, "logits/rejected": -0.7344351410865784, "logps/chosen": -456.525146484375, "logps/rejected": -563.7665405273438, "loss": 0.4548, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6193135976791382, "rewards/margins": 1.4220505952835083, "rewards/rejected": -3.0413641929626465, "step": 1780 }, { "epoch": 0.42946257197696736, "grad_norm": 12.961054235399837, "learning_rate": 3.5213666027649123e-07, "logits/chosen": -0.7940319180488586, "logits/rejected": -0.8246362805366516, "logps/chosen": -489.11004638671875, "logps/rejected": -525.5428466796875, "loss": 0.4772, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1909642219543457, "rewards/margins": 0.7601931691169739, "rewards/rejected": -2.951157808303833, "step": 1790 }, { "epoch": 0.43186180422264875, "grad_norm": 11.84758063846055, "learning_rate": 3.5022197717898017e-07, "logits/chosen": -0.8181630969047546, "logits/rejected": -0.8500107526779175, "logps/chosen": -393.4739685058594, "logps/rejected": -486.96917724609375, "loss": 0.4156, "rewards/accuracies": 0.75, "rewards/chosen": -1.7599443197250366, "rewards/margins": 1.216341257095337, "rewards/rejected": -2.976285457611084, "step": 1800 }, { "epoch": 0.43426103646833014, "grad_norm": 10.86372215583825, "learning_rate": 3.4830026390794633e-07, "logits/chosen": -0.8050792813301086, "logits/rejected": -0.8329674005508423, "logps/chosen": -507.2084045410156, "logps/rejected": -583.702880859375, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": -2.1328043937683105, "rewards/margins": 1.2732837200164795, "rewards/rejected": -3.406088352203369, "step": 1810 }, { "epoch": 0.43666026871401153, "grad_norm": 7.738248672410445, "learning_rate": 3.4637165526394104e-07, "logits/chosen": -0.7997580766677856, "logits/rejected": -0.8064180612564087, "logps/chosen": -423.777587890625, "logps/rejected": -542.863525390625, "loss": 0.4725, "rewards/accuracies": 0.75, "rewards/chosen": -1.9629684686660767, "rewards/margins": 1.0524226427078247, "rewards/rejected": -3.0153908729553223, "step": 1820 }, { "epoch": 0.43905950095969287, "grad_norm": 11.736843534921078, "learning_rate": 3.4443628653119814e-07, "logits/chosen": -0.7079404592514038, "logits/rejected": -0.7036377191543579, "logps/chosen": -483.62896728515625, "logps/rejected": -678.022216796875, "loss": 0.4814, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1326420307159424, "rewards/margins": 1.5004966259002686, "rewards/rejected": -3.633139133453369, "step": 1830 }, { "epoch": 0.44145873320537427, "grad_norm": 10.785415108021422, "learning_rate": 3.424942934681453e-07, "logits/chosen": -0.8217443227767944, "logits/rejected": -0.8721915483474731, "logps/chosen": -385.87860107421875, "logps/rejected": -556.9625854492188, "loss": 0.4584, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4244273900985718, "rewards/margins": 1.7936569452285767, "rewards/rejected": -3.2180843353271484, "step": 1840 }, { "epoch": 0.44385796545105566, "grad_norm": 14.588516416025033, "learning_rate": 3.405458122978804e-07, "logits/chosen": -0.8775800466537476, "logits/rejected": -0.8566424250602722, "logps/chosen": -458.1707458496094, "logps/rejected": -545.3057861328125, "loss": 0.4726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7237602472305298, "rewards/margins": 1.154767632484436, "rewards/rejected": -2.878527879714966, "step": 1850 }, { "epoch": 0.44625719769673705, "grad_norm": 13.706269723531113, "learning_rate": 3.3859097969861633e-07, "logits/chosen": -0.7905477285385132, "logits/rejected": -0.7530331015586853, "logps/chosen": -460.21728515625, "logps/rejected": -553.216796875, "loss": 0.4504, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8379026651382446, "rewards/margins": 1.2619093656539917, "rewards/rejected": -3.0998120307922363, "step": 1860 }, { "epoch": 0.44865642994241844, "grad_norm": 13.3769074850146, "learning_rate": 3.366299327940936e-07, "logits/chosen": -0.8109074831008911, "logits/rejected": -0.773389458656311, "logps/chosen": -470.64947509765625, "logps/rejected": -587.9104614257812, "loss": 0.4834, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7903703451156616, "rewards/margins": 0.9450393915176392, "rewards/rejected": -2.7354094982147217, "step": 1870 }, { "epoch": 0.4510556621880998, "grad_norm": 11.753551278940575, "learning_rate": 3.3466280914396117e-07, "logits/chosen": -0.773266077041626, "logits/rejected": -0.7779923677444458, "logps/chosen": -402.0985412597656, "logps/rejected": -570.60693359375, "loss": 0.4764, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6223961114883423, "rewards/margins": 1.4883135557174683, "rewards/rejected": -3.1107096672058105, "step": 1880 }, { "epoch": 0.4534548944337812, "grad_norm": 14.233604576877488, "learning_rate": 3.326897467341281e-07, "logits/chosen": -0.8112883567810059, "logits/rejected": -0.8336831331253052, "logps/chosen": -351.8514404296875, "logps/rejected": -480.0999450683594, "loss": 0.474, "rewards/accuracies": 0.75, "rewards/chosen": -1.529261827468872, "rewards/margins": 1.1229501962661743, "rewards/rejected": -2.652211904525757, "step": 1890 }, { "epoch": 0.45585412667946257, "grad_norm": 13.300598272317474, "learning_rate": 3.3071088396708335e-07, "logits/chosen": -0.8645914196968079, "logits/rejected": -0.8331616520881653, "logps/chosen": -363.18206787109375, "logps/rejected": -533.6971435546875, "loss": 0.4819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6752821207046509, "rewards/margins": 1.4778271913528442, "rewards/rejected": -3.153109550476074, "step": 1900 }, { "epoch": 0.45825335892514396, "grad_norm": 12.640086996532164, "learning_rate": 3.2872635965218824e-07, "logits/chosen": -0.6304786205291748, "logits/rejected": -0.6618056297302246, "logps/chosen": -448.3641052246094, "logps/rejected": -589.9913330078125, "loss": 0.5109, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9804298877716064, "rewards/margins": 1.1321611404418945, "rewards/rejected": -3.112590789794922, "step": 1910 }, { "epoch": 0.46065259117082535, "grad_norm": 9.986496934257348, "learning_rate": 3.2673631299593905e-07, "logits/chosen": -0.7311594486236572, "logits/rejected": -0.8009797930717468, "logps/chosen": -461.6552734375, "logps/rejected": -577.480712890625, "loss": 0.4668, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8996919393539429, "rewards/margins": 1.237914800643921, "rewards/rejected": -3.1376068592071533, "step": 1920 }, { "epoch": 0.4630518234165067, "grad_norm": 13.218071975981442, "learning_rate": 3.247408835922024e-07, "logits/chosen": -0.7413262724876404, "logits/rejected": -0.7506011724472046, "logps/chosen": -488.389404296875, "logps/rejected": -615.5206298828125, "loss": 0.4835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.845880150794983, "rewards/margins": 1.2004640102386475, "rewards/rejected": -3.046344041824341, "step": 1930 }, { "epoch": 0.4654510556621881, "grad_norm": 12.635690431262567, "learning_rate": 3.2274021141242306e-07, "logits/chosen": -0.6942049860954285, "logits/rejected": -0.7160819172859192, "logps/chosen": -423.4578552246094, "logps/rejected": -537.9940795898438, "loss": 0.4495, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6584323644638062, "rewards/margins": 1.0953803062438965, "rewards/rejected": -2.753812551498413, "step": 1940 }, { "epoch": 0.4678502879078695, "grad_norm": 15.405837310763367, "learning_rate": 3.2073443679580613e-07, "logits/chosen": -0.7654497027397156, "logits/rejected": -0.7917548418045044, "logps/chosen": -448.06378173828125, "logps/rejected": -538.6315307617188, "loss": 0.4705, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.71183180809021, "rewards/margins": 0.9023053050041199, "rewards/rejected": -2.614137649536133, "step": 1950 }, { "epoch": 0.47024952015355087, "grad_norm": 9.857507863821315, "learning_rate": 3.1872370043947194e-07, "logits/chosen": -0.8800600171089172, "logits/rejected": -0.888513445854187, "logps/chosen": -407.2889709472656, "logps/rejected": -570.4630126953125, "loss": 0.4494, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3753950595855713, "rewards/margins": 1.693584680557251, "rewards/rejected": -3.0689799785614014, "step": 1960 }, { "epoch": 0.47264875239923226, "grad_norm": 11.49826557648692, "learning_rate": 3.167081433885874e-07, "logits/chosen": -0.6018909215927124, "logits/rejected": -0.6167675852775574, "logps/chosen": -508.90655517578125, "logps/rejected": -644.7164916992188, "loss": 0.4066, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8501713275909424, "rewards/margins": 0.9607030749320984, "rewards/rejected": -2.8108744621276855, "step": 1970 }, { "epoch": 0.4750479846449136, "grad_norm": 13.29052258173837, "learning_rate": 3.14687907026472e-07, "logits/chosen": -0.6973208785057068, "logits/rejected": -0.7304965257644653, "logps/chosen": -395.8529357910156, "logps/rejected": -551.4783935546875, "loss": 0.4563, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6641004085540771, "rewards/margins": 1.4298467636108398, "rewards/rejected": -3.093946933746338, "step": 1980 }, { "epoch": 0.477447216890595, "grad_norm": 12.240475995337231, "learning_rate": 3.126631330646801e-07, "logits/chosen": -0.699920117855072, "logits/rejected": -0.7062256932258606, "logps/chosen": -519.5510864257812, "logps/rejected": -607.6595458984375, "loss": 0.4849, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1099679470062256, "rewards/margins": 0.9010626077651978, "rewards/rejected": -3.011030673980713, "step": 1990 }, { "epoch": 0.4798464491362764, "grad_norm": 13.241550075194965, "learning_rate": 3.1063396353306097e-07, "logits/chosen": -0.726953387260437, "logits/rejected": -0.788418173789978, "logps/chosen": -436.1697692871094, "logps/rejected": -513.4632568359375, "loss": 0.4416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6597273349761963, "rewards/margins": 1.236422061920166, "rewards/rejected": -2.896149158477783, "step": 2000 }, { "epoch": 0.4798464491362764, "eval_logits/chosen": -0.7618333697319031, "eval_logits/rejected": -0.7765002846717834, "eval_logps/chosen": -424.0602111816406, "eval_logps/rejected": -575.2555541992188, "eval_loss": 0.45312055945396423, "eval_rewards/accuracies": 0.8160714507102966, "eval_rewards/chosen": -1.7141555547714233, "eval_rewards/margins": 1.430633306503296, "eval_rewards/rejected": -3.144789457321167, "eval_runtime": 233.581, "eval_samples_per_second": 19.098, "eval_steps_per_second": 0.3, "step": 2000 }, { "epoch": 0.4822456813819578, "grad_norm": 11.1357246980386, "learning_rate": 3.0860054076979535e-07, "logits/chosen": -0.7470555901527405, "logits/rejected": -0.74172443151474, "logps/chosen": -460.82867431640625, "logps/rejected": -552.6365356445312, "loss": 0.4743, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9067308902740479, "rewards/margins": 1.165321946144104, "rewards/rejected": -3.0720529556274414, "step": 2010 }, { "epoch": 0.4846449136276392, "grad_norm": 17.785449989220815, "learning_rate": 3.065630074114115e-07, "logits/chosen": -0.7443466782569885, "logits/rejected": -0.762556254863739, "logps/chosen": -465.6786193847656, "logps/rejected": -580.1879272460938, "loss": 0.4669, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7899072170257568, "rewards/margins": 1.538434624671936, "rewards/rejected": -3.3283417224884033, "step": 2020 }, { "epoch": 0.4870441458733205, "grad_norm": 16.171594176924803, "learning_rate": 3.0452150638277947e-07, "logits/chosen": -0.6747657060623169, "logits/rejected": -0.6558694839477539, "logps/chosen": -392.1339416503906, "logps/rejected": -500.20025634765625, "loss": 0.5029, "rewards/accuracies": 0.75, "rewards/chosen": -1.720434546470642, "rewards/margins": 0.9752155542373657, "rewards/rejected": -2.695650339126587, "step": 2030 }, { "epoch": 0.4894433781190019, "grad_norm": 8.227849718874795, "learning_rate": 3.024761808870856e-07, "logits/chosen": -0.7772229313850403, "logits/rejected": -0.7858240008354187, "logps/chosen": -381.36236572265625, "logps/rejected": -597.6624145507812, "loss": 0.4504, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.510353684425354, "rewards/margins": 2.1284565925598145, "rewards/rejected": -3.6388099193573, "step": 2040 }, { "epoch": 0.4918426103646833, "grad_norm": 20.051647299898132, "learning_rate": 3.004271743957875e-07, "logits/chosen": -0.6749522089958191, "logits/rejected": -0.6905564069747925, "logps/chosen": -484.854736328125, "logps/rejected": -604.3311767578125, "loss": 0.4945, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.175485134124756, "rewards/margins": 0.9892854690551758, "rewards/rejected": -3.1647706031799316, "step": 2050 }, { "epoch": 0.4942418426103647, "grad_norm": 12.004150798678578, "learning_rate": 2.983746306385499e-07, "logits/chosen": -0.8129485845565796, "logits/rejected": -0.7655819654464722, "logps/chosen": -414.5586853027344, "logps/rejected": -578.7470703125, "loss": 0.4453, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7315165996551514, "rewards/margins": 1.4258909225463867, "rewards/rejected": -3.1574079990386963, "step": 2060 }, { "epoch": 0.4966410748560461, "grad_norm": 11.308824836556166, "learning_rate": 2.963186935931628e-07, "logits/chosen": -0.7485088109970093, "logits/rejected": -0.7199539542198181, "logps/chosen": -450.9239196777344, "logps/rejected": -583.173583984375, "loss": 0.4482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.732980728149414, "rewards/margins": 1.3427022695541382, "rewards/rejected": -3.075683116912842, "step": 2070 }, { "epoch": 0.4990403071017274, "grad_norm": 9.373923594426847, "learning_rate": 2.9425950747544176e-07, "logits/chosen": -0.6613216400146484, "logits/rejected": -0.7013477683067322, "logps/chosen": -505.81573486328125, "logps/rejected": -642.0211181640625, "loss": 0.4343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.986654281616211, "rewards/margins": 1.6596873998641968, "rewards/rejected": -3.646341323852539, "step": 2080 }, { "epoch": 0.5014395393474088, "grad_norm": 15.749741167722831, "learning_rate": 2.921972167291119e-07, "logits/chosen": -0.7112385630607605, "logits/rejected": -0.7526477575302124, "logps/chosen": -436.2796325683594, "logps/rejected": -586.3831787109375, "loss": 0.4399, "rewards/accuracies": 0.75, "rewards/chosen": -1.5150210857391357, "rewards/margins": 1.2452566623687744, "rewards/rejected": -2.760277271270752, "step": 2090 }, { "epoch": 0.5038387715930902, "grad_norm": 13.419312414930658, "learning_rate": 2.9013196601567567e-07, "logits/chosen": -0.7128900289535522, "logits/rejected": -0.730756402015686, "logps/chosen": -419.7554626464844, "logps/rejected": -551.4823608398438, "loss": 0.5299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.718276023864746, "rewards/margins": 1.1899365186691284, "rewards/rejected": -2.908212661743164, "step": 2100 }, { "epoch": 0.5062380038387716, "grad_norm": 9.651741347788393, "learning_rate": 2.8806390020426555e-07, "logits/chosen": -0.7773910760879517, "logits/rejected": -0.7731062173843384, "logps/chosen": -431.44317626953125, "logps/rejected": -574.38720703125, "loss": 0.4257, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.687414526939392, "rewards/margins": 1.4306485652923584, "rewards/rejected": -3.11806321144104, "step": 2110 }, { "epoch": 0.508637236084453, "grad_norm": 13.48177862842598, "learning_rate": 2.8599316436148187e-07, "logits/chosen": -0.6940504312515259, "logits/rejected": -0.6836844682693481, "logps/chosen": -445.3380432128906, "logps/rejected": -537.2091064453125, "loss": 0.4344, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9495445489883423, "rewards/margins": 0.9152109026908875, "rewards/rejected": -2.864755153656006, "step": 2120 }, { "epoch": 0.5110364683301344, "grad_norm": 13.289305185609427, "learning_rate": 2.8391990374121723e-07, "logits/chosen": -0.726272463798523, "logits/rejected": -0.7124502062797546, "logps/chosen": -440.59832763671875, "logps/rejected": -601.8394775390625, "loss": 0.4771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0392353534698486, "rewards/margins": 1.3474972248077393, "rewards/rejected": -3.386732816696167, "step": 2130 }, { "epoch": 0.5134357005758158, "grad_norm": 12.7036271351969, "learning_rate": 2.818442637744669e-07, "logits/chosen": -0.7679746747016907, "logits/rejected": -0.7643837332725525, "logps/chosen": -455.5375061035156, "logps/rejected": -597.4730224609375, "loss": 0.4634, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0901169776916504, "rewards/margins": 1.3490521907806396, "rewards/rejected": -3.439169406890869, "step": 2140 }, { "epoch": 0.5158349328214972, "grad_norm": 10.384572302471735, "learning_rate": 2.797663900591284e-07, "logits/chosen": -0.7398999333381653, "logits/rejected": -0.7659087181091309, "logps/chosen": -447.51593017578125, "logps/rejected": -535.8583984375, "loss": 0.4256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8267303705215454, "rewards/margins": 1.21419358253479, "rewards/rejected": -3.040923833847046, "step": 2150 }, { "epoch": 0.5182341650671785, "grad_norm": 12.512282425185987, "learning_rate": 2.776864283497874e-07, "logits/chosen": -0.7339873909950256, "logits/rejected": -0.7800690531730652, "logps/chosen": -417.85107421875, "logps/rejected": -591.3509521484375, "loss": 0.4684, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8667776584625244, "rewards/margins": 1.7389957904815674, "rewards/rejected": -3.6057732105255127, "step": 2160 }, { "epoch": 0.5206333973128598, "grad_norm": 9.865835212510957, "learning_rate": 2.756045245474943e-07, "logits/chosen": -0.6862331628799438, "logits/rejected": -0.6869142651557922, "logps/chosen": -441.952880859375, "logps/rejected": -546.9207153320312, "loss": 0.4634, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6532140970230103, "rewards/margins": 0.8384655714035034, "rewards/rejected": -2.4916796684265137, "step": 2170 }, { "epoch": 0.5230326295585412, "grad_norm": 16.184183395491957, "learning_rate": 2.7352082468952977e-07, "logits/chosen": -0.7555044889450073, "logits/rejected": -0.7893722057342529, "logps/chosen": -431.03448486328125, "logps/rejected": -618.13427734375, "loss": 0.4812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9443515539169312, "rewards/margins": 1.6091206073760986, "rewards/rejected": -3.5534720420837402, "step": 2180 }, { "epoch": 0.5254318618042226, "grad_norm": 14.810315270160304, "learning_rate": 2.7143547493916e-07, "logits/chosen": -0.7784820795059204, "logits/rejected": -0.7843117117881775, "logps/chosen": -398.50506591796875, "logps/rejected": -604.5455322265625, "loss": 0.4501, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4673595428466797, "rewards/margins": 2.0520424842834473, "rewards/rejected": -3.519402027130127, "step": 2190 }, { "epoch": 0.527831094049904, "grad_norm": 14.870480901797459, "learning_rate": 2.693486215753853e-07, "logits/chosen": -0.7710849046707153, "logits/rejected": -0.7755874395370483, "logps/chosen": -432.92803955078125, "logps/rejected": -596.2999877929688, "loss": 0.467, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9311811923980713, "rewards/margins": 1.8095871210098267, "rewards/rejected": -3.7407684326171875, "step": 2200 }, { "epoch": 0.5302303262955854, "grad_norm": 15.695210321976734, "learning_rate": 2.6726041098267805e-07, "logits/chosen": -0.8125902414321899, "logits/rejected": -0.8185180425643921, "logps/chosen": -507.8814392089844, "logps/rejected": -558.718994140625, "loss": 0.4788, "rewards/accuracies": 0.75, "rewards/chosen": -2.2443294525146484, "rewards/margins": 0.8388055562973022, "rewards/rejected": -3.083134889602661, "step": 2210 }, { "epoch": 0.5326295585412668, "grad_norm": 16.11416540421591, "learning_rate": 2.6517098964071507e-07, "logits/chosen": -0.7117936015129089, "logits/rejected": -0.7254031896591187, "logps/chosen": -454.80242919921875, "logps/rejected": -549.0224609375, "loss": 0.511, "rewards/accuracies": 0.75, "rewards/chosen": -1.9632365703582764, "rewards/margins": 0.8028262257575989, "rewards/rejected": -2.7660632133483887, "step": 2220 }, { "epoch": 0.5350287907869482, "grad_norm": 13.538871078171768, "learning_rate": 2.630805041141023e-07, "logits/chosen": -0.7859424352645874, "logits/rejected": -0.7693944573402405, "logps/chosen": -402.9466857910156, "logps/rejected": -582.7056884765625, "loss": 0.4555, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7725486755371094, "rewards/margins": 1.6191837787628174, "rewards/rejected": -3.3917324542999268, "step": 2230 }, { "epoch": 0.5374280230326296, "grad_norm": 14.707908117003884, "learning_rate": 2.609891010420941e-07, "logits/chosen": -0.8051595687866211, "logits/rejected": -0.7927287817001343, "logps/chosen": -450.76519775390625, "logps/rejected": -590.6575317382812, "loss": 0.4335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.906309723854065, "rewards/margins": 1.394263744354248, "rewards/rejected": -3.3005733489990234, "step": 2240 }, { "epoch": 0.539827255278311, "grad_norm": 13.589410553380741, "learning_rate": 2.5889692712830674e-07, "logits/chosen": -0.7646986246109009, "logits/rejected": -0.7824346423149109, "logps/chosen": -391.0314025878906, "logps/rejected": -513.9915771484375, "loss": 0.4367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.665148377418518, "rewards/margins": 1.241813063621521, "rewards/rejected": -2.90696120262146, "step": 2250 }, { "epoch": 0.5422264875239923, "grad_norm": 13.492840980385337, "learning_rate": 2.5680412913042843e-07, "logits/chosen": -0.7536166906356812, "logits/rejected": -0.7371748685836792, "logps/chosen": -442.3362731933594, "logps/rejected": -595.1715087890625, "loss": 0.4245, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0182454586029053, "rewards/margins": 1.4872602224349976, "rewards/rejected": -3.5055058002471924, "step": 2260 }, { "epoch": 0.5446257197696737, "grad_norm": 14.894715683363456, "learning_rate": 2.5471085384992404e-07, "logits/chosen": -0.8191806674003601, "logits/rejected": -0.806193470954895, "logps/chosen": -411.95147705078125, "logps/rejected": -647.9240112304688, "loss": 0.421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7768102884292603, "rewards/margins": 2.2000908851623535, "rewards/rejected": -3.9769012928009033, "step": 2270 }, { "epoch": 0.5470249520153551, "grad_norm": 11.287950370775208, "learning_rate": 2.526172481217381e-07, "logits/chosen": -0.7717296481132507, "logits/rejected": -0.7522517442703247, "logps/chosen": -446.72607421875, "logps/rejected": -596.8885498046875, "loss": 0.4738, "rewards/accuracies": 0.75, "rewards/chosen": -2.3675601482391357, "rewards/margins": 1.335943579673767, "rewards/rejected": -3.703503370285034, "step": 2280 }, { "epoch": 0.5494241842610365, "grad_norm": 17.56101375995895, "learning_rate": 2.5052345880399456e-07, "logits/chosen": -0.8398829698562622, "logits/rejected": -0.8727308511734009, "logps/chosen": -449.156982421875, "logps/rejected": -570.5628662109375, "loss": 0.434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.23502779006958, "rewards/margins": 1.1653441190719604, "rewards/rejected": -3.400371551513672, "step": 2290 }, { "epoch": 0.5518234165067178, "grad_norm": 11.302567518557918, "learning_rate": 2.4842963276769555e-07, "logits/chosen": -0.7016631364822388, "logits/rejected": -0.6639502048492432, "logps/chosen": -437.28399658203125, "logps/rejected": -608.0369873046875, "loss": 0.4484, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.180513858795166, "rewards/margins": 1.2660033702850342, "rewards/rejected": -3.446516752243042, "step": 2300 }, { "epoch": 0.5542226487523992, "grad_norm": 11.742895780710647, "learning_rate": 2.463359168864189e-07, "logits/chosen": -0.6675876379013062, "logits/rejected": -0.7527247667312622, "logps/chosen": -516.6282958984375, "logps/rejected": -603.8510131835938, "loss": 0.4765, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1180880069732666, "rewards/margins": 1.259250283241272, "rewards/rejected": -3.377338409423828, "step": 2310 }, { "epoch": 0.5566218809980806, "grad_norm": 15.611988191868031, "learning_rate": 2.4424245802601555e-07, "logits/chosen": -0.7548837065696716, "logits/rejected": -0.7677526473999023, "logps/chosen": -436.63763427734375, "logps/rejected": -580.12109375, "loss": 0.448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0357186794281006, "rewards/margins": 0.8878978490829468, "rewards/rejected": -2.923617124557495, "step": 2320 }, { "epoch": 0.559021113243762, "grad_norm": 11.963131637466097, "learning_rate": 2.421494030343072e-07, "logits/chosen": -0.6403541564941406, "logits/rejected": -0.6971467137336731, "logps/chosen": -463.41668701171875, "logps/rejected": -511.3497009277344, "loss": 0.5057, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.029705047607422, "rewards/margins": 0.9956240653991699, "rewards/rejected": -3.025329113006592, "step": 2330 }, { "epoch": 0.5614203454894434, "grad_norm": 12.457563977370029, "learning_rate": 2.400568987307861e-07, "logits/chosen": -0.6719120740890503, "logits/rejected": -0.6879553198814392, "logps/chosen": -428.35992431640625, "logps/rejected": -484.6227111816406, "loss": 0.4224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0458950996398926, "rewards/margins": 0.6281440258026123, "rewards/rejected": -2.674039363861084, "step": 2340 }, { "epoch": 0.5638195777351248, "grad_norm": 11.92177172958945, "learning_rate": 2.379650918963156e-07, "logits/chosen": -0.7526946663856506, "logits/rejected": -0.7413941025733948, "logps/chosen": -406.4283447265625, "logps/rejected": -570.8126220703125, "loss": 0.4205, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.06895112991333, "rewards/margins": 1.500595211982727, "rewards/rejected": -3.5695462226867676, "step": 2350 }, { "epoch": 0.5662188099808061, "grad_norm": 16.624039363219666, "learning_rate": 2.3587412926283438e-07, "logits/chosen": -0.7837602496147156, "logits/rejected": -0.7445356249809265, "logps/chosen": -491.05059814453125, "logps/rejected": -638.3920288085938, "loss": 0.4727, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9546314477920532, "rewards/margins": 1.7906033992767334, "rewards/rejected": -3.745234966278076, "step": 2360 }, { "epoch": 0.5686180422264875, "grad_norm": 9.978773646984067, "learning_rate": 2.337841575030642e-07, "logits/chosen": -0.6353663802146912, "logits/rejected": -0.670727014541626, "logps/chosen": -482.7936096191406, "logps/rejected": -629.0820922851562, "loss": 0.4658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9853990077972412, "rewards/margins": 1.31783127784729, "rewards/rejected": -3.3032302856445312, "step": 2370 }, { "epoch": 0.5710172744721689, "grad_norm": 10.175503774195834, "learning_rate": 2.316953232202206e-07, "logits/chosen": -0.6700790524482727, "logits/rejected": -0.7611835598945618, "logps/chosen": -429.82666015625, "logps/rejected": -465.277099609375, "loss": 0.423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9541041851043701, "rewards/margins": 0.9658061861991882, "rewards/rejected": -2.9199106693267822, "step": 2380 }, { "epoch": 0.5734165067178503, "grad_norm": 11.211484923712767, "learning_rate": 2.2960777293772958e-07, "logits/chosen": -0.6513696908950806, "logits/rejected": -0.6979095339775085, "logps/chosen": -405.17120361328125, "logps/rejected": -547.6156005859375, "loss": 0.4611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.898032546043396, "rewards/margins": 1.5655709505081177, "rewards/rejected": -3.4636034965515137, "step": 2390 }, { "epoch": 0.5758157389635317, "grad_norm": 10.432280326904475, "learning_rate": 2.2752165308894974e-07, "logits/chosen": -0.6997479200363159, "logits/rejected": -0.6795819401741028, "logps/chosen": -405.59454345703125, "logps/rejected": -530.2125244140625, "loss": 0.4443, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1597695350646973, "rewards/margins": 1.3249900341033936, "rewards/rejected": -3.484759569168091, "step": 2400 }, { "epoch": 0.5782149712092131, "grad_norm": 16.8329792009058, "learning_rate": 2.254371100069005e-07, "logits/chosen": -0.6621764898300171, "logits/rejected": -0.612916111946106, "logps/chosen": -440.20538330078125, "logps/rejected": -585.4799194335938, "loss": 0.4638, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.897138237953186, "rewards/margins": 1.18193781375885, "rewards/rejected": -3.0790762901306152, "step": 2410 }, { "epoch": 0.5806142034548945, "grad_norm": 16.7833808451815, "learning_rate": 2.2335428991399725e-07, "logits/chosen": -0.6124377846717834, "logits/rejected": -0.6342424154281616, "logps/chosen": -424.91595458984375, "logps/rejected": -682.1309814453125, "loss": 0.4579, "rewards/accuracies": 0.75, "rewards/chosen": -2.3315539360046387, "rewards/margins": 2.402623414993286, "rewards/rejected": -4.734177112579346, "step": 2420 }, { "epoch": 0.5830134357005758, "grad_norm": 11.37683798430042, "learning_rate": 2.2127333891179458e-07, "logits/chosen": -0.6946436166763306, "logits/rejected": -0.7337637543678284, "logps/chosen": -401.97235107421875, "logps/rejected": -590.7584228515625, "loss": 0.4648, "rewards/accuracies": 0.75, "rewards/chosen": -1.964185118675232, "rewards/margins": 1.5734838247299194, "rewards/rejected": -3.5376694202423096, "step": 2430 }, { "epoch": 0.5854126679462572, "grad_norm": 14.137637654420558, "learning_rate": 2.1919440297073782e-07, "logits/chosen": -0.6848565340042114, "logits/rejected": -0.7031614184379578, "logps/chosen": -389.65863037109375, "logps/rejected": -572.2056884765625, "loss": 0.4726, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.85574471950531, "rewards/margins": 1.6927335262298584, "rewards/rejected": -3.548478364944458, "step": 2440 }, { "epoch": 0.5878119001919386, "grad_norm": 11.976818733935065, "learning_rate": 2.1711762791992368e-07, "logits/chosen": -0.6271128058433533, "logits/rejected": -0.6209608316421509, "logps/chosen": -453.7582092285156, "logps/rejected": -558.2598876953125, "loss": 0.464, "rewards/accuracies": 0.875, "rewards/chosen": -1.6763954162597656, "rewards/margins": 1.3264000415802002, "rewards/rejected": -3.002795696258545, "step": 2450 }, { "epoch": 0.5902111324376199, "grad_norm": 12.353087005836548, "learning_rate": 2.1504315943687114e-07, "logits/chosen": -0.7643033862113953, "logits/rejected": -0.736054539680481, "logps/chosen": -416.31512451171875, "logps/rejected": -612.2337646484375, "loss": 0.4454, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.881116509437561, "rewards/margins": 1.4656778573989868, "rewards/rejected": -3.346794605255127, "step": 2460 }, { "epoch": 0.5926103646833013, "grad_norm": 15.676634180170838, "learning_rate": 2.1297114303730248e-07, "logits/chosen": -0.6767653226852417, "logits/rejected": -0.6186730265617371, "logps/chosen": -407.5095520019531, "logps/rejected": -585.6527709960938, "loss": 0.4968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.816948652267456, "rewards/margins": 1.2787069082260132, "rewards/rejected": -3.0956554412841797, "step": 2470 }, { "epoch": 0.5950095969289827, "grad_norm": 12.693263479126406, "learning_rate": 2.1090172406493616e-07, "logits/chosen": -0.6466863751411438, "logits/rejected": -0.6392595171928406, "logps/chosen": -390.559326171875, "logps/rejected": -549.0421142578125, "loss": 0.4125, "rewards/accuracies": 0.875, "rewards/chosen": -1.5861185789108276, "rewards/margins": 1.4114429950714111, "rewards/rejected": -2.997561454772949, "step": 2480 }, { "epoch": 0.5974088291746641, "grad_norm": 14.530343733322098, "learning_rate": 2.0883504768129146e-07, "logits/chosen": -0.7057468295097351, "logits/rejected": -0.7234060764312744, "logps/chosen": -470.16229248046875, "logps/rejected": -603.843994140625, "loss": 0.478, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9950097799301147, "rewards/margins": 1.3909389972686768, "rewards/rejected": -3.385948896408081, "step": 2490 }, { "epoch": 0.5998080614203455, "grad_norm": 13.287779534586859, "learning_rate": 2.0677125885550571e-07, "logits/chosen": -0.5642179846763611, "logits/rejected": -0.6275255084037781, "logps/chosen": -422.0602111816406, "logps/rejected": -509.85833740234375, "loss": 0.4554, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8453181982040405, "rewards/margins": 1.3027547597885132, "rewards/rejected": -3.1480727195739746, "step": 2500 }, { "epoch": 0.6022072936660269, "grad_norm": 15.517152991224327, "learning_rate": 2.0471050235416587e-07, "logits/chosen": -0.6314137578010559, "logits/rejected": -0.7092536091804504, "logps/chosen": -456.7703552246094, "logps/rejected": -560.7614135742188, "loss": 0.4175, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.005941390991211, "rewards/margins": 1.4926408529281616, "rewards/rejected": -3.498582363128662, "step": 2510 }, { "epoch": 0.6046065259117083, "grad_norm": 15.430444876775148, "learning_rate": 2.026529227311532e-07, "logits/chosen": -0.696445107460022, "logits/rejected": -0.6817110776901245, "logps/chosen": -419.6112365722656, "logps/rejected": -545.1299438476562, "loss": 0.4725, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.074556827545166, "rewards/margins": 1.122429609298706, "rewards/rejected": -3.196986436843872, "step": 2520 }, { "epoch": 0.6070057581573897, "grad_norm": 12.135680251339695, "learning_rate": 2.005986643175036e-07, "logits/chosen": -0.6033408045768738, "logits/rejected": -0.5708281993865967, "logps/chosen": -444.126220703125, "logps/rejected": -621.4119873046875, "loss": 0.3872, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6977285146713257, "rewards/margins": 1.8118762969970703, "rewards/rejected": -3.5096049308776855, "step": 2530 }, { "epoch": 0.6094049904030711, "grad_norm": 16.900656042802993, "learning_rate": 1.9854787121128328e-07, "logits/chosen": -0.6752146482467651, "logits/rejected": -0.6918280124664307, "logps/chosen": -413.39178466796875, "logps/rejected": -453.151123046875, "loss": 0.4863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9585962295532227, "rewards/margins": 0.8429145812988281, "rewards/rejected": -2.8015105724334717, "step": 2540 }, { "epoch": 0.6118042226487524, "grad_norm": 14.460169065470863, "learning_rate": 1.9650068726748106e-07, "logits/chosen": -0.6060270667076111, "logits/rejected": -0.6441822052001953, "logps/chosen": -460.1835021972656, "logps/rejected": -609.755126953125, "loss": 0.4705, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.047046184539795, "rewards/margins": 1.4443639516830444, "rewards/rejected": -3.4914097785949707, "step": 2550 }, { "epoch": 0.6142034548944337, "grad_norm": 14.47008564939435, "learning_rate": 1.9445725608791718e-07, "logits/chosen": -0.6000035405158997, "logits/rejected": -0.6268490552902222, "logps/chosen": -412.12322998046875, "logps/rejected": -622.6077270507812, "loss": 0.4549, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5705702304840088, "rewards/margins": 2.0281901359558105, "rewards/rejected": -3.5987606048583984, "step": 2560 }, { "epoch": 0.6166026871401151, "grad_norm": 16.15106890491746, "learning_rate": 1.924177210111705e-07, "logits/chosen": -0.6954480409622192, "logits/rejected": -0.7049099206924438, "logps/chosen": -394.63970947265625, "logps/rejected": -582.3255004882812, "loss": 0.4508, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.777197241783142, "rewards/margins": 1.7624857425689697, "rewards/rejected": -3.5396828651428223, "step": 2570 }, { "epoch": 0.6190019193857965, "grad_norm": 11.273315658642467, "learning_rate": 1.9038222510252364e-07, "logits/chosen": -0.6852430105209351, "logits/rejected": -0.6738708019256592, "logps/chosen": -441.40020751953125, "logps/rejected": -517.69921875, "loss": 0.4767, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9201513528823853, "rewards/margins": 0.8821185231208801, "rewards/rejected": -2.8022701740264893, "step": 2580 }, { "epoch": 0.6214011516314779, "grad_norm": 13.419885609170674, "learning_rate": 1.883509111439277e-07, "logits/chosen": -0.5949097275733948, "logits/rejected": -0.5894945859909058, "logps/chosen": -434.9554138183594, "logps/rejected": -670.6376953125, "loss": 0.4836, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0458106994628906, "rewards/margins": 1.706055998802185, "rewards/rejected": -3.7518672943115234, "step": 2590 }, { "epoch": 0.6238003838771593, "grad_norm": 10.25337511089072, "learning_rate": 1.8632392162398665e-07, "logits/chosen": -0.6396509408950806, "logits/rejected": -0.6442984342575073, "logps/chosen": -447.4217834472656, "logps/rejected": -649.7928466796875, "loss": 0.4354, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6608057022094727, "rewards/margins": 2.017603874206543, "rewards/rejected": -3.6784095764160156, "step": 2600 }, { "epoch": 0.6261996161228407, "grad_norm": 12.615964200890309, "learning_rate": 1.84301398727962e-07, "logits/chosen": -0.6086243391036987, "logits/rejected": -0.5457442998886108, "logps/chosen": -363.9615173339844, "logps/rejected": -625.5221557617188, "loss": 0.4274, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8102554082870483, "rewards/margins": 2.2328672409057617, "rewards/rejected": -4.0431227684021, "step": 2610 }, { "epoch": 0.6285988483685221, "grad_norm": 13.561387993292696, "learning_rate": 1.8228348432779966e-07, "logits/chosen": -0.6876164078712463, "logits/rejected": -0.6831247210502625, "logps/chosen": -427.48388671875, "logps/rejected": -554.5697631835938, "loss": 0.5102, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0380465984344482, "rewards/margins": 1.3742176294326782, "rewards/rejected": -3.412264347076416, "step": 2620 }, { "epoch": 0.6309980806142035, "grad_norm": 11.884741198970012, "learning_rate": 1.8027031997217773e-07, "logits/chosen": -0.7005245089530945, "logits/rejected": -0.7127174139022827, "logps/chosen": -418.88031005859375, "logps/rejected": -650.9946899414062, "loss": 0.3794, "rewards/accuracies": 0.875, "rewards/chosen": -2.1531500816345215, "rewards/margins": 2.098936080932617, "rewards/rejected": -4.2520856857299805, "step": 2630 }, { "epoch": 0.6333973128598849, "grad_norm": 13.756259667587228, "learning_rate": 1.7826204687657758e-07, "logits/chosen": -0.6128356456756592, "logits/rejected": -0.5883530378341675, "logps/chosen": -479.2039489746094, "logps/rejected": -541.1580200195312, "loss": 0.4189, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0355639457702637, "rewards/margins": 1.0127185583114624, "rewards/rejected": -3.0482823848724365, "step": 2640 }, { "epoch": 0.6357965451055663, "grad_norm": 16.334230649896075, "learning_rate": 1.762588059133781e-07, "logits/chosen": -0.5741305947303772, "logits/rejected": -0.5979640483856201, "logps/chosen": -492.15985107421875, "logps/rejected": -616.077880859375, "loss": 0.4344, "rewards/accuracies": 0.875, "rewards/chosen": -2.001828670501709, "rewards/margins": 1.6204664707183838, "rewards/rejected": -3.6222949028015137, "step": 2650 }, { "epoch": 0.6381957773512476, "grad_norm": 12.818149694157945, "learning_rate": 1.7426073760197406e-07, "logits/chosen": -0.7115119099617004, "logits/rejected": -0.7030835151672363, "logps/chosen": -436.45440673828125, "logps/rejected": -656.4097900390625, "loss": 0.464, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0036306381225586, "rewards/margins": 1.860400915145874, "rewards/rejected": -3.864032030105591, "step": 2660 }, { "epoch": 0.6405950095969289, "grad_norm": 10.772055196711287, "learning_rate": 1.7226798209891935e-07, "logits/chosen": -0.5705487132072449, "logits/rejected": -0.6100784540176392, "logps/chosen": -454.169677734375, "logps/rejected": -567.9891967773438, "loss": 0.4243, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.027036666870117, "rewards/margins": 1.6898279190063477, "rewards/rejected": -3.716864824295044, "step": 2670 }, { "epoch": 0.6429942418426103, "grad_norm": 12.732993242920942, "learning_rate": 1.7028067918809535e-07, "logits/chosen": -0.6443219184875488, "logits/rejected": -0.6607564687728882, "logps/chosen": -408.63385009765625, "logps/rejected": -678.4671630859375, "loss": 0.4429, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9784246683120728, "rewards/margins": 2.1958975791931152, "rewards/rejected": -4.174322605133057, "step": 2680 }, { "epoch": 0.6453934740882917, "grad_norm": 16.43098937212258, "learning_rate": 1.6829896827090584e-07, "logits/chosen": -0.6939103007316589, "logits/rejected": -0.6823415756225586, "logps/chosen": -444.9854431152344, "logps/rejected": -528.9821166992188, "loss": 0.4604, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0313303470611572, "rewards/margins": 1.1062291860580444, "rewards/rejected": -3.137559652328491, "step": 2690 }, { "epoch": 0.6477927063339731, "grad_norm": 9.363672145947863, "learning_rate": 1.6632298835649844e-07, "logits/chosen": -0.5836836099624634, "logits/rejected": -0.5799709558486938, "logps/chosen": -470.08197021484375, "logps/rejected": -686.9669799804688, "loss": 0.4069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0098516941070557, "rewards/margins": 1.7816604375839233, "rewards/rejected": -3.7915122509002686, "step": 2700 }, { "epoch": 0.6501919385796545, "grad_norm": 17.2324020009346, "learning_rate": 1.6435287805201364e-07, "logits/chosen": -0.5617779493331909, "logits/rejected": -0.5524694919586182, "logps/chosen": -467.46075439453125, "logps/rejected": -561.6852416992188, "loss": 0.4668, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1388306617736816, "rewards/margins": 1.0130404233932495, "rewards/rejected": -3.1518714427948, "step": 2710 }, { "epoch": 0.6525911708253359, "grad_norm": 12.016483428799015, "learning_rate": 1.6238877555286207e-07, "logits/chosen": -0.6310284733772278, "logits/rejected": -0.6076905727386475, "logps/chosen": -451.049560546875, "logps/rejected": -625.4998779296875, "loss": 0.4317, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.715829849243164, "rewards/margins": 1.5645663738250732, "rewards/rejected": -3.280395984649658, "step": 2720 }, { "epoch": 0.6549904030710173, "grad_norm": 13.942118191847904, "learning_rate": 1.60430818633031e-07, "logits/chosen": -0.6367970705032349, "logits/rejected": -0.6443176865577698, "logps/chosen": -442.45428466796875, "logps/rejected": -603.2288818359375, "loss": 0.4292, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.833968162536621, "rewards/margins": 1.6198402643203735, "rewards/rejected": -3.453808546066284, "step": 2730 }, { "epoch": 0.6573896353166987, "grad_norm": 12.960755123491403, "learning_rate": 1.5847914463541939e-07, "logits/chosen": -0.6094954013824463, "logits/rejected": -0.616841197013855, "logps/chosen": -387.78448486328125, "logps/rejected": -546.4832763671875, "loss": 0.4165, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.933266043663025, "rewards/margins": 1.3234989643096924, "rewards/rejected": -3.2567648887634277, "step": 2740 }, { "epoch": 0.6597888675623801, "grad_norm": 11.216331431155993, "learning_rate": 1.5653389046220427e-07, "logits/chosen": -0.558444619178772, "logits/rejected": -0.5738928318023682, "logps/chosen": -416.5931091308594, "logps/rejected": -565.6954345703125, "loss": 0.4206, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8503406047821045, "rewards/margins": 1.2912790775299072, "rewards/rejected": -3.141619920730591, "step": 2750 }, { "epoch": 0.6621880998080614, "grad_norm": 14.796243391579123, "learning_rate": 1.545951925652375e-07, "logits/chosen": -0.5394322872161865, "logits/rejected": -0.5567634105682373, "logps/chosen": -502.76947021484375, "logps/rejected": -626.8363037109375, "loss": 0.4198, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9490293264389038, "rewards/margins": 1.7876237630844116, "rewards/rejected": -3.7366535663604736, "step": 2760 }, { "epoch": 0.6645873320537428, "grad_norm": 13.413386618717803, "learning_rate": 1.5266318693647423e-07, "logits/chosen": -0.5417942404747009, "logits/rejected": -0.5404913425445557, "logps/chosen": -455.65264892578125, "logps/rejected": -571.8460693359375, "loss": 0.4333, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9460630416870117, "rewards/margins": 1.299930453300476, "rewards/rejected": -3.2459938526153564, "step": 2770 }, { "epoch": 0.6669865642994242, "grad_norm": 16.515416936082715, "learning_rate": 1.5073800909843353e-07, "logits/chosen": -0.5896440744400024, "logits/rejected": -0.6135233640670776, "logps/chosen": -448.375732421875, "logps/rejected": -550.3060302734375, "loss": 0.4321, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8330185413360596, "rewards/margins": 1.5391408205032349, "rewards/rejected": -3.372159481048584, "step": 2780 }, { "epoch": 0.6693857965451055, "grad_norm": 16.63287415870903, "learning_rate": 1.488197940946922e-07, "logits/chosen": -0.5957229733467102, "logits/rejected": -0.5872025489807129, "logps/chosen": -433.13287353515625, "logps/rejected": -544.17724609375, "loss": 0.4116, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5587198734283447, "rewards/margins": 1.6922566890716553, "rewards/rejected": -3.250977039337158, "step": 2790 }, { "epoch": 0.6717850287907869, "grad_norm": 17.943322401731898, "learning_rate": 1.4690867648041167e-07, "logits/chosen": -0.5465134978294373, "logits/rejected": -0.5925148725509644, "logps/chosen": -427.1700134277344, "logps/rejected": -597.7564697265625, "loss": 0.463, "rewards/accuracies": 0.875, "rewards/chosen": -1.7293113470077515, "rewards/margins": 1.9395873546600342, "rewards/rejected": -3.668898820877075, "step": 2800 }, { "epoch": 0.6741842610364683, "grad_norm": 13.110879243171492, "learning_rate": 1.4500479031289987e-07, "logits/chosen": -0.5818893313407898, "logits/rejected": -0.6302607655525208, "logps/chosen": -445.3763732910156, "logps/rejected": -590.7268676757812, "loss": 0.4776, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6836143732070923, "rewards/margins": 1.5271230936050415, "rewards/rejected": -3.2107372283935547, "step": 2810 }, { "epoch": 0.6765834932821497, "grad_norm": 12.058976212342188, "learning_rate": 1.4310826914220747e-07, "logits/chosen": -0.6128555536270142, "logits/rejected": -0.6189436912536621, "logps/chosen": -497.07183837890625, "logps/rejected": -601.4428100585938, "loss": 0.4442, "rewards/accuracies": 0.75, "rewards/chosen": -1.8764116764068604, "rewards/margins": 1.2488231658935547, "rewards/rejected": -3.125235080718994, "step": 2820 }, { "epoch": 0.6789827255278311, "grad_norm": 17.97354363119042, "learning_rate": 1.412192460017597e-07, "logits/chosen": -0.6210779547691345, "logits/rejected": -0.6108576655387878, "logps/chosen": -455.32391357421875, "logps/rejected": -592.3138427734375, "loss": 0.4641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1085453033447266, "rewards/margins": 1.3274444341659546, "rewards/rejected": -3.4359898567199707, "step": 2830 }, { "epoch": 0.6813819577735125, "grad_norm": 11.08581581164283, "learning_rate": 1.3933785339902504e-07, "logits/chosen": -0.6120859384536743, "logits/rejected": -0.5777018666267395, "logps/chosen": -392.5328674316406, "logps/rejected": -571.60986328125, "loss": 0.4874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9550358057022095, "rewards/margins": 1.3144079446792603, "rewards/rejected": -3.2694435119628906, "step": 2840 }, { "epoch": 0.6837811900191939, "grad_norm": 11.419675800311689, "learning_rate": 1.374642233062197e-07, "logits/chosen": -0.5789315104484558, "logits/rejected": -0.5940367579460144, "logps/chosen": -489.6767578125, "logps/rejected": -603.5349731445312, "loss": 0.4489, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0280773639678955, "rewards/margins": 1.4480621814727783, "rewards/rejected": -3.476139545440674, "step": 2850 }, { "epoch": 0.6861804222648752, "grad_norm": 12.571278512714647, "learning_rate": 1.355984871510511e-07, "logits/chosen": -0.5551937818527222, "logits/rejected": -0.526736319065094, "logps/chosen": -485.8904724121094, "logps/rejected": -637.25341796875, "loss": 0.425, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9234853982925415, "rewards/margins": 1.3963334560394287, "rewards/rejected": -3.3198189735412598, "step": 2860 }, { "epoch": 0.6885796545105566, "grad_norm": 10.614013070716824, "learning_rate": 1.3374077580749783e-07, "logits/chosen": -0.576995313167572, "logits/rejected": -0.5995679497718811, "logps/chosen": -382.17498779296875, "logps/rejected": -547.46484375, "loss": 0.4393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.897216796875, "rewards/margins": 1.4525885581970215, "rewards/rejected": -3.3498051166534424, "step": 2870 }, { "epoch": 0.690978886756238, "grad_norm": 18.09913751507019, "learning_rate": 1.3189121958663024e-07, "logits/chosen": -0.5638588666915894, "logits/rejected": -0.6245552897453308, "logps/chosen": -522.0628662109375, "logps/rejected": -575.4417724609375, "loss": 0.4336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.396254062652588, "rewards/margins": 0.8919731974601746, "rewards/rejected": -3.2882275581359863, "step": 2880 }, { "epoch": 0.6933781190019194, "grad_norm": 15.80771289819406, "learning_rate": 1.3004994822746895e-07, "logits/chosen": -0.7042198181152344, "logits/rejected": -0.697306752204895, "logps/chosen": -436.90386962890625, "logps/rejected": -577.8507080078125, "loss": 0.4769, "rewards/accuracies": 0.75, "rewards/chosen": -1.9693387746810913, "rewards/margins": 1.3294403553009033, "rewards/rejected": -3.298779010772705, "step": 2890 }, { "epoch": 0.6957773512476008, "grad_norm": 11.587138387929464, "learning_rate": 1.2821709088788434e-07, "logits/chosen": -0.5107399821281433, "logits/rejected": -0.5425523519515991, "logps/chosen": -380.4593505859375, "logps/rejected": -541.3460693359375, "loss": 0.4398, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7053035497665405, "rewards/margins": 1.585318922996521, "rewards/rejected": -3.2906222343444824, "step": 2900 }, { "epoch": 0.6981765834932822, "grad_norm": 15.327291081862692, "learning_rate": 1.2639277613553736e-07, "logits/chosen": -0.5802925825119019, "logits/rejected": -0.5720899105072021, "logps/chosen": -379.57318115234375, "logps/rejected": -496.7958068847656, "loss": 0.4473, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8436704874038696, "rewards/margins": 1.1200520992279053, "rewards/rejected": -2.9637222290039062, "step": 2910 }, { "epoch": 0.7005758157389635, "grad_norm": 11.468692622260464, "learning_rate": 1.2457713193885975e-07, "logits/chosen": -0.5771138072013855, "logits/rejected": -0.5807372331619263, "logps/chosen": -359.72613525390625, "logps/rejected": -550.9989013671875, "loss": 0.4176, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8891617059707642, "rewards/margins": 1.5464417934417725, "rewards/rejected": -3.435603618621826, "step": 2920 }, { "epoch": 0.7029750479846449, "grad_norm": 17.115987201031448, "learning_rate": 1.2277028565807838e-07, "logits/chosen": -0.5637086629867554, "logits/rejected": -0.584968090057373, "logps/chosen": -446.92767333984375, "logps/rejected": -574.7530517578125, "loss": 0.468, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9414465427398682, "rewards/margins": 1.3546481132507324, "rewards/rejected": -3.2960944175720215, "step": 2930 }, { "epoch": 0.7053742802303263, "grad_norm": 17.441141869897656, "learning_rate": 1.209723640362815e-07, "logits/chosen": -0.5792838335037231, "logits/rejected": -0.5826687216758728, "logps/chosen": -456.9283142089844, "logps/rejected": -620.7843627929688, "loss": 0.4813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8888061046600342, "rewards/margins": 1.7339365482330322, "rewards/rejected": -3.6227424144744873, "step": 2940 }, { "epoch": 0.7077735124760077, "grad_norm": 11.312151269139996, "learning_rate": 1.191834931905277e-07, "logits/chosen": -0.5471521615982056, "logits/rejected": -0.5616979598999023, "logps/chosen": -510.0934143066406, "logps/rejected": -647.6993408203125, "loss": 0.4134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.116403341293335, "rewards/margins": 1.3659651279449463, "rewards/rejected": -3.482367992401123, "step": 2950 }, { "epoch": 0.710172744721689, "grad_norm": 13.959291203129078, "learning_rate": 1.1740379860299988e-07, "logits/chosen": -0.5202777981758118, "logits/rejected": -0.5581659078598022, "logps/chosen": -476.20635986328125, "logps/rejected": -612.1878662109375, "loss": 0.4634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9924306869506836, "rewards/margins": 1.1773386001586914, "rewards/rejected": -3.169769287109375, "step": 2960 }, { "epoch": 0.7125719769673704, "grad_norm": 13.163034374925202, "learning_rate": 1.1563340511220254e-07, "logits/chosen": -0.5559359788894653, "logits/rejected": -0.5668517351150513, "logps/chosen": -511.17333984375, "logps/rejected": -623.8465576171875, "loss": 0.4817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2064290046691895, "rewards/margins": 1.2941501140594482, "rewards/rejected": -3.5005791187286377, "step": 2970 }, { "epoch": 0.7149712092130518, "grad_norm": 12.726063519299634, "learning_rate": 1.1387243690420556e-07, "logits/chosen": -0.5109056234359741, "logits/rejected": -0.5152195692062378, "logps/chosen": -488.728759765625, "logps/rejected": -659.3638305664062, "loss": 0.4383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.865386724472046, "rewards/margins": 1.8146740198135376, "rewards/rejected": -3.680060863494873, "step": 2980 }, { "epoch": 0.7173704414587332, "grad_norm": 15.060517596878292, "learning_rate": 1.1212101750393235e-07, "logits/chosen": -0.5651146173477173, "logits/rejected": -0.5633836984634399, "logps/chosen": -457.1798400878906, "logps/rejected": -594.5474853515625, "loss": 0.4016, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1187145709991455, "rewards/margins": 1.5976879596710205, "rewards/rejected": -3.716402530670166, "step": 2990 }, { "epoch": 0.7197696737044146, "grad_norm": 10.840848096813268, "learning_rate": 1.1037926976649562e-07, "logits/chosen": -0.6062291860580444, "logits/rejected": -0.5924742817878723, "logps/chosen": -467.4085388183594, "logps/rejected": -654.9591674804688, "loss": 0.4878, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1459498405456543, "rewards/margins": 1.5907325744628906, "rewards/rejected": -3.736682415008545, "step": 3000 }, { "epoch": 0.722168905950096, "grad_norm": 15.042187958894935, "learning_rate": 1.0864731586857936e-07, "logits/chosen": -0.4485263228416443, "logits/rejected": -0.44067448377609253, "logps/chosen": -470.13055419921875, "logps/rejected": -602.8482666015625, "loss": 0.4195, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9293149709701538, "rewards/margins": 1.6746339797973633, "rewards/rejected": -3.6039490699768066, "step": 3010 }, { "epoch": 0.7245681381957774, "grad_norm": 12.194246416479364, "learning_rate": 1.0692527729986839e-07, "logits/chosen": -0.5851191282272339, "logits/rejected": -0.592607855796814, "logps/chosen": -451.42376708984375, "logps/rejected": -582.010498046875, "loss": 0.3994, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9533573389053345, "rewards/margins": 1.4484539031982422, "rewards/rejected": -3.401811122894287, "step": 3020 }, { "epoch": 0.7269673704414588, "grad_norm": 14.340146970439987, "learning_rate": 1.0521327485452692e-07, "logits/chosen": -0.5049649477005005, "logits/rejected": -0.5048503875732422, "logps/chosen": -444.41015625, "logps/rejected": -574.8997802734375, "loss": 0.4438, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.013583183288574, "rewards/margins": 1.4341424703598022, "rewards/rejected": -3.447725296020508, "step": 3030 }, { "epoch": 0.7293666026871402, "grad_norm": 17.094188348902286, "learning_rate": 1.0351142862272468e-07, "logits/chosen": -0.4877733290195465, "logits/rejected": -0.542160153388977, "logps/chosen": -409.5499267578125, "logps/rejected": -598.6688232421875, "loss": 0.4483, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.956335425376892, "rewards/margins": 1.96954345703125, "rewards/rejected": -3.9258790016174316, "step": 3040 }, { "epoch": 0.7317658349328215, "grad_norm": 16.384363468476412, "learning_rate": 1.0181985798221343e-07, "logits/chosen": -0.44771862030029297, "logits/rejected": -0.48205646872520447, "logps/chosen": -469.80364990234375, "logps/rejected": -642.3453979492188, "loss": 0.4837, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.060243844985962, "rewards/margins": 1.60220468044281, "rewards/rejected": -3.6624481678009033, "step": 3050 }, { "epoch": 0.7341650671785028, "grad_norm": 15.159551813260798, "learning_rate": 1.0013868158995329e-07, "logits/chosen": -0.4246044158935547, "logits/rejected": -0.45489010214805603, "logps/chosen": -471.446533203125, "logps/rejected": -587.3192138671875, "loss": 0.4687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.233337879180908, "rewards/margins": 1.3818399906158447, "rewards/rejected": -3.615177869796753, "step": 3060 }, { "epoch": 0.7365642994241842, "grad_norm": 13.463524340329029, "learning_rate": 9.84680173737887e-08, "logits/chosen": -0.5496365427970886, "logits/rejected": -0.569757878780365, "logps/chosen": -493.57843017578125, "logps/rejected": -592.6453247070312, "loss": 0.4307, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2892556190490723, "rewards/margins": 1.3507412672042847, "rewards/rejected": -3.6399970054626465, "step": 3070 }, { "epoch": 0.7389635316698656, "grad_norm": 12.455106651157736, "learning_rate": 9.680798252417713e-08, "logits/chosen": -0.5762359499931335, "logits/rejected": -0.5943504571914673, "logps/chosen": -403.1676025390625, "logps/rejected": -588.2984619140625, "loss": 0.4171, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9747021198272705, "rewards/margins": 1.486598014831543, "rewards/rejected": -3.4612998962402344, "step": 3080 }, { "epoch": 0.741362763915547, "grad_norm": 14.204008950257876, "learning_rate": 9.515869348596808e-08, "logits/chosen": -0.5742790699005127, "logits/rejected": -0.6299481987953186, "logps/chosen": -498.3915100097656, "logps/rejected": -616.3846435546875, "loss": 0.453, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0285322666168213, "rewards/margins": 1.541265845298767, "rewards/rejected": -3.569798231124878, "step": 3090 }, { "epoch": 0.7437619961612284, "grad_norm": 11.674082631607327, "learning_rate": 9.352026595023493e-08, "logits/chosen": -0.6226130723953247, "logits/rejected": -0.6168379783630371, "logps/chosen": -476.732666015625, "logps/rejected": -552.8656005859375, "loss": 0.4441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.860538125038147, "rewards/margins": 1.0402759313583374, "rewards/rejected": -2.9008140563964844, "step": 3100 }, { "epoch": 0.7461612284069098, "grad_norm": 13.589284320232862, "learning_rate": 9.189281484616004e-08, "logits/chosen": -0.5403670072555542, "logits/rejected": -0.5388067960739136, "logps/chosen": -406.925048828125, "logps/rejected": -575.8323974609375, "loss": 0.4779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1276307106018066, "rewards/margins": 1.1783473491668701, "rewards/rejected": -3.3059780597686768, "step": 3110 }, { "epoch": 0.7485604606525912, "grad_norm": 14.083306203022303, "learning_rate": 9.027645433297249e-08, "logits/chosen": -0.5009843707084656, "logits/rejected": -0.5172004699707031, "logps/chosen": -545.912109375, "logps/rejected": -643.9588012695312, "loss": 0.4964, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4063868522644043, "rewards/margins": 1.3331215381622314, "rewards/rejected": -3.7395083904266357, "step": 3120 }, { "epoch": 0.7509596928982726, "grad_norm": 15.054060697910124, "learning_rate": 8.867129779194066e-08, "logits/chosen": -0.6103423833847046, "logits/rejected": -0.6305662393569946, "logps/chosen": -362.4765930175781, "logps/rejected": -533.6757202148438, "loss": 0.4347, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5669844150543213, "rewards/margins": 1.7321140766143799, "rewards/rejected": -3.2990989685058594, "step": 3130 }, { "epoch": 0.753358925143954, "grad_norm": 15.171972092803582, "learning_rate": 8.707745781841866e-08, "logits/chosen": -0.5541486144065857, "logits/rejected": -0.5569981932640076, "logps/chosen": -399.5630187988281, "logps/rejected": -563.7500610351562, "loss": 0.4655, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.850064992904663, "rewards/margins": 1.5996869802474976, "rewards/rejected": -3.44975209236145, "step": 3140 }, { "epoch": 0.7557581573896354, "grad_norm": 7.987545135931887, "learning_rate": 8.549504621394831e-08, "logits/chosen": -0.6387466192245483, "logits/rejected": -0.629570484161377, "logps/chosen": -387.7325134277344, "logps/rejected": -584.460693359375, "loss": 0.3641, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.571352243423462, "rewards/margins": 1.9252641201019287, "rewards/rejected": -3.4966163635253906, "step": 3150 }, { "epoch": 0.7581573896353166, "grad_norm": 13.9921540642964, "learning_rate": 8.392417397841703e-08, "logits/chosen": -0.5311389565467834, "logits/rejected": -0.563139796257019, "logps/chosen": -412.6419982910156, "logps/rejected": -557.9671630859375, "loss": 0.4553, "rewards/accuracies": 0.875, "rewards/chosen": -1.664912223815918, "rewards/margins": 1.2914505004882812, "rewards/rejected": -2.956362724304199, "step": 3160 }, { "epoch": 0.760556621880998, "grad_norm": 10.54509313617014, "learning_rate": 8.236495130227083e-08, "logits/chosen": -0.507122278213501, "logits/rejected": -0.5326481461524963, "logps/chosen": -449.63214111328125, "logps/rejected": -611.7938232421875, "loss": 0.4536, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.660041093826294, "rewards/margins": 1.9526357650756836, "rewards/rejected": -3.6126770973205566, "step": 3170 }, { "epoch": 0.7629558541266794, "grad_norm": 19.060642084344988, "learning_rate": 8.081748755878612e-08, "logits/chosen": -0.5622953176498413, "logits/rejected": -0.6007119417190552, "logps/chosen": -473.05029296875, "logps/rejected": -541.9755249023438, "loss": 0.4346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0736916065216064, "rewards/margins": 1.2077754735946655, "rewards/rejected": -3.2814669609069824, "step": 3180 }, { "epoch": 0.7653550863723608, "grad_norm": 12.641909967755108, "learning_rate": 7.928189129639632e-08, "logits/chosen": -0.5046022534370422, "logits/rejected": -0.4738716185092926, "logps/chosen": -424.58990478515625, "logps/rejected": -587.5633544921875, "loss": 0.4025, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.909470796585083, "rewards/margins": 1.5023930072784424, "rewards/rejected": -3.411863327026367, "step": 3190 }, { "epoch": 0.7677543186180422, "grad_norm": 17.284721871893858, "learning_rate": 7.775827023107834e-08, "logits/chosen": -0.5278437733650208, "logits/rejected": -0.5451136827468872, "logps/chosen": -438.02001953125, "logps/rejected": -594.689453125, "loss": 0.4761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9781577587127686, "rewards/margins": 1.3622440099716187, "rewards/rejected": -3.340402126312256, "step": 3200 }, { "epoch": 0.7701535508637236, "grad_norm": 16.225895751875765, "learning_rate": 7.624673123879682e-08, "logits/chosen": -0.6049574017524719, "logits/rejected": -0.6234583854675293, "logps/chosen": -408.36663818359375, "logps/rejected": -536.5324096679688, "loss": 0.4467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7185170650482178, "rewards/margins": 1.4202756881713867, "rewards/rejected": -3.1387927532196045, "step": 3210 }, { "epoch": 0.772552783109405, "grad_norm": 11.145879658096279, "learning_rate": 7.474738034800663e-08, "logits/chosen": -0.6624782085418701, "logits/rejected": -0.6585075259208679, "logps/chosen": -374.86956787109375, "logps/rejected": -557.4089965820312, "loss": 0.4768, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.858591079711914, "rewards/margins": 1.8989025354385376, "rewards/rejected": -3.7574939727783203, "step": 3220 }, { "epoch": 0.7749520153550864, "grad_norm": 12.542053839469856, "learning_rate": 7.326032273221606e-08, "logits/chosen": -0.5727890133857727, "logits/rejected": -0.5879526138305664, "logps/chosen": -486.57806396484375, "logps/rejected": -597.93115234375, "loss": 0.41, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0051982402801514, "rewards/margins": 1.297196626663208, "rewards/rejected": -3.3023948669433594, "step": 3230 }, { "epoch": 0.7773512476007678, "grad_norm": 14.33501525091917, "learning_rate": 7.178566270260872e-08, "logits/chosen": -0.5633407235145569, "logits/rejected": -0.5676406621932983, "logps/chosen": -460.4888610839844, "logps/rejected": -608.435791015625, "loss": 0.4768, "rewards/accuracies": 0.75, "rewards/chosen": -2.092958927154541, "rewards/margins": 1.1691919565200806, "rewards/rejected": -3.262151002883911, "step": 3240 }, { "epoch": 0.7797504798464492, "grad_norm": 12.819177195288049, "learning_rate": 7.032350370072709e-08, "logits/chosen": -0.5324596166610718, "logits/rejected": -0.5585157871246338, "logps/chosen": -442.884765625, "logps/rejected": -604.0455932617188, "loss": 0.4107, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7358335256576538, "rewards/margins": 1.707724928855896, "rewards/rejected": -3.44355845451355, "step": 3250 }, { "epoch": 0.7821497120921305, "grad_norm": 12.690278052375158, "learning_rate": 6.887394829121596e-08, "logits/chosen": -0.5658280849456787, "logits/rejected": -0.6075069308280945, "logps/chosen": -454.86376953125, "logps/rejected": -674.8836059570312, "loss": 0.4234, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9878448247909546, "rewards/margins": 2.3019330501556396, "rewards/rejected": -4.289777755737305, "step": 3260 }, { "epoch": 0.7845489443378119, "grad_norm": 13.632469694204937, "learning_rate": 6.743709815462833e-08, "logits/chosen": -0.6113773584365845, "logits/rejected": -0.62415611743927, "logps/chosen": -446.94708251953125, "logps/rejected": -559.8250732421875, "loss": 0.4336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9104185104370117, "rewards/margins": 1.419710397720337, "rewards/rejected": -3.3301289081573486, "step": 3270 }, { "epoch": 0.7869481765834933, "grad_norm": 11.823636613615504, "learning_rate": 6.601305408029287e-08, "logits/chosen": -0.47927188873291016, "logits/rejected": -0.48892560601234436, "logps/chosen": -441.11334228515625, "logps/rejected": -583.4063720703125, "loss": 0.4456, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.027625560760498, "rewards/margins": 1.3957432508468628, "rewards/rejected": -3.4233689308166504, "step": 3280 }, { "epoch": 0.7893474088291746, "grad_norm": 16.011451751603392, "learning_rate": 6.460191595924366e-08, "logits/chosen": -0.5301553010940552, "logits/rejected": -0.5367687940597534, "logps/chosen": -456.830322265625, "logps/rejected": -578.0042724609375, "loss": 0.4282, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9470264911651611, "rewards/margins": 1.198061227798462, "rewards/rejected": -3.145087718963623, "step": 3290 }, { "epoch": 0.791746641074856, "grad_norm": 12.691724390773901, "learning_rate": 6.320378277721342e-08, "logits/chosen": -0.496354877948761, "logits/rejected": -0.5095658898353577, "logps/chosen": -461.4742126464844, "logps/rejected": -546.1395874023438, "loss": 0.452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.140702724456787, "rewards/margins": 0.8940545320510864, "rewards/rejected": -3.034757137298584, "step": 3300 }, { "epoch": 0.7941458733205374, "grad_norm": 16.823392530566938, "learning_rate": 6.181875260769032e-08, "logits/chosen": -0.560473620891571, "logits/rejected": -0.5936623811721802, "logps/chosen": -451.93463134765625, "logps/rejected": -540.117431640625, "loss": 0.4733, "rewards/accuracies": 0.75, "rewards/chosen": -1.6345758438110352, "rewards/margins": 1.5807870626449585, "rewards/rejected": -3.215363025665283, "step": 3310 }, { "epoch": 0.7965451055662188, "grad_norm": 14.520894739599417, "learning_rate": 6.044692260503797e-08, "logits/chosen": -0.5175925493240356, "logits/rejected": -0.5312203764915466, "logps/chosen": -503.57965087890625, "logps/rejected": -643.9119873046875, "loss": 0.3855, "rewards/accuracies": 0.875, "rewards/chosen": -2.1000008583068848, "rewards/margins": 1.6387672424316406, "rewards/rejected": -3.7387681007385254, "step": 3320 }, { "epoch": 0.7989443378119002, "grad_norm": 14.229849309645635, "learning_rate": 5.9088388997680984e-08, "logits/chosen": -0.5600963830947876, "logits/rejected": -0.5659655332565308, "logps/chosen": -531.041259765625, "logps/rejected": -604.8890380859375, "loss": 0.4186, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.036707639694214, "rewards/margins": 1.4657360315322876, "rewards/rejected": -3.502443313598633, "step": 3330 }, { "epoch": 0.8013435700575816, "grad_norm": 14.73962835970847, "learning_rate": 5.774324708135439e-08, "logits/chosen": -0.6263202428817749, "logits/rejected": -0.6441248059272766, "logps/chosen": -392.24835205078125, "logps/rejected": -516.92236328125, "loss": 0.4425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7824525833129883, "rewards/margins": 1.451863169670105, "rewards/rejected": -3.2343153953552246, "step": 3340 }, { "epoch": 0.803742802303263, "grad_norm": 10.763534598750553, "learning_rate": 5.641159121241953e-08, "logits/chosen": -0.5910140872001648, "logits/rejected": -0.5627475380897522, "logps/chosen": -398.29547119140625, "logps/rejected": -605.0794067382812, "loss": 0.453, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8737194538116455, "rewards/margins": 1.7353298664093018, "rewards/rejected": -3.609048366546631, "step": 3350 }, { "epoch": 0.8061420345489443, "grad_norm": 13.4623185493143, "learning_rate": 5.5093514801245106e-08, "logits/chosen": -0.497117817401886, "logits/rejected": -0.5237521529197693, "logps/chosen": -448.5010681152344, "logps/rejected": -618.0789794921875, "loss": 0.4387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0547759532928467, "rewards/margins": 1.4181969165802002, "rewards/rejected": -3.472972869873047, "step": 3360 }, { "epoch": 0.8085412667946257, "grad_norm": 13.694316296216712, "learning_rate": 5.378911030565453e-08, "logits/chosen": -0.44851231575012207, "logits/rejected": -0.4383707046508789, "logps/chosen": -530.1419677734375, "logps/rejected": -675.80224609375, "loss": 0.4451, "rewards/accuracies": 0.75, "rewards/chosen": -2.4019455909729004, "rewards/margins": 1.1452901363372803, "rewards/rejected": -3.5472354888916016, "step": 3370 }, { "epoch": 0.8109404990403071, "grad_norm": 10.528600910072944, "learning_rate": 5.249846922444101e-08, "logits/chosen": -0.5667535066604614, "logits/rejected": -0.5957349538803101, "logps/chosen": -412.9205017089844, "logps/rejected": -626.1097412109375, "loss": 0.4203, "rewards/accuracies": 0.875, "rewards/chosen": -2.0233278274536133, "rewards/margins": 2.25299334526062, "rewards/rejected": -4.276320934295654, "step": 3380 }, { "epoch": 0.8133397312859885, "grad_norm": 14.951755734068287, "learning_rate": 5.122168209094865e-08, "logits/chosen": -0.49277129769325256, "logits/rejected": -0.5049806833267212, "logps/chosen": -425.42291259765625, "logps/rejected": -525.0523071289062, "loss": 0.4278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1365087032318115, "rewards/margins": 1.0094716548919678, "rewards/rejected": -3.1459803581237793, "step": 3390 }, { "epoch": 0.8157389635316699, "grad_norm": 10.733513837258595, "learning_rate": 4.995883846672222e-08, "logits/chosen": -0.5534166693687439, "logits/rejected": -0.5597847700119019, "logps/chosen": -573.193115234375, "logps/rejected": -633.9830322265625, "loss": 0.4259, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0612425804138184, "rewards/margins": 1.392027735710144, "rewards/rejected": -3.453270435333252, "step": 3400 }, { "epoch": 0.8181381957773513, "grad_norm": 14.785275906962475, "learning_rate": 4.871002693522486e-08, "logits/chosen": -0.601963222026825, "logits/rejected": -0.6202664971351624, "logps/chosen": -458.74700927734375, "logps/rejected": -539.7118530273438, "loss": 0.4571, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.000706434249878, "rewards/margins": 1.0372394323349, "rewards/rejected": -3.0379462242126465, "step": 3410 }, { "epoch": 0.8205374280230326, "grad_norm": 10.774672959999748, "learning_rate": 4.7475335095623956e-08, "logits/chosen": -0.5421626567840576, "logits/rejected": -0.5486319661140442, "logps/chosen": -448.543212890625, "logps/rejected": -592.2151489257812, "loss": 0.4437, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.940687894821167, "rewards/margins": 1.5618858337402344, "rewards/rejected": -3.5025742053985596, "step": 3420 }, { "epoch": 0.822936660268714, "grad_norm": 18.59487101118735, "learning_rate": 4.6254849556646714e-08, "logits/chosen": -0.4963017404079437, "logits/rejected": -0.4984667897224426, "logps/chosen": -487.8688049316406, "logps/rejected": -633.2571411132812, "loss": 0.4242, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9285959005355835, "rewards/margins": 1.74213445186615, "rewards/rejected": -3.6707305908203125, "step": 3430 }, { "epoch": 0.8253358925143954, "grad_norm": 14.44038111770707, "learning_rate": 4.504865593050483e-08, "logits/chosen": -0.5637535452842712, "logits/rejected": -0.5861309170722961, "logps/chosen": -481.045654296875, "logps/rejected": -599.7122192382812, "loss": 0.4611, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.163588047027588, "rewards/margins": 1.1554136276245117, "rewards/rejected": -3.3190014362335205, "step": 3440 }, { "epoch": 0.8277351247600768, "grad_norm": 15.606897326164194, "learning_rate": 4.385683882688895e-08, "logits/chosen": -0.5581148862838745, "logits/rejected": -0.5503061413764954, "logps/chosen": -502.58038330078125, "logps/rejected": -550.5048828125, "loss": 0.5052, "rewards/accuracies": 0.75, "rewards/chosen": -2.2417547702789307, "rewards/margins": 0.969860851764679, "rewards/rejected": -3.2116153240203857, "step": 3450 }, { "epoch": 0.8301343570057581, "grad_norm": 14.412578218384434, "learning_rate": 4.2679481847033985e-08, "logits/chosen": -0.5228904485702515, "logits/rejected": -0.5363970994949341, "logps/chosen": -470.89544677734375, "logps/rejected": -635.0361328125, "loss": 0.4608, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.150740623474121, "rewards/margins": 1.5161142349243164, "rewards/rejected": -3.6668548583984375, "step": 3460 }, { "epoch": 0.8325335892514395, "grad_norm": 12.138527821387553, "learning_rate": 4.151666757785435e-08, "logits/chosen": -0.6146914958953857, "logits/rejected": -0.6073625087738037, "logps/chosen": -406.7055969238281, "logps/rejected": -636.0123291015625, "loss": 0.4199, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6833385229110718, "rewards/margins": 2.205997943878174, "rewards/rejected": -3.889336347579956, "step": 3470 }, { "epoch": 0.8349328214971209, "grad_norm": 14.790830386193752, "learning_rate": 4.036847758615136e-08, "logits/chosen": -0.4863740801811218, "logits/rejected": -0.5530039668083191, "logps/chosen": -498.92742919921875, "logps/rejected": -627.8193359375, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": -2.640145778656006, "rewards/margins": 1.157325029373169, "rewards/rejected": -3.797470808029175, "step": 3480 }, { "epoch": 0.8373320537428023, "grad_norm": 10.798032597651305, "learning_rate": 3.923499241289113e-08, "logits/chosen": -0.6010452508926392, "logits/rejected": -0.6536823511123657, "logps/chosen": -528.5853881835938, "logps/rejected": -596.8575439453125, "loss": 0.4873, "rewards/accuracies": 0.75, "rewards/chosen": -2.2441864013671875, "rewards/margins": 1.2915928363800049, "rewards/rejected": -3.5357794761657715, "step": 3490 }, { "epoch": 0.8397312859884837, "grad_norm": 10.865503254373744, "learning_rate": 3.811629156755541e-08, "logits/chosen": -0.5816788077354431, "logits/rejected": -0.6040675640106201, "logps/chosen": -496.4483947753906, "logps/rejected": -629.3803100585938, "loss": 0.4407, "rewards/accuracies": 0.75, "rewards/chosen": -2.0880205631256104, "rewards/margins": 1.3838069438934326, "rewards/rejected": -3.471827268600464, "step": 3500 }, { "epoch": 0.8421305182341651, "grad_norm": 11.278652918262004, "learning_rate": 3.701245352256391e-08, "logits/chosen": -0.5675192475318909, "logits/rejected": -0.6156761050224304, "logps/chosen": -493.8138732910156, "logps/rejected": -581.7841796875, "loss": 0.449, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.045945882797241, "rewards/margins": 1.155053973197937, "rewards/rejected": -3.2009997367858887, "step": 3510 }, { "epoch": 0.8445297504798465, "grad_norm": 13.271574282231718, "learning_rate": 3.592355570776984e-08, "logits/chosen": -0.6514331102371216, "logits/rejected": -0.652426540851593, "logps/chosen": -390.1234436035156, "logps/rejected": -560.892822265625, "loss": 0.4027, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7292102575302124, "rewards/margins": 1.5589096546173096, "rewards/rejected": -3.2881197929382324, "step": 3520 }, { "epoch": 0.8469289827255279, "grad_norm": 12.264631757796657, "learning_rate": 3.484967450502904e-08, "logits/chosen": -0.5596613883972168, "logits/rejected": -0.5722562670707703, "logps/chosen": -371.233154296875, "logps/rejected": -591.3458862304688, "loss": 0.4278, "rewards/accuracies": 0.875, "rewards/chosen": -1.6561084985733032, "rewards/margins": 1.7522211074829102, "rewards/rejected": -3.408329725265503, "step": 3530 }, { "epoch": 0.8493282149712092, "grad_norm": 21.707760756412107, "learning_rate": 3.3790885242841296e-08, "logits/chosen": -0.5783206224441528, "logits/rejected": -0.6104044318199158, "logps/chosen": -453.5904235839844, "logps/rejected": -656.5443115234375, "loss": 0.4022, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1034021377563477, "rewards/margins": 2.0261425971984863, "rewards/rejected": -4.129544258117676, "step": 3540 }, { "epoch": 0.8517274472168906, "grad_norm": 17.547573796246525, "learning_rate": 3.274726219106677e-08, "logits/chosen": -0.5989304780960083, "logits/rejected": -0.6391880512237549, "logps/chosen": -502.286865234375, "logps/rejected": -645.5471801757812, "loss": 0.465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.106433868408203, "rewards/margins": 1.479160189628601, "rewards/rejected": -3.5855941772460938, "step": 3550 }, { "epoch": 0.8541266794625719, "grad_norm": 12.613105176272517, "learning_rate": 3.171887855571642e-08, "logits/chosen": -0.5413884520530701, "logits/rejected": -0.4987201690673828, "logps/chosen": -401.0578918457031, "logps/rejected": -514.5476684570312, "loss": 0.4373, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.801924467086792, "rewards/margins": 1.2051035165786743, "rewards/rejected": -3.007028102874756, "step": 3560 }, { "epoch": 0.8565259117082533, "grad_norm": 14.359058138666166, "learning_rate": 3.070580647381643e-08, "logits/chosen": -0.5530554056167603, "logits/rejected": -0.5961068272590637, "logps/chosen": -413.8770446777344, "logps/rejected": -572.0676879882812, "loss": 0.4628, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.856571912765503, "rewards/margins": 1.5779253244400024, "rewards/rejected": -3.434497356414795, "step": 3570 }, { "epoch": 0.8589251439539347, "grad_norm": 13.165867775542413, "learning_rate": 2.9708117008348576e-08, "logits/chosen": -0.5645931959152222, "logits/rejected": -0.5597985982894897, "logps/chosen": -502.2027893066406, "logps/rejected": -563.4842529296875, "loss": 0.4342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9971221685409546, "rewards/margins": 1.1243679523468018, "rewards/rejected": -3.121490240097046, "step": 3580 }, { "epoch": 0.8613243761996161, "grad_norm": 12.288962805449156, "learning_rate": 2.8725880143264992e-08, "logits/chosen": -0.6008769869804382, "logits/rejected": -0.6012517213821411, "logps/chosen": -473.57110595703125, "logps/rejected": -637.85546875, "loss": 0.4957, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3291287422180176, "rewards/margins": 1.2044109106063843, "rewards/rejected": -3.5335395336151123, "step": 3590 }, { "epoch": 0.8637236084452975, "grad_norm": 20.135961770884553, "learning_rate": 2.775916477857948e-08, "logits/chosen": -0.5295973420143127, "logits/rejected": -0.5415645837783813, "logps/chosen": -432.3699645996094, "logps/rejected": -547.469970703125, "loss": 0.4302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.17122220993042, "rewards/margins": 1.1380698680877686, "rewards/rejected": -3.3092925548553467, "step": 3600 }, { "epoch": 0.8661228406909789, "grad_norm": 14.482337367851747, "learning_rate": 2.680803872553408e-08, "logits/chosen": -0.5756790637969971, "logits/rejected": -0.651614248752594, "logps/chosen": -415.51275634765625, "logps/rejected": -632.2459716796875, "loss": 0.4528, "rewards/accuracies": 0.875, "rewards/chosen": -1.7809759378433228, "rewards/margins": 2.2642104625701904, "rewards/rejected": -4.045186519622803, "step": 3610 }, { "epoch": 0.8685220729366603, "grad_norm": 17.202988883981018, "learning_rate": 2.5872568701842706e-08, "logits/chosen": -0.5497530698776245, "logits/rejected": -0.566763162612915, "logps/chosen": -387.01171875, "logps/rejected": -564.3225708007812, "loss": 0.4966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8862674236297607, "rewards/margins": 1.5258787870407104, "rewards/rejected": -3.4121460914611816, "step": 3620 }, { "epoch": 0.8709213051823417, "grad_norm": 16.587596939503936, "learning_rate": 2.495282032701096e-08, "logits/chosen": -0.5850919485092163, "logits/rejected": -0.6418455839157104, "logps/chosen": -336.73431396484375, "logps/rejected": -482.5956115722656, "loss": 0.4306, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7132467031478882, "rewards/margins": 1.6492410898208618, "rewards/rejected": -3.36248779296875, "step": 3630 }, { "epoch": 0.8733205374280231, "grad_norm": 17.480004730447185, "learning_rate": 2.4048858117733133e-08, "logits/chosen": -0.6369383931159973, "logits/rejected": -0.6253448724746704, "logps/chosen": -445.79388427734375, "logps/rejected": -605.3549194335938, "loss": 0.4271, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9298584461212158, "rewards/margins": 2.020317554473877, "rewards/rejected": -3.9501757621765137, "step": 3640 }, { "epoch": 0.8757197696737045, "grad_norm": 15.836939219932262, "learning_rate": 2.3160745483366938e-08, "logits/chosen": -0.5421168208122253, "logits/rejected": -0.5399103760719299, "logps/chosen": -435.59417724609375, "logps/rejected": -604.5552978515625, "loss": 0.432, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.094374179840088, "rewards/margins": 1.313892126083374, "rewards/rejected": -3.408266067504883, "step": 3650 }, { "epoch": 0.8781190019193857, "grad_norm": 15.232342004495118, "learning_rate": 2.2288544721485197e-08, "logits/chosen": -0.6265963912010193, "logits/rejected": -0.6633044481277466, "logps/chosen": -367.7839660644531, "logps/rejected": -573.351318359375, "loss": 0.4105, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6053600311279297, "rewards/margins": 1.8461459875106812, "rewards/rejected": -3.4515061378479004, "step": 3660 }, { "epoch": 0.8805182341650671, "grad_norm": 14.172769513689536, "learning_rate": 2.1432317013506117e-08, "logits/chosen": -0.65406334400177, "logits/rejected": -0.6587377190589905, "logps/chosen": -456.6194763183594, "logps/rejected": -549.1373291015625, "loss": 0.4751, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.102041721343994, "rewards/margins": 1.3290612697601318, "rewards/rejected": -3.431102752685547, "step": 3670 }, { "epoch": 0.8829174664107485, "grad_norm": 14.701070733746729, "learning_rate": 2.0592122420401704e-08, "logits/chosen": -0.4700722098350525, "logits/rejected": -0.46845799684524536, "logps/chosen": -396.937255859375, "logps/rejected": -518.0514526367188, "loss": 0.4567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.834855318069458, "rewards/margins": 1.0623095035552979, "rewards/rejected": -2.8971645832061768, "step": 3680 }, { "epoch": 0.8853166986564299, "grad_norm": 12.846048547846815, "learning_rate": 1.976801987848459e-08, "logits/chosen": -0.6387890577316284, "logits/rejected": -0.6454359292984009, "logps/chosen": -442.06005859375, "logps/rejected": -623.1151733398438, "loss": 0.4364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8863179683685303, "rewards/margins": 1.654547095298767, "rewards/rejected": -3.540865421295166, "step": 3690 }, { "epoch": 0.8877159309021113, "grad_norm": 13.872418259324444, "learning_rate": 1.8960067195273987e-08, "logits/chosen": -0.6006834506988525, "logits/rejected": -0.6487979888916016, "logps/chosen": -395.2511901855469, "logps/rejected": -565.4385986328125, "loss": 0.4271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9122810363769531, "rewards/margins": 1.7357158660888672, "rewards/rejected": -3.647996425628662, "step": 3700 }, { "epoch": 0.8901151631477927, "grad_norm": 14.265685542528358, "learning_rate": 1.816832104544072e-08, "logits/chosen": -0.47106099128723145, "logits/rejected": -0.5008233189582825, "logps/chosen": -471.165283203125, "logps/rejected": -570.4010620117188, "loss": 0.4474, "rewards/accuracies": 0.75, "rewards/chosen": -2.060793876647949, "rewards/margins": 1.2439154386520386, "rewards/rejected": -3.3047091960906982, "step": 3710 }, { "epoch": 0.8925143953934741, "grad_norm": 12.43063163239937, "learning_rate": 1.7392836966831553e-08, "logits/chosen": -0.5043891668319702, "logits/rejected": -0.541654646396637, "logps/chosen": -437.30902099609375, "logps/rejected": -612.6980590820312, "loss": 0.4117, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7850959300994873, "rewards/margins": 2.040393352508545, "rewards/rejected": -3.825489044189453, "step": 3720 }, { "epoch": 0.8949136276391555, "grad_norm": 15.428500593517235, "learning_rate": 1.663366935657373e-08, "logits/chosen": -0.6029775738716125, "logits/rejected": -0.6049376726150513, "logps/chosen": -399.92132568359375, "logps/rejected": -564.7398681640625, "loss": 0.4785, "rewards/accuracies": 0.75, "rewards/chosen": -1.9421736001968384, "rewards/margins": 1.540740966796875, "rewards/rejected": -3.482914447784424, "step": 3730 }, { "epoch": 0.8973128598848369, "grad_norm": 15.901745967879851, "learning_rate": 1.5890871467258898e-08, "logits/chosen": -0.5083228945732117, "logits/rejected": -0.5067955851554871, "logps/chosen": -519.1453857421875, "logps/rejected": -613.5746459960938, "loss": 0.4358, "rewards/accuracies": 0.75, "rewards/chosen": -2.022681474685669, "rewards/margins": 1.2650549411773682, "rewards/rejected": -3.287736415863037, "step": 3740 }, { "epoch": 0.8997120921305183, "grad_norm": 12.288479611102789, "learning_rate": 1.5164495403207967e-08, "logits/chosen": -0.6138381958007812, "logits/rejected": -0.648627758026123, "logps/chosen": -492.9188537597656, "logps/rejected": -688.1244506835938, "loss": 0.4429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.270721673965454, "rewards/margins": 1.671618103981018, "rewards/rejected": -3.9423396587371826, "step": 3750 }, { "epoch": 0.9021113243761996, "grad_norm": 13.881705443551352, "learning_rate": 1.4454592116815962e-08, "logits/chosen": -0.5159580707550049, "logits/rejected": -0.5592847466468811, "logps/chosen": -446.95257568359375, "logps/rejected": -608.6639404296875, "loss": 0.4088, "rewards/accuracies": 0.75, "rewards/chosen": -1.826229453086853, "rewards/margins": 1.4294321537017822, "rewards/rejected": -3.255661725997925, "step": 3760 }, { "epoch": 0.904510556621881, "grad_norm": 9.457402626337869, "learning_rate": 1.3761211404977934e-08, "logits/chosen": -0.6075922250747681, "logits/rejected": -0.6064502596855164, "logps/chosen": -420.793701171875, "logps/rejected": -630.1043701171875, "loss": 0.3875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.974437952041626, "rewards/margins": 2.114816427230835, "rewards/rejected": -4.089253902435303, "step": 3770 }, { "epoch": 0.9069097888675623, "grad_norm": 15.273283970687197, "learning_rate": 1.3084401905596177e-08, "logits/chosen": -0.5783820152282715, "logits/rejected": -0.640872597694397, "logps/chosen": -474.72100830078125, "logps/rejected": -551.1171875, "loss": 0.4556, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9363771677017212, "rewards/margins": 1.1951727867126465, "rewards/rejected": -3.1315500736236572, "step": 3780 }, { "epoch": 0.9093090211132437, "grad_norm": 15.207248331238338, "learning_rate": 1.2424211094168053e-08, "logits/chosen": -0.44624510407447815, "logits/rejected": -0.45766526460647583, "logps/chosen": -504.5450134277344, "logps/rejected": -639.1659545898438, "loss": 0.4296, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8555368185043335, "rewards/margins": 1.4501540660858154, "rewards/rejected": -3.3056907653808594, "step": 3790 }, { "epoch": 0.9117082533589251, "grad_norm": 12.766030520120715, "learning_rate": 1.1780685280456143e-08, "logits/chosen": -0.561526894569397, "logits/rejected": -0.5872783064842224, "logps/chosen": -519.5202026367188, "logps/rejected": -713.41357421875, "loss": 0.4758, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.273921251296997, "rewards/margins": 1.8192304372787476, "rewards/rejected": -4.093151569366455, "step": 3800 }, { "epoch": 0.9141074856046065, "grad_norm": 14.582108396435707, "learning_rate": 1.1153869605239564e-08, "logits/chosen": -0.5614346265792847, "logits/rejected": -0.5825585722923279, "logps/chosen": -472.6640625, "logps/rejected": -527.7772216796875, "loss": 0.441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9882261753082275, "rewards/margins": 0.9708881378173828, "rewards/rejected": -2.9591145515441895, "step": 3810 }, { "epoch": 0.9165067178502879, "grad_norm": 13.50064006592574, "learning_rate": 1.0543808037147606e-08, "logits/chosen": -0.6324799060821533, "logits/rejected": -0.6540195345878601, "logps/chosen": -440.49188232421875, "logps/rejected": -644.3251342773438, "loss": 0.4312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.86978018283844, "rewards/margins": 2.010561227798462, "rewards/rejected": -3.8803412914276123, "step": 3820 }, { "epoch": 0.9189059500959693, "grad_norm": 10.258131441990278, "learning_rate": 9.95054336957557e-09, "logits/chosen": -0.5800519585609436, "logits/rejected": -0.6037092208862305, "logps/chosen": -437.89251708984375, "logps/rejected": -572.6655883789062, "loss": 0.3879, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8630110025405884, "rewards/margins": 1.2645984888076782, "rewards/rejected": -3.1276094913482666, "step": 3830 }, { "epoch": 0.9213051823416507, "grad_norm": 14.86589440455775, "learning_rate": 9.37411721768286e-09, "logits/chosen": -0.530808687210083, "logits/rejected": -0.5798245668411255, "logps/chosen": -473.449951171875, "logps/rejected": -675.644775390625, "loss": 0.4163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.131751298904419, "rewards/margins": 1.583280324935913, "rewards/rejected": -3.715031385421753, "step": 3840 }, { "epoch": 0.9237044145873321, "grad_norm": 13.05727695798275, "learning_rate": 8.81457001547392e-09, "logits/chosen": -0.5503281354904175, "logits/rejected": -0.533139169216156, "logps/chosen": -491.100341796875, "logps/rejected": -626.0603637695312, "loss": 0.4468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2812390327453613, "rewards/margins": 1.2485569715499878, "rewards/rejected": -3.5297958850860596, "step": 3850 }, { "epoch": 0.9261036468330134, "grad_norm": 14.54576174319674, "learning_rate": 8.271941012961942e-09, "logits/chosen": -0.48925477266311646, "logits/rejected": -0.4920194149017334, "logps/chosen": -403.09722900390625, "logps/rejected": -654.4439086914062, "loss": 0.4432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.928261399269104, "rewards/margins": 1.9341392517089844, "rewards/rejected": -3.862400531768799, "step": 3860 }, { "epoch": 0.9285028790786948, "grad_norm": 13.945738623677995, "learning_rate": 7.746268273415568e-09, "logits/chosen": -0.5374631285667419, "logits/rejected": -0.5382106900215149, "logps/chosen": -466.7080993652344, "logps/rejected": -586.3713989257812, "loss": 0.4421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0114636421203613, "rewards/margins": 0.7434648871421814, "rewards/rejected": -2.7549285888671875, "step": 3870 }, { "epoch": 0.9309021113243762, "grad_norm": 12.078781330743034, "learning_rate": 7.237588670689076e-09, "logits/chosen": -0.6346238255500793, "logits/rejected": -0.696995198726654, "logps/chosen": -439.1622619628906, "logps/rejected": -595.7288818359375, "loss": 0.4174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9709396362304688, "rewards/margins": 1.8637230396270752, "rewards/rejected": -3.834662675857544, "step": 3880 }, { "epoch": 0.9333013435700576, "grad_norm": 12.482916834754162, "learning_rate": 6.745937886635606e-09, "logits/chosen": -0.6023680567741394, "logits/rejected": -0.597190797328949, "logps/chosen": -462.8761291503906, "logps/rejected": -663.2772827148438, "loss": 0.42, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8327722549438477, "rewards/margins": 1.956402063369751, "rewards/rejected": -3.7891743183135986, "step": 3890 }, { "epoch": 0.935700575815739, "grad_norm": 14.24690648354876, "learning_rate": 6.271350408604409e-09, "logits/chosen": -0.5814956426620483, "logits/rejected": -0.5836547613143921, "logps/chosen": -371.41156005859375, "logps/rejected": -554.6315307617188, "loss": 0.419, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.471068024635315, "rewards/margins": 1.6207103729248047, "rewards/rejected": -3.09177827835083, "step": 3900 }, { "epoch": 0.9380998080614203, "grad_norm": 13.651182373973166, "learning_rate": 5.813859527021487e-09, "logits/chosen": -0.5507432222366333, "logits/rejected": -0.5853307843208313, "logps/chosen": -442.532470703125, "logps/rejected": -599.6609497070312, "loss": 0.4181, "rewards/accuracies": 0.875, "rewards/chosen": -1.9738194942474365, "rewards/margins": 1.8410329818725586, "rewards/rejected": -3.814852476119995, "step": 3910 }, { "epoch": 0.9404990403071017, "grad_norm": 12.561972537735913, "learning_rate": 5.373497333054616e-09, "logits/chosen": -0.6120710968971252, "logits/rejected": -0.620995044708252, "logps/chosen": -499.99139404296875, "logps/rejected": -590.181396484375, "loss": 0.4722, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.298414945602417, "rewards/margins": 0.9743332862854004, "rewards/rejected": -3.2727482318878174, "step": 3920 }, { "epoch": 0.9428982725527831, "grad_norm": 16.011643842899986, "learning_rate": 4.950294716362213e-09, "logits/chosen": -0.5541747212409973, "logits/rejected": -0.5821543335914612, "logps/chosen": -511.50970458984375, "logps/rejected": -614.6409301757812, "loss": 0.4314, "rewards/accuracies": 0.625, "rewards/chosen": -2.2209866046905518, "rewards/margins": 1.0603018999099731, "rewards/rejected": -3.2812886238098145, "step": 3930 }, { "epoch": 0.9452975047984645, "grad_norm": 10.553547990487088, "learning_rate": 4.544281362926422e-09, "logits/chosen": -0.6090785264968872, "logits/rejected": -0.6422208547592163, "logps/chosen": -493.9991760253906, "logps/rejected": -632.8602294921875, "loss": 0.4246, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9454662799835205, "rewards/margins": 1.474827527999878, "rewards/rejected": -3.4202942848205566, "step": 3940 }, { "epoch": 0.9476967370441459, "grad_norm": 12.9216379203511, "learning_rate": 4.15548575297095e-09, "logits/chosen": -0.6255580186843872, "logits/rejected": -0.6386123895645142, "logps/chosen": -431.75634765625, "logps/rejected": -608.4300537109375, "loss": 0.4087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9927335977554321, "rewards/margins": 1.8527963161468506, "rewards/rejected": -3.8455300331115723, "step": 3950 }, { "epoch": 0.9500959692898272, "grad_norm": 9.979083154735912, "learning_rate": 3.7839351589631366e-09, "logits/chosen": -0.5646272301673889, "logits/rejected": -0.5582197308540344, "logps/chosen": -406.14178466796875, "logps/rejected": -604.2752075195312, "loss": 0.415, "rewards/accuracies": 0.875, "rewards/chosen": -1.8224601745605469, "rewards/margins": 1.4527934789657593, "rewards/rejected": -3.275254011154175, "step": 3960 }, { "epoch": 0.9524952015355086, "grad_norm": 14.72280065382829, "learning_rate": 3.4296556437010405e-09, "logits/chosen": -0.6492162942886353, "logits/rejected": -0.6573163866996765, "logps/chosen": -392.5443420410156, "logps/rejected": -550.0157470703125, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9494428634643555, "rewards/margins": 1.5496604442596436, "rewards/rejected": -3.499103546142578, "step": 3970 }, { "epoch": 0.95489443378119, "grad_norm": 19.52331839763403, "learning_rate": 3.092672058485124e-09, "logits/chosen": -0.5937837362289429, "logits/rejected": -0.5867224931716919, "logps/chosen": -420.0135192871094, "logps/rejected": -637.5535888671875, "loss": 0.4764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9328027963638306, "rewards/margins": 1.9535869359970093, "rewards/rejected": -3.886389970779419, "step": 3980 }, { "epoch": 0.9572936660268714, "grad_norm": 15.465373415357526, "learning_rate": 2.7730080413750356e-09, "logits/chosen": -0.5144689083099365, "logits/rejected": -0.5366243720054626, "logps/chosen": -462.44580078125, "logps/rejected": -614.26904296875, "loss": 0.4304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9767814874649048, "rewards/margins": 1.4872825145721436, "rewards/rejected": -3.464064121246338, "step": 3990 }, { "epoch": 0.9596928982725528, "grad_norm": 12.554501395226785, "learning_rate": 2.4706860155316033e-09, "logits/chosen": -0.5792466402053833, "logits/rejected": -0.5846326947212219, "logps/chosen": -533.231201171875, "logps/rejected": -670.0733642578125, "loss": 0.46, "rewards/accuracies": 0.75, "rewards/chosen": -2.0717945098876953, "rewards/margins": 1.349689245223999, "rewards/rejected": -3.4214844703674316, "step": 4000 }, { "epoch": 0.9596928982725528, "eval_logits/chosen": -0.5574566721916199, "eval_logits/rejected": -0.5789428949356079, "eval_logps/chosen": -453.795654296875, "eval_logps/rejected": -623.4196166992188, "eval_loss": 0.42671090364456177, "eval_rewards/accuracies": 0.8446428775787354, "eval_rewards/chosen": -2.0115108489990234, "eval_rewards/margins": 1.6149202585220337, "eval_rewards/rejected": -3.6264309883117676, "eval_runtime": 208.7971, "eval_samples_per_second": 21.365, "eval_steps_per_second": 0.335, "step": 4000 }, { "epoch": 0.9620921305182342, "grad_norm": 14.306928956925486, "learning_rate": 2.185727187643843e-09, "logits/chosen": -0.5993860363960266, "logits/rejected": -0.6200038194656372, "logps/chosen": -403.6116943359375, "logps/rejected": -601.7921752929688, "loss": 0.4649, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9371687173843384, "rewards/margins": 1.8979822397232056, "rewards/rejected": -3.835150957107544, "step": 4010 }, { "epoch": 0.9644913627639156, "grad_norm": 16.135414053750264, "learning_rate": 1.9181515464413434e-09, "logits/chosen": -0.5625258088111877, "logits/rejected": -0.5911010503768921, "logps/chosen": -544.6771240234375, "logps/rejected": -725.96142578125, "loss": 0.3909, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8679109811782837, "rewards/margins": 1.8002150058746338, "rewards/rejected": -3.668125867843628, "step": 4020 }, { "epoch": 0.966890595009597, "grad_norm": 15.134608783297615, "learning_rate": 1.6679778612923302e-09, "logits/chosen": -0.5708626508712769, "logits/rejected": -0.5663528442382812, "logps/chosen": -496.3414001464844, "logps/rejected": -607.8493041992188, "loss": 0.403, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.051764965057373, "rewards/margins": 1.1176114082336426, "rewards/rejected": -3.1693766117095947, "step": 4030 }, { "epoch": 0.9692898272552783, "grad_norm": 15.867680764731526, "learning_rate": 1.43522368088686e-09, "logits/chosen": -0.5292009115219116, "logits/rejected": -0.5536502003669739, "logps/chosen": -493.24053955078125, "logps/rejected": -687.3652954101562, "loss": 0.4896, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3322641849517822, "rewards/margins": 1.9033256769180298, "rewards/rejected": -4.235589504241943, "step": 4040 }, { "epoch": 0.9716890595009597, "grad_norm": 18.145967539331632, "learning_rate": 1.2199053320059993e-09, "logits/chosen": -0.5495906472206116, "logits/rejected": -0.5624712109565735, "logps/chosen": -464.6747131347656, "logps/rejected": -626.3593139648438, "loss": 0.4444, "rewards/accuracies": 0.875, "rewards/chosen": -1.9679105281829834, "rewards/margins": 1.4822068214416504, "rewards/rejected": -3.4501171112060547, "step": 4050 }, { "epoch": 0.974088291746641, "grad_norm": 11.736148609957327, "learning_rate": 1.0220379183764338e-09, "logits/chosen": -0.6397042870521545, "logits/rejected": -0.6401645541191101, "logps/chosen": -382.80255126953125, "logps/rejected": -585.4246826171875, "loss": 0.4335, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8114726543426514, "rewards/margins": 1.9243446588516235, "rewards/rejected": -3.7358174324035645, "step": 4060 }, { "epoch": 0.9764875239923224, "grad_norm": 13.40993494403174, "learning_rate": 8.416353196111503e-10, "logits/chosen": -0.531282901763916, "logits/rejected": -0.5239174365997314, "logps/chosen": -447.2433166503906, "logps/rejected": -588.00341796875, "loss": 0.4917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.130341053009033, "rewards/margins": 1.5175530910491943, "rewards/rejected": -3.6478943824768066, "step": 4070 }, { "epoch": 0.9788867562380038, "grad_norm": 14.008572116854983, "learning_rate": 6.787101902356873e-10, "logits/chosen": -0.5458533763885498, "logits/rejected": -0.530588686466217, "logps/chosen": -478.8675842285156, "logps/rejected": -640.27783203125, "loss": 0.4184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1983494758605957, "rewards/margins": 1.3467283248901367, "rewards/rejected": -3.5450775623321533, "step": 4080 }, { "epoch": 0.9812859884836852, "grad_norm": 17.14797402633899, "learning_rate": 5.332739588005953e-10, "logits/chosen": -0.6215013861656189, "logits/rejected": -0.6441851854324341, "logps/chosen": -390.38421630859375, "logps/rejected": -607.4275512695312, "loss": 0.4348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.911041498184204, "rewards/margins": 1.8259137868881226, "rewards/rejected": -3.736954927444458, "step": 4090 }, { "epoch": 0.9836852207293666, "grad_norm": 14.786463692836161, "learning_rate": 4.053368270797164e-10, "logits/chosen": -0.5029186010360718, "logits/rejected": -0.5265758037567139, "logps/chosen": -459.8993225097656, "logps/rejected": -595.8419189453125, "loss": 0.4479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2223143577575684, "rewards/margins": 1.3676447868347168, "rewards/rejected": -3.589958667755127, "step": 4100 }, { "epoch": 0.986084452975048, "grad_norm": 10.752700482691958, "learning_rate": 2.949077693545354e-10, "logits/chosen": -0.46297192573547363, "logits/rejected": -0.522523045539856, "logps/chosen": -502.4552307128906, "logps/rejected": -644.851318359375, "loss": 0.4713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2377562522888184, "rewards/margins": 1.0717887878417969, "rewards/rejected": -3.3095450401306152, "step": 4110 }, { "epoch": 0.9884836852207294, "grad_norm": 13.109993080936734, "learning_rate": 2.0199453178471047e-10, "logits/chosen": -0.5104750394821167, "logits/rejected": -0.5807961225509644, "logps/chosen": -529.2632446289062, "logps/rejected": -605.51416015625, "loss": 0.4171, "rewards/accuracies": 0.875, "rewards/chosen": -2.1927380561828613, "rewards/margins": 1.1302746534347534, "rewards/rejected": -3.323012590408325, "step": 4120 }, { "epoch": 0.9908829174664108, "grad_norm": 14.68142360061518, "learning_rate": 1.266036318647301e-10, "logits/chosen": -0.5655652284622192, "logits/rejected": -0.5766940116882324, "logps/chosen": -503.4109802246094, "logps/rejected": -674.8923950195312, "loss": 0.4159, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8927507400512695, "rewards/margins": 1.9182952642440796, "rewards/rejected": -3.8110461235046387, "step": 4130 }, { "epoch": 0.9932821497120922, "grad_norm": 16.922904240171835, "learning_rate": 6.874035796672339e-11, "logits/chosen": -0.5960877537727356, "logits/rejected": -0.5835133194923401, "logps/chosen": -456.58172607421875, "logps/rejected": -612.15087890625, "loss": 0.4225, "rewards/accuracies": 0.875, "rewards/chosen": -1.7158613204956055, "rewards/margins": 2.088901996612549, "rewards/rejected": -3.8047633171081543, "step": 4140 }, { "epoch": 0.9956813819577736, "grad_norm": 16.596879517738124, "learning_rate": 2.8408768969423458e-11, "logits/chosen": -0.6005167961120605, "logits/rejected": -0.6138831377029419, "logps/chosen": -461.8202209472656, "logps/rejected": -612.9940185546875, "loss": 0.4126, "rewards/accuracies": 0.75, "rewards/chosen": -1.8562465906143188, "rewards/margins": 1.3878483772277832, "rewards/rejected": -3.2440948486328125, "step": 4150 }, { "epoch": 0.9980806142034548, "grad_norm": 15.505557355450042, "learning_rate": 5.611693973617271e-12, "logits/chosen": -0.5526952743530273, "logits/rejected": -0.5355725288391113, "logps/chosen": -412.329345703125, "logps/rejected": -580.66357421875, "loss": 0.4472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9485257863998413, "rewards/margins": 1.4926297664642334, "rewards/rejected": -3.4411556720733643, "step": 4160 }, { "epoch": 1.0, "step": 4168, "total_flos": 0.0, "train_loss": 0.49609584714538074, "train_runtime": 16148.8615, "train_samples_per_second": 8.259, "train_steps_per_second": 0.258 } ], "logging_steps": 10, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }