diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 4168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002399232245681382, + "grad_norm": 4.7795046499064915, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -0.7570170760154724, + "logits/rejected": -0.7606267929077148, + "logps/chosen": -147.62075805664062, + "logps/rejected": -139.63986206054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0023992322456813818, + "grad_norm": 5.0729607226586175, + "learning_rate": 1.199040767386091e-08, + "logits/chosen": -0.7337759137153625, + "logits/rejected": -0.8291671872138977, + "logps/chosen": -372.46026611328125, + "logps/rejected": -298.1966247558594, + "loss": 0.693, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.0005176405538804829, + "rewards/margins": 0.00119220616761595, + "rewards/rejected": -0.0006745656137354672, + "step": 10 + }, + { + "epoch": 0.0047984644913627635, + "grad_norm": 5.162819171123915, + "learning_rate": 2.398081534772182e-08, + "logits/chosen": -0.7522455453872681, + "logits/rejected": -0.7984375953674316, + "logps/chosen": -240.97720336914062, + "logps/rejected": -211.13278198242188, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00016315083485096693, + "rewards/margins": 0.0002667726075742394, + "rewards/rejected": -0.00010362181637901813, + "step": 20 + }, + { + "epoch": 0.007197696737044146, + "grad_norm": 4.74337539097734, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -0.7967968583106995, + "logits/rejected": -0.8497036099433899, + "logps/chosen": -252.3729705810547, + "logps/rejected": -261.5249328613281, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0017296618316322565, + "rewards/margins": -0.0007607643492519855, + "rewards/rejected": -0.0009688973659649491, + "step": 30 + }, + { + "epoch": 0.009596928982725527, + "grad_norm": 4.907565292084559, + "learning_rate": 4.796163069544364e-08, + "logits/chosen": -0.8299921154975891, + "logits/rejected": -0.883353054523468, + "logps/chosen": -268.02789306640625, + "logps/rejected": -251.27548217773438, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 6.413871597032994e-05, + "rewards/margins": 0.0001428989926353097, + "rewards/rejected": -7.876028394093737e-05, + "step": 40 + }, + { + "epoch": 0.01199616122840691, + "grad_norm": 5.291199439758451, + "learning_rate": 5.995203836930455e-08, + "logits/chosen": -0.7905577421188354, + "logits/rejected": -0.8132292032241821, + "logps/chosen": -273.465087890625, + "logps/rejected": -236.5275421142578, + "loss": 0.693, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00024220789782702923, + "rewards/margins": -0.0008948832983151078, + "rewards/rejected": 0.0006526754004880786, + "step": 50 + }, + { + "epoch": 0.014395393474088292, + "grad_norm": 5.461587897395331, + "learning_rate": 7.194244604316546e-08, + "logits/chosen": -0.8055087924003601, + "logits/rejected": -0.7774447202682495, + "logps/chosen": -279.95806884765625, + "logps/rejected": -260.2548828125, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0018558722222223878, + "rewards/margins": -0.0015968760708346963, + "rewards/rejected": -0.00025899597676470876, + "step": 60 + }, + { + "epoch": 0.016794625719769675, + "grad_norm": 4.789491285565402, + "learning_rate": 8.393285371702638e-08, + "logits/chosen": -0.6775354743003845, + "logits/rejected": -0.6865079998970032, + "logps/chosen": -284.73492431640625, + "logps/rejected": -268.7757263183594, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0004096252378076315, + "rewards/margins": -0.001045037293806672, + "rewards/rejected": 0.0006354121142067015, + "step": 70 + }, + { + "epoch": 0.019193857965451054, + "grad_norm": 5.22044457856631, + "learning_rate": 9.592326139088728e-08, + "logits/chosen": -0.7918148040771484, + "logits/rejected": -0.6770384907722473, + "logps/chosen": -193.21511840820312, + "logps/rejected": -248.8389892578125, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0001543355901958421, + "rewards/margins": 0.0017128061736002564, + "rewards/rejected": -0.0015584708889946342, + "step": 80 + }, + { + "epoch": 0.021593090211132437, + "grad_norm": 5.0135506626659065, + "learning_rate": 1.0791366906474819e-07, + "logits/chosen": -0.860626220703125, + "logits/rejected": -0.9020501971244812, + "logps/chosen": -332.2583312988281, + "logps/rejected": -287.39312744140625, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -5.847834836458787e-05, + "rewards/margins": 0.00035616609966382384, + "rewards/rejected": -0.00041464445530436933, + "step": 90 + }, + { + "epoch": 0.02399232245681382, + "grad_norm": 5.263212106273348, + "learning_rate": 1.199040767386091e-07, + "logits/chosen": -0.7230492234230042, + "logits/rejected": -0.6537036895751953, + "logps/chosen": -265.91143798828125, + "logps/rejected": -282.36163330078125, + "loss": 0.6927, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00011261176405241713, + "rewards/margins": 0.00016119341307785362, + "rewards/rejected": -0.00027380516985431314, + "step": 100 + }, + { + "epoch": 0.026391554702495202, + "grad_norm": 4.559210233997684, + "learning_rate": 1.3189448441247004e-07, + "logits/chosen": -0.8084124326705933, + "logits/rejected": -0.838187038898468, + "logps/chosen": -228.7566375732422, + "logps/rejected": -229.68017578125, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00025232377811335027, + "rewards/margins": 0.0021261090878397226, + "rewards/rejected": -0.002378433011472225, + "step": 110 + }, + { + "epoch": 0.028790786948176585, + "grad_norm": 4.987801425382141, + "learning_rate": 1.4388489208633092e-07, + "logits/chosen": -0.7631937265396118, + "logits/rejected": -0.8265846967697144, + "logps/chosen": -287.75518798828125, + "logps/rejected": -274.0089111328125, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0009368563769385219, + "rewards/margins": 0.0011595094110816717, + "rewards/rejected": -0.0020963659044355154, + "step": 120 + }, + { + "epoch": 0.031190019193857964, + "grad_norm": 4.539938061260808, + "learning_rate": 1.5587529976019183e-07, + "logits/chosen": -0.8044384717941284, + "logits/rejected": -0.7853862643241882, + "logps/chosen": -208.2334747314453, + "logps/rejected": -308.89727783203125, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0008903613197617233, + "rewards/margins": 0.003986647818237543, + "rewards/rejected": -0.004877009429037571, + "step": 130 + }, + { + "epoch": 0.03358925143953935, + "grad_norm": 4.86277843753179, + "learning_rate": 1.6786570743405277e-07, + "logits/chosen": -0.59493488073349, + "logits/rejected": -0.6423755288124084, + "logps/chosen": -296.8682861328125, + "logps/rejected": -286.8326721191406, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0003669736906886101, + "rewards/margins": 0.006478472147136927, + "rewards/rejected": -0.006845445372164249, + "step": 140 + }, + { + "epoch": 0.03598848368522073, + "grad_norm": 5.140685164747867, + "learning_rate": 1.7985611510791365e-07, + "logits/chosen": -0.7444754838943481, + "logits/rejected": -0.7507014870643616, + "logps/chosen": -225.19686889648438, + "logps/rejected": -223.80783081054688, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0018180795013904572, + "rewards/margins": 0.006333982106298208, + "rewards/rejected": -0.008152060210704803, + "step": 150 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 5.210882534550865, + "learning_rate": 1.9184652278177456e-07, + "logits/chosen": -0.6324438452720642, + "logits/rejected": -0.6643397212028503, + "logps/chosen": -304.49700927734375, + "logps/rejected": -235.69424438476562, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0027639209292829037, + "rewards/margins": 0.008128685876727104, + "rewards/rejected": -0.010892605409026146, + "step": 160 + }, + { + "epoch": 0.040786948176583494, + "grad_norm": 4.6747728490323635, + "learning_rate": 2.038369304556355e-07, + "logits/chosen": -0.6203088164329529, + "logits/rejected": -0.6542561650276184, + "logps/chosen": -337.5506591796875, + "logps/rejected": -324.4564208984375, + "loss": 0.6879, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0021853535436093807, + "rewards/margins": 0.010294707491993904, + "rewards/rejected": -0.012480061501264572, + "step": 170 + }, + { + "epoch": 0.04318618042226487, + "grad_norm": 5.220785522583115, + "learning_rate": 2.1582733812949638e-07, + "logits/chosen": -0.8055013418197632, + "logits/rejected": -0.8089167475700378, + "logps/chosen": -238.187744140625, + "logps/rejected": -231.9917449951172, + "loss": 0.6886, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.004614435136318207, + "rewards/margins": 0.018629100173711777, + "rewards/rejected": -0.023243537172675133, + "step": 180 + }, + { + "epoch": 0.04558541266794626, + "grad_norm": 5.9389729833935325, + "learning_rate": 2.278177458033573e-07, + "logits/chosen": -0.6565154790878296, + "logits/rejected": -0.7033632397651672, + "logps/chosen": -318.5955505371094, + "logps/rejected": -258.2650451660156, + "loss": 0.6882, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.005404843017458916, + "rewards/margins": 0.0013396486174315214, + "rewards/rejected": -0.0067444914020597935, + "step": 190 + }, + { + "epoch": 0.04798464491362764, + "grad_norm": 4.680422689180021, + "learning_rate": 2.398081534772182e-07, + "logits/chosen": -0.7583560347557068, + "logits/rejected": -0.7128076553344727, + "logps/chosen": -315.15093994140625, + "logps/rejected": -300.1588134765625, + "loss": 0.685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0010681712301447988, + "rewards/margins": 0.02815566025674343, + "rewards/rejected": -0.029223833233118057, + "step": 200 + }, + { + "epoch": 0.05038387715930902, + "grad_norm": 4.9609467030055505, + "learning_rate": 2.517985611510791e-07, + "logits/chosen": -0.7415071725845337, + "logits/rejected": -0.7684369683265686, + "logps/chosen": -241.5952911376953, + "logps/rejected": -265.6112976074219, + "loss": 0.6852, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.006404706742614508, + "rewards/margins": 0.019062474370002747, + "rewards/rejected": -0.02546718157827854, + "step": 210 + }, + { + "epoch": 0.052783109404990404, + "grad_norm": 4.801299794937751, + "learning_rate": 2.637889688249401e-07, + "logits/chosen": -0.6760513186454773, + "logits/rejected": -0.6948543190956116, + "logps/chosen": -311.57281494140625, + "logps/rejected": -320.0372619628906, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010622961446642876, + "rewards/margins": 0.013131847605109215, + "rewards/rejected": -0.02375480905175209, + "step": 220 + }, + { + "epoch": 0.05518234165067178, + "grad_norm": 5.226626430555928, + "learning_rate": 2.7577937649880093e-07, + "logits/chosen": -0.6577489376068115, + "logits/rejected": -0.5907109975814819, + "logps/chosen": -237.6232147216797, + "logps/rejected": -272.1260986328125, + "loss": 0.6815, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.017997443675994873, + "rewards/margins": 0.015421544201672077, + "rewards/rejected": -0.033418990671634674, + "step": 230 + }, + { + "epoch": 0.05758157389635317, + "grad_norm": 5.781159666071322, + "learning_rate": 2.8776978417266184e-07, + "logits/chosen": -0.6846515536308289, + "logits/rejected": -0.7044352293014526, + "logps/chosen": -290.9536437988281, + "logps/rejected": -245.861083984375, + "loss": 0.6786, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.00569694209843874, + "rewards/margins": 0.03246745467185974, + "rewards/rejected": -0.038164399564266205, + "step": 240 + }, + { + "epoch": 0.05998080614203455, + "grad_norm": 5.24934898048967, + "learning_rate": 2.997601918465228e-07, + "logits/chosen": -0.6797415614128113, + "logits/rejected": -0.6883940696716309, + "logps/chosen": -233.06948852539062, + "logps/rejected": -224.38671875, + "loss": 0.6791, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0182084571570158, + "rewards/margins": 0.017979206517338753, + "rewards/rejected": -0.03618766739964485, + "step": 250 + }, + { + "epoch": 0.06238003838771593, + "grad_norm": 4.992367199190442, + "learning_rate": 3.1175059952038366e-07, + "logits/chosen": -0.7312067747116089, + "logits/rejected": -0.6521024703979492, + "logps/chosen": -271.38824462890625, + "logps/rejected": -275.63653564453125, + "loss": 0.6747, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02519085630774498, + "rewards/margins": 0.04188547283411026, + "rewards/rejected": -0.06707633286714554, + "step": 260 + }, + { + "epoch": 0.0647792706333973, + "grad_norm": 5.17440656592256, + "learning_rate": 3.2374100719424457e-07, + "logits/chosen": -0.6548904180526733, + "logits/rejected": -0.7951699495315552, + "logps/chosen": -282.63873291015625, + "logps/rejected": -225.5301971435547, + "loss": 0.6735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02142253890633583, + "rewards/margins": 0.012398405000567436, + "rewards/rejected": -0.033820949494838715, + "step": 270 + }, + { + "epoch": 0.0671785028790787, + "grad_norm": 5.277296067925045, + "learning_rate": 3.3573141486810554e-07, + "logits/chosen": -0.7172076106071472, + "logits/rejected": -0.6975899934768677, + "logps/chosen": -296.5246887207031, + "logps/rejected": -288.5944519042969, + "loss": 0.6668, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.018556680530309677, + "rewards/margins": 0.05493815615773201, + "rewards/rejected": -0.07349482923746109, + "step": 280 + }, + { + "epoch": 0.06957773512476008, + "grad_norm": 4.781163975921149, + "learning_rate": 3.477218225419664e-07, + "logits/chosen": -0.6228010654449463, + "logits/rejected": -0.5732084512710571, + "logps/chosen": -288.7183532714844, + "logps/rejected": -263.90496826171875, + "loss": 0.6668, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.021407321095466614, + "rewards/margins": 0.04037974029779434, + "rewards/rejected": -0.061787061393260956, + "step": 290 + }, + { + "epoch": 0.07197696737044146, + "grad_norm": 5.423349784612897, + "learning_rate": 3.597122302158273e-07, + "logits/chosen": -0.7558736801147461, + "logits/rejected": -0.7680533528327942, + "logps/chosen": -264.5168762207031, + "logps/rejected": -289.5086669921875, + "loss": 0.6685, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04736015945672989, + "rewards/margins": 0.04246506839990616, + "rewards/rejected": -0.08982523530721664, + "step": 300 + }, + { + "epoch": 0.07437619961612284, + "grad_norm": 5.058813842555934, + "learning_rate": 3.7170263788968827e-07, + "logits/chosen": -0.6308411359786987, + "logits/rejected": -0.6718063950538635, + "logps/chosen": -270.92779541015625, + "logps/rejected": -247.31472778320312, + "loss": 0.6721, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.044375158846378326, + "rewards/margins": 0.06773975491523743, + "rewards/rejected": -0.11211492121219635, + "step": 310 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 4.805850462678792, + "learning_rate": 3.836930455635491e-07, + "logits/chosen": -0.7107186913490295, + "logits/rejected": -0.729761004447937, + "logps/chosen": -273.9128723144531, + "logps/rejected": -246.8804168701172, + "loss": 0.665, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05261250585317612, + "rewards/margins": 0.05193439871072769, + "rewards/rejected": -0.10454690456390381, + "step": 320 + }, + { + "epoch": 0.07917466410748561, + "grad_norm": 4.995629742536248, + "learning_rate": 3.9568345323741003e-07, + "logits/chosen": -0.6203581094741821, + "logits/rejected": -0.5326763391494751, + "logps/chosen": -261.4637756347656, + "logps/rejected": -304.72015380859375, + "loss": 0.6605, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09535319358110428, + "rewards/margins": 0.07053264230489731, + "rewards/rejected": -0.1658858358860016, + "step": 330 + }, + { + "epoch": 0.08157389635316699, + "grad_norm": 4.930244515909295, + "learning_rate": 4.07673860911271e-07, + "logits/chosen": -0.5719800591468811, + "logits/rejected": -0.576286792755127, + "logps/chosen": -240.22964477539062, + "logps/rejected": -272.12823486328125, + "loss": 0.6597, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05066823214292526, + "rewards/margins": 0.10919372737407684, + "rewards/rejected": -0.1598619669675827, + "step": 340 + }, + { + "epoch": 0.08397312859884837, + "grad_norm": 5.465735130601733, + "learning_rate": 4.1966426858513185e-07, + "logits/chosen": -0.7455052137374878, + "logits/rejected": -0.7238417863845825, + "logps/chosen": -306.32135009765625, + "logps/rejected": -303.4272766113281, + "loss": 0.661, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10594834387302399, + "rewards/margins": 0.061454661190509796, + "rewards/rejected": -0.16740299761295319, + "step": 350 + }, + { + "epoch": 0.08637236084452975, + "grad_norm": 5.474448104756549, + "learning_rate": 4.3165467625899276e-07, + "logits/chosen": -0.6257158517837524, + "logits/rejected": -0.6943267583847046, + "logps/chosen": -263.67681884765625, + "logps/rejected": -227.45315551757812, + "loss": 0.6617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10281842947006226, + "rewards/margins": 0.06660200655460358, + "rewards/rejected": -0.16942045092582703, + "step": 360 + }, + { + "epoch": 0.08877159309021113, + "grad_norm": 5.5768966423340265, + "learning_rate": 4.436450839328537e-07, + "logits/chosen": -0.6406761407852173, + "logits/rejected": -0.6252005696296692, + "logps/chosen": -254.68783569335938, + "logps/rejected": -282.90228271484375, + "loss": 0.6526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13191966712474823, + "rewards/margins": 0.1186646968126297, + "rewards/rejected": -0.25058436393737793, + "step": 370 + }, + { + "epoch": 0.09117082533589252, + "grad_norm": 4.831375981831261, + "learning_rate": 4.556354916067146e-07, + "logits/chosen": -0.6866484880447388, + "logits/rejected": -0.633335530757904, + "logps/chosen": -240.5032196044922, + "logps/rejected": -261.5648498535156, + "loss": 0.6392, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10136518627405167, + "rewards/margins": 0.14594808220863342, + "rewards/rejected": -0.2473132610321045, + "step": 380 + }, + { + "epoch": 0.0935700575815739, + "grad_norm": 5.472261214448905, + "learning_rate": 4.676258992805755e-07, + "logits/chosen": -0.564619243144989, + "logits/rejected": -0.5628719329833984, + "logps/chosen": -286.52899169921875, + "logps/rejected": -262.25323486328125, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15715694427490234, + "rewards/margins": 0.07161404192447662, + "rewards/rejected": -0.22877097129821777, + "step": 390 + }, + { + "epoch": 0.09596928982725528, + "grad_norm": 5.347372222074903, + "learning_rate": 4.796163069544364e-07, + "logits/chosen": -0.6121346354484558, + "logits/rejected": -0.6788171529769897, + "logps/chosen": -266.82220458984375, + "logps/rejected": -270.36724853515625, + "loss": 0.6421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1677761971950531, + "rewards/margins": 0.17789766192436218, + "rewards/rejected": -0.3456738591194153, + "step": 400 + }, + { + "epoch": 0.09836852207293666, + "grad_norm": 5.340397422554379, + "learning_rate": 4.916067146282974e-07, + "logits/chosen": -0.6538274884223938, + "logits/rejected": -0.6384181976318359, + "logps/chosen": -270.4076843261719, + "logps/rejected": -313.26544189453125, + "loss": 0.6258, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1868169903755188, + "rewards/margins": 0.1523607075214386, + "rewards/rejected": -0.3391777276992798, + "step": 410 + }, + { + "epoch": 0.10076775431861804, + "grad_norm": 5.46694192660825, + "learning_rate": 4.999992108529978e-07, + "logits/chosen": -0.585496723651886, + "logits/rejected": -0.5861325263977051, + "logps/chosen": -343.7078857421875, + "logps/rejected": -326.8913269042969, + "loss": 0.6332, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23270578682422638, + "rewards/margins": 0.18326610326766968, + "rewards/rejected": -0.41597190499305725, + "step": 420 + }, + { + "epoch": 0.10316698656429943, + "grad_norm": 5.835218906444854, + "learning_rate": 4.999851817115532e-07, + "logits/chosen": -0.7467209100723267, + "logits/rejected": -0.6660154461860657, + "logps/chosen": -266.6215515136719, + "logps/rejected": -292.622802734375, + "loss": 0.6348, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22090856730937958, + "rewards/margins": 0.29071298241615295, + "rewards/rejected": -0.5116215348243713, + "step": 430 + }, + { + "epoch": 0.10556621880998081, + "grad_norm": 5.421070680650146, + "learning_rate": 4.999536171027889e-07, + "logits/chosen": -0.5800718069076538, + "logits/rejected": -0.6239966154098511, + "logps/chosen": -310.7662353515625, + "logps/rejected": -315.01727294921875, + "loss": 0.6241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.30067840218544006, + "rewards/margins": 0.14732010662555695, + "rewards/rejected": -0.4479985237121582, + "step": 440 + }, + { + "epoch": 0.10796545105566219, + "grad_norm": 6.239774740521307, + "learning_rate": 4.999045192408369e-07, + "logits/chosen": -0.5860768556594849, + "logits/rejected": -0.5695077180862427, + "logps/chosen": -266.48162841796875, + "logps/rejected": -264.7235107421875, + "loss": 0.6248, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3346542716026306, + "rewards/margins": 0.12011837959289551, + "rewards/rejected": -0.4547726511955261, + "step": 450 + }, + { + "epoch": 0.11036468330134357, + "grad_norm": 6.029927081598656, + "learning_rate": 4.998378915697171e-07, + "logits/chosen": -0.6232699155807495, + "logits/rejected": -0.6357511878013611, + "logps/chosen": -296.1357116699219, + "logps/rejected": -316.9218444824219, + "loss": 0.6016, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28828713297843933, + "rewards/margins": 0.2776426374912262, + "rewards/rejected": -0.5659297704696655, + "step": 460 + }, + { + "epoch": 0.11276391554702495, + "grad_norm": 5.6604945230319625, + "learning_rate": 4.997537387630958e-07, + "logits/chosen": -0.5975057482719421, + "logits/rejected": -0.6152404546737671, + "logps/chosen": -235.7269744873047, + "logps/rejected": -265.6780090332031, + "loss": 0.5969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2783138155937195, + "rewards/margins": 0.27526405453681946, + "rewards/rejected": -0.5535778403282166, + "step": 470 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 6.693255407381944, + "learning_rate": 4.996520667239582e-07, + "logits/chosen": -0.7205396294593811, + "logits/rejected": -0.6762118339538574, + "logps/chosen": -265.1204833984375, + "logps/rejected": -349.3818054199219, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3487080931663513, + "rewards/margins": 0.32969897985458374, + "rewards/rejected": -0.6784070730209351, + "step": 480 + }, + { + "epoch": 0.11756238003838772, + "grad_norm": 6.713235956519299, + "learning_rate": 4.995328825841939e-07, + "logits/chosen": -0.5713664293289185, + "logits/rejected": -0.553689181804657, + "logps/chosen": -246.03598022460938, + "logps/rejected": -300.48797607421875, + "loss": 0.5929, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.35947948694229126, + "rewards/margins": 0.5059111714363098, + "rewards/rejected": -0.8653906583786011, + "step": 490 + }, + { + "epoch": 0.1199616122840691, + "grad_norm": 6.608316940106426, + "learning_rate": 4.993961947040967e-07, + "logits/chosen": -0.5497556924819946, + "logits/rejected": -0.5836997032165527, + "logps/chosen": -330.4705505371094, + "logps/rejected": -313.43511962890625, + "loss": 0.6159, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5261253118515015, + "rewards/margins": 0.1777980625629425, + "rewards/rejected": -0.7039234042167664, + "step": 500 + }, + { + "epoch": 0.12236084452975048, + "grad_norm": 6.046831260369165, + "learning_rate": 4.992420126717784e-07, + "logits/chosen": -0.5983260869979858, + "logits/rejected": -0.5698710680007935, + "logps/chosen": -275.0146789550781, + "logps/rejected": -332.9678039550781, + "loss": 0.5951, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3116165101528168, + "rewards/margins": 0.5498673319816589, + "rewards/rejected": -0.8614838719367981, + "step": 510 + }, + { + "epoch": 0.12476007677543186, + "grad_norm": 7.348411835249425, + "learning_rate": 4.990703473024958e-07, + "logits/chosen": -0.5138384103775024, + "logits/rejected": -0.5233681201934814, + "logps/chosen": -332.6457214355469, + "logps/rejected": -353.2841796875, + "loss": 0.621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5376469492912292, + "rewards/margins": 0.28140488266944885, + "rewards/rejected": -0.8190518617630005, + "step": 520 + }, + { + "epoch": 0.12715930902111325, + "grad_norm": 7.638245462015975, + "learning_rate": 4.98881210637893e-07, + "logits/chosen": -0.6331408619880676, + "logits/rejected": -0.5843578577041626, + "logps/chosen": -253.6781005859375, + "logps/rejected": -326.880126953125, + "loss": 0.6034, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44268542528152466, + "rewards/margins": 0.36151397228240967, + "rewards/rejected": -0.8041993379592896, + "step": 530 + }, + { + "epoch": 0.1295585412667946, + "grad_norm": 5.437673001258359, + "learning_rate": 4.986746159451553e-07, + "logits/chosen": -0.5594351887702942, + "logits/rejected": -0.5495598912239075, + "logps/chosen": -293.2762756347656, + "logps/rejected": -318.1295166015625, + "loss": 0.6058, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.37716802954673767, + "rewards/margins": 0.341768354177475, + "rewards/rejected": -0.7189363837242126, + "step": 540 + }, + { + "epoch": 0.131957773512476, + "grad_norm": 5.462303995641746, + "learning_rate": 4.984505777160795e-07, + "logits/chosen": -0.499727725982666, + "logits/rejected": -0.5212177038192749, + "logps/chosen": -356.2312927246094, + "logps/rejected": -389.5164794921875, + "loss": 0.6123, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5117184519767761, + "rewards/margins": 0.31264665722846985, + "rewards/rejected": -0.8243652582168579, + "step": 550 + }, + { + "epoch": 0.1343570057581574, + "grad_norm": 6.563474963984782, + "learning_rate": 4.982091116660574e-07, + "logits/chosen": -0.6962921023368835, + "logits/rejected": -0.7188105583190918, + "logps/chosen": -248.63955688476562, + "logps/rejected": -241.68521118164062, + "loss": 0.6252, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5236629843711853, + "rewards/margins": 0.1912154257297516, + "rewards/rejected": -0.7148783802986145, + "step": 560 + }, + { + "epoch": 0.13675623800383876, + "grad_norm": 7.806740806376691, + "learning_rate": 4.979502347329732e-07, + "logits/chosen": -0.531540036201477, + "logits/rejected": -0.5084825754165649, + "logps/chosen": -359.24871826171875, + "logps/rejected": -422.90606689453125, + "loss": 0.6116, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6582085490226746, + "rewards/margins": 0.3848935067653656, + "rewards/rejected": -1.0431021451950073, + "step": 570 + }, + { + "epoch": 0.13915547024952016, + "grad_norm": 8.43363363786984, + "learning_rate": 4.976739650760151e-07, + "logits/chosen": -0.6741775274276733, + "logits/rejected": -0.6754758358001709, + "logps/chosen": -318.57708740234375, + "logps/rejected": -327.61669921875, + "loss": 0.5905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5984118580818176, + "rewards/margins": 0.26561444997787476, + "rewards/rejected": -0.8640263676643372, + "step": 580 + }, + { + "epoch": 0.14155470249520152, + "grad_norm": 8.305901855269768, + "learning_rate": 4.97380322074402e-07, + "logits/chosen": -0.5298658609390259, + "logits/rejected": -0.5446540117263794, + "logps/chosen": -281.94403076171875, + "logps/rejected": -312.7386474609375, + "loss": 0.6096, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6375762820243835, + "rewards/margins": 0.3014167249202728, + "rewards/rejected": -0.9389930963516235, + "step": 590 + }, + { + "epoch": 0.14395393474088292, + "grad_norm": 6.932480462761205, + "learning_rate": 4.970693263260237e-07, + "logits/chosen": -0.6071778535842896, + "logits/rejected": -0.6448204517364502, + "logps/chosen": -334.240478515625, + "logps/rejected": -351.1219177246094, + "loss": 0.6118, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5482883453369141, + "rewards/margins": 0.4772118031978607, + "rewards/rejected": -1.0255001783370972, + "step": 600 + }, + { + "epoch": 0.1463531669865643, + "grad_norm": 7.988462606950694, + "learning_rate": 4.967409996459966e-07, + "logits/chosen": -0.6251802444458008, + "logits/rejected": -0.6402324438095093, + "logps/chosen": -337.7198486328125, + "logps/rejected": -350.0640563964844, + "loss": 0.5869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5524300932884216, + "rewards/margins": 0.395042359828949, + "rewards/rejected": -0.9474723935127258, + "step": 610 + }, + { + "epoch": 0.14875239923224567, + "grad_norm": 6.807256286481946, + "learning_rate": 4.963953650651326e-07, + "logits/chosen": -0.5530382394790649, + "logits/rejected": -0.5686200857162476, + "logps/chosen": -411.8724670410156, + "logps/rejected": -351.87310791015625, + "loss": 0.5886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6493805646896362, + "rewards/margins": 0.2970190942287445, + "rewards/rejected": -0.9463998079299927, + "step": 620 + }, + { + "epoch": 0.15115163147792707, + "grad_norm": 6.66103506943935, + "learning_rate": 4.960324468283248e-07, + "logits/chosen": -0.7188149690628052, + "logits/rejected": -0.7464720010757446, + "logps/chosen": -291.5167541503906, + "logps/rejected": -326.8276672363281, + "loss": 0.5613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6833261251449585, + "rewards/margins": 0.3538287281990051, + "rewards/rejected": -1.0371549129486084, + "step": 630 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 7.385295168977064, + "learning_rate": 4.956522703928451e-07, + "logits/chosen": -0.7066096663475037, + "logits/rejected": -0.6415206789970398, + "logps/chosen": -306.70269775390625, + "logps/rejected": -343.36724853515625, + "loss": 0.5626, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7203932404518127, + "rewards/margins": 0.3621978163719177, + "rewards/rejected": -1.082590937614441, + "step": 640 + }, + { + "epoch": 0.15595009596928983, + "grad_norm": 9.96633693590498, + "learning_rate": 4.952548624265606e-07, + "logits/chosen": -0.5960395336151123, + "logits/rejected": -0.5949414372444153, + "logps/chosen": -375.6698303222656, + "logps/rejected": -379.85662841796875, + "loss": 0.6024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9392830729484558, + "rewards/margins": 0.2532385289669037, + "rewards/rejected": -1.1925214529037476, + "step": 650 + }, + { + "epoch": 0.15834932821497122, + "grad_norm": 7.433452911452113, + "learning_rate": 4.948402508060607e-07, + "logits/chosen": -0.6946985721588135, + "logits/rejected": -0.714805006980896, + "logps/chosen": -307.7847900390625, + "logps/rejected": -352.8008117675781, + "loss": 0.6016, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7437388896942139, + "rewards/margins": 0.5146031379699707, + "rewards/rejected": -1.2583420276641846, + "step": 660 + }, + { + "epoch": 0.16074856046065258, + "grad_norm": 8.260908471494544, + "learning_rate": 4.944084646147038e-07, + "logits/chosen": -0.6452184319496155, + "logits/rejected": -0.6763893961906433, + "logps/chosen": -397.63629150390625, + "logps/rejected": -404.8137512207031, + "loss": 0.6115, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8227389454841614, + "rewards/margins": 0.29742100834846497, + "rewards/rejected": -1.1201599836349487, + "step": 670 + }, + { + "epoch": 0.16314779270633398, + "grad_norm": 8.266084905632274, + "learning_rate": 4.939595341405754e-07, + "logits/chosen": -0.7573758363723755, + "logits/rejected": -0.780553936958313, + "logps/chosen": -331.1211853027344, + "logps/rejected": -373.88775634765625, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7678017616271973, + "rewards/margins": 0.4917237162590027, + "rewards/rejected": -1.2595255374908447, + "step": 680 + }, + { + "epoch": 0.16554702495201534, + "grad_norm": 7.625957912581509, + "learning_rate": 4.93493490874365e-07, + "logits/chosen": -0.6451135873794556, + "logits/rejected": -0.6591531038284302, + "logps/chosen": -325.8547058105469, + "logps/rejected": -366.7559814453125, + "loss": 0.5467, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.821052074432373, + "rewards/margins": 0.33492714166641235, + "rewards/rejected": -1.1559793949127197, + "step": 690 + }, + { + "epoch": 0.16794625719769674, + "grad_norm": 10.17202506828973, + "learning_rate": 4.93010367507156e-07, + "logits/chosen": -0.7482548356056213, + "logits/rejected": -0.7481337189674377, + "logps/chosen": -276.33294677734375, + "logps/rejected": -313.0743103027344, + "loss": 0.5607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.673913300037384, + "rewards/margins": 0.6004130244255066, + "rewards/rejected": -1.2743263244628906, + "step": 700 + }, + { + "epoch": 0.17034548944337813, + "grad_norm": 9.522996639673643, + "learning_rate": 4.925101979281332e-07, + "logits/chosen": -0.6719304919242859, + "logits/rejected": -0.7497730851173401, + "logps/chosen": -369.2864685058594, + "logps/rejected": -377.1894226074219, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7069920301437378, + "rewards/margins": 0.6167112588882446, + "rewards/rejected": -1.3237032890319824, + "step": 710 + }, + { + "epoch": 0.1727447216890595, + "grad_norm": 7.878026358158539, + "learning_rate": 4.919930172222054e-07, + "logits/chosen": -0.7336487174034119, + "logits/rejected": -0.7846344709396362, + "logps/chosen": -344.8207702636719, + "logps/rejected": -387.56201171875, + "loss": 0.5369, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8977760076522827, + "rewards/margins": 0.49707236886024475, + "rewards/rejected": -1.3948484659194946, + "step": 720 + }, + { + "epoch": 0.1751439539347409, + "grad_norm": 8.810480029842584, + "learning_rate": 4.914588616675445e-07, + "logits/chosen": -0.8329795598983765, + "logits/rejected": -0.8481542468070984, + "logps/chosen": -279.27081298828125, + "logps/rejected": -337.36883544921875, + "loss": 0.5906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.61821049451828, + "rewards/margins": 0.4857397675514221, + "rewards/rejected": -1.1039502620697021, + "step": 730 + }, + { + "epoch": 0.17754318618042225, + "grad_norm": 9.762970453211091, + "learning_rate": 4.909077687330404e-07, + "logits/chosen": -0.6972378492355347, + "logits/rejected": -0.7455834150314331, + "logps/chosen": -354.2886657714844, + "logps/rejected": -357.53558349609375, + "loss": 0.553, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7641565799713135, + "rewards/margins": 0.3690626621246338, + "rewards/rejected": -1.1332192420959473, + "step": 740 + }, + { + "epoch": 0.17994241842610365, + "grad_norm": 9.234397345125467, + "learning_rate": 4.903397770756729e-07, + "logits/chosen": -0.7595505714416504, + "logits/rejected": -0.7833656668663025, + "logps/chosen": -351.7059020996094, + "logps/rejected": -407.5093688964844, + "loss": 0.5617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8937844038009644, + "rewards/margins": 0.57142174243927, + "rewards/rejected": -1.4652063846588135, + "step": 750 + }, + { + "epoch": 0.18234165067178504, + "grad_norm": 7.71688649402669, + "learning_rate": 4.897549265378004e-07, + "logits/chosen": -0.7180362939834595, + "logits/rejected": -0.7230840921401978, + "logps/chosen": -430.9496154785156, + "logps/rejected": -468.16937255859375, + "loss": 0.5631, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.040961503982544, + "rewards/margins": 0.4251536428928375, + "rewards/rejected": -1.4661149978637695, + "step": 760 + }, + { + "epoch": 0.1847408829174664, + "grad_norm": 8.153325048910528, + "learning_rate": 4.891532581443643e-07, + "logits/chosen": -0.8323251008987427, + "logits/rejected": -0.8525883555412292, + "logps/chosen": -381.8115234375, + "logps/rejected": -471.56719970703125, + "loss": 0.5333, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.7869538068771362, + "rewards/margins": 0.9414682388305664, + "rewards/rejected": -1.7284221649169922, + "step": 770 + }, + { + "epoch": 0.1871401151631478, + "grad_norm": 8.422916796673588, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -0.7381910085678101, + "logits/rejected": -0.7251767516136169, + "logps/chosen": -332.1246032714844, + "logps/rejected": -410.90057373046875, + "loss": 0.5641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8256322741508484, + "rewards/margins": 0.6119931936264038, + "rewards/rejected": -1.4376256465911865, + "step": 780 + }, + { + "epoch": 0.18953934740882916, + "grad_norm": 9.466411263035182, + "learning_rate": 4.878996377861367e-07, + "logits/chosen": -0.86748206615448, + "logits/rejected": -0.8805437088012695, + "logps/chosen": -308.0349426269531, + "logps/rejected": -359.1808166503906, + "loss": 0.5249, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.915812611579895, + "rewards/margins": 0.4137847423553467, + "rewards/rejected": -1.3295972347259521, + "step": 790 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 9.035814907343086, + "learning_rate": 4.872477737578327e-07, + "logits/chosen": -0.830212414264679, + "logits/rejected": -0.7413343787193298, + "logps/chosen": -373.04217529296875, + "logps/rejected": -468.3797302246094, + "loss": 0.5248, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.003941535949707, + "rewards/margins": 0.9732707738876343, + "rewards/rejected": -1.9772125482559204, + "step": 800 + }, + { + "epoch": 0.19433781190019195, + "grad_norm": 11.703896273776289, + "learning_rate": 4.865792677407718e-07, + "logits/chosen": -0.8188837766647339, + "logits/rejected": -0.8448683023452759, + "logps/chosen": -368.94512939453125, + "logps/rejected": -377.5952453613281, + "loss": 0.5712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1495884656906128, + "rewards/margins": 0.362571656703949, + "rewards/rejected": -1.512160062789917, + "step": 810 + }, + { + "epoch": 0.1967370441458733, + "grad_norm": 10.81943352397323, + "learning_rate": 4.858941666279955e-07, + "logits/chosen": -0.8488418459892273, + "logits/rejected": -0.880780816078186, + "logps/chosen": -353.0042419433594, + "logps/rejected": -379.8878479003906, + "loss": 0.5742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.79103684425354, + "rewards/margins": 0.4048077464103699, + "rewards/rejected": -1.1958444118499756, + "step": 820 + }, + { + "epoch": 0.1991362763915547, + "grad_norm": 8.925559856487725, + "learning_rate": 4.851925184766247e-07, + "logits/chosen": -0.8389931917190552, + "logits/rejected": -0.8778663873672485, + "logps/chosen": -348.4869079589844, + "logps/rejected": -395.1742248535156, + "loss": 0.5534, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9398209452629089, + "rewards/margins": 0.6886889934539795, + "rewards/rejected": -1.6285101175308228, + "step": 830 + }, + { + "epoch": 0.20153550863723607, + "grad_norm": 11.109685433240054, + "learning_rate": 4.844743725044897e-07, + "logits/chosen": -0.8412739038467407, + "logits/rejected": -0.9312038421630859, + "logps/chosen": -339.4559631347656, + "logps/rejected": -370.5246887207031, + "loss": 0.5445, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9467164278030396, + "rewards/margins": 0.5613173246383667, + "rewards/rejected": -1.5080337524414062, + "step": 840 + }, + { + "epoch": 0.20393474088291746, + "grad_norm": 10.72753800655601, + "learning_rate": 4.837397790866774e-07, + "logits/chosen": -0.8491243124008179, + "logits/rejected": -0.8605045080184937, + "logps/chosen": -366.4127502441406, + "logps/rejected": -440.4471130371094, + "loss": 0.5521, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7613908052444458, + "rewards/margins": 0.9519069790840149, + "rewards/rejected": -1.7132980823516846, + "step": 850 + }, + { + "epoch": 0.20633397312859886, + "grad_norm": 10.004722580523001, + "learning_rate": 4.829887897519974e-07, + "logits/chosen": -0.9373795390129089, + "logits/rejected": -0.9096584320068359, + "logps/chosen": -319.7113952636719, + "logps/rejected": -403.3350524902344, + "loss": 0.5559, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8845510482788086, + "rewards/margins": 0.5768999457359314, + "rewards/rejected": -1.4614509344100952, + "step": 860 + }, + { + "epoch": 0.20873320537428022, + "grad_norm": 9.07756678312938, + "learning_rate": 4.82221457179368e-07, + "logits/chosen": -0.869964599609375, + "logits/rejected": -0.871699333190918, + "logps/chosen": -354.28704833984375, + "logps/rejected": -417.8169860839844, + "loss": 0.5254, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8736898303031921, + "rewards/margins": 0.754965603351593, + "rewards/rejected": -1.6286554336547852, + "step": 870 + }, + { + "epoch": 0.21113243761996162, + "grad_norm": 11.276381279159365, + "learning_rate": 4.814378351941206e-07, + "logits/chosen": -0.8650039434432983, + "logits/rejected": -0.8803671002388, + "logps/chosen": -343.48822021484375, + "logps/rejected": -381.94207763671875, + "loss": 0.5619, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9373790621757507, + "rewards/margins": 0.47534435987472534, + "rewards/rejected": -1.4127235412597656, + "step": 880 + }, + { + "epoch": 0.21353166986564298, + "grad_norm": 10.076876257674526, + "learning_rate": 4.806379787642241e-07, + "logits/chosen": -0.8381707072257996, + "logits/rejected": -0.8108996152877808, + "logps/chosen": -331.9487609863281, + "logps/rejected": -403.20867919921875, + "loss": 0.5839, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8763322830200195, + "rewards/margins": 0.624114990234375, + "rewards/rejected": -1.500447392463684, + "step": 890 + }, + { + "epoch": 0.21593090211132437, + "grad_norm": 10.034596459189128, + "learning_rate": 4.798219439964293e-07, + "logits/chosen": -0.8501941561698914, + "logits/rejected": -0.8989803194999695, + "logps/chosen": -343.5841979980469, + "logps/rejected": -392.810791015625, + "loss": 0.5347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0320571660995483, + "rewards/margins": 0.36501601338386536, + "rewards/rejected": -1.3970732688903809, + "step": 900 + }, + { + "epoch": 0.21833013435700577, + "grad_norm": 10.27150503174172, + "learning_rate": 4.78989788132333e-07, + "logits/chosen": -0.902818500995636, + "logits/rejected": -0.8824566006660461, + "logps/chosen": -300.7974853515625, + "logps/rejected": -388.55126953125, + "loss": 0.5081, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8453769683837891, + "rewards/margins": 0.8020466566085815, + "rewards/rejected": -1.647423505783081, + "step": 910 + }, + { + "epoch": 0.22072936660268713, + "grad_norm": 8.577531584564799, + "learning_rate": 4.781415695443631e-07, + "logits/chosen": -0.7745347619056702, + "logits/rejected": -0.7863970398902893, + "logps/chosen": -420.1585388183594, + "logps/rejected": -460.1143493652344, + "loss": 0.5703, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2678101062774658, + "rewards/margins": 0.29927223920822144, + "rewards/rejected": -1.5670822858810425, + "step": 920 + }, + { + "epoch": 0.22312859884836853, + "grad_norm": 7.712609609832512, + "learning_rate": 4.772773477316836e-07, + "logits/chosen": -0.7534626722335815, + "logits/rejected": -0.7615676522254944, + "logps/chosen": -395.2740478515625, + "logps/rejected": -448.7425231933594, + "loss": 0.5444, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.115117073059082, + "rewards/margins": 0.5547926425933838, + "rewards/rejected": -1.6699097156524658, + "step": 930 + }, + { + "epoch": 0.2255278310940499, + "grad_norm": 14.001350284298912, + "learning_rate": 4.7639718331602117e-07, + "logits/chosen": -0.7357865571975708, + "logits/rejected": -0.7330624461174011, + "logps/chosen": -353.99200439453125, + "logps/rejected": -431.35565185546875, + "loss": 0.5372, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8342105150222778, + "rewards/margins": 0.878664493560791, + "rewards/rejected": -1.7128750085830688, + "step": 940 + }, + { + "epoch": 0.22792706333973128, + "grad_norm": 10.221546212484748, + "learning_rate": 4.7550113803741275e-07, + "logits/chosen": -0.7402353882789612, + "logits/rejected": -0.8165884017944336, + "logps/chosen": -379.7347717285156, + "logps/rejected": -362.58721923828125, + "loss": 0.5637, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0031925439834595, + "rewards/margins": 0.4585431218147278, + "rewards/rejected": -1.461735486984253, + "step": 950 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 10.091860887109807, + "learning_rate": 4.7458927474987454e-07, + "logits/chosen": -0.699491560459137, + "logits/rejected": -0.7056708931922913, + "logps/chosen": -411.66754150390625, + "logps/rejected": -392.9963073730469, + "loss": 0.5194, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8979185223579407, + "rewards/margins": 0.4219675064086914, + "rewards/rejected": -1.3198859691619873, + "step": 960 + }, + { + "epoch": 0.23272552783109404, + "grad_norm": 10.700825723854395, + "learning_rate": 4.7366165741699347e-07, + "logits/chosen": -0.7538058161735535, + "logits/rejected": -0.7752776145935059, + "logps/chosen": -426.29840087890625, + "logps/rejected": -456.43865966796875, + "loss": 0.5367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0425876379013062, + "rewards/margins": 0.5545080900192261, + "rewards/rejected": -1.5970958471298218, + "step": 970 + }, + { + "epoch": 0.23512476007677544, + "grad_norm": 9.318743253411762, + "learning_rate": 4.727183511074401e-07, + "logits/chosen": -0.8688480257987976, + "logits/rejected": -0.8663204908370972, + "logps/chosen": -376.7490234375, + "logps/rejected": -404.92816162109375, + "loss": 0.5436, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.009129285812378, + "rewards/margins": 0.382347047328949, + "rewards/rejected": -1.3914763927459717, + "step": 980 + }, + { + "epoch": 0.2375239923224568, + "grad_norm": 10.050271627816938, + "learning_rate": 4.717594219904043e-07, + "logits/chosen": -0.7565699815750122, + "logits/rejected": -0.805103600025177, + "logps/chosen": -377.40472412109375, + "logps/rejected": -379.1291809082031, + "loss": 0.5345, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0759207010269165, + "rewards/margins": 0.45810168981552124, + "rewards/rejected": -1.534022331237793, + "step": 990 + }, + { + "epoch": 0.2399232245681382, + "grad_norm": 9.560726789203981, + "learning_rate": 4.7078493733095393e-07, + "logits/chosen": -0.9027126431465149, + "logits/rejected": -0.9078402519226074, + "logps/chosen": -370.6812744140625, + "logps/rejected": -446.0091247558594, + "loss": 0.5318, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.204393982887268, + "rewards/margins": 0.6276249289512634, + "rewards/rejected": -1.8320188522338867, + "step": 1000 + }, + { + "epoch": 0.2423224568138196, + "grad_norm": 9.375367945145504, + "learning_rate": 4.6979496548531614e-07, + "logits/chosen": -0.8843992352485657, + "logits/rejected": -0.8497310876846313, + "logps/chosen": -399.2445373535156, + "logps/rejected": -512.7918701171875, + "loss": 0.5464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4072428941726685, + "rewards/margins": 0.6233394742012024, + "rewards/rejected": -2.0305821895599365, + "step": 1010 + }, + { + "epoch": 0.24472168905950095, + "grad_norm": 9.752181299642505, + "learning_rate": 4.6878957589608293e-07, + "logits/chosen": -0.8666139841079712, + "logits/rejected": -0.8512627482414246, + "logps/chosen": -390.6292724609375, + "logps/rejected": -506.28228759765625, + "loss": 0.5451, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2309002876281738, + "rewards/margins": 0.7173486351966858, + "rewards/rejected": -1.9482488632202148, + "step": 1020 + }, + { + "epoch": 0.24712092130518235, + "grad_norm": 9.424652839992445, + "learning_rate": 4.6776883908733956e-07, + "logits/chosen": -0.9692492485046387, + "logits/rejected": -1.0165684223175049, + "logps/chosen": -394.48858642578125, + "logps/rejected": -401.189697265625, + "loss": 0.5154, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0245649814605713, + "rewards/margins": 0.7163118720054626, + "rewards/rejected": -1.7408767938613892, + "step": 1030 + }, + { + "epoch": 0.2495201535508637, + "grad_norm": 12.277316191566172, + "learning_rate": 4.667328266597178e-07, + "logits/chosen": -0.8959840536117554, + "logits/rejected": -0.9235955476760864, + "logps/chosen": -361.2985534667969, + "logps/rejected": -423.58233642578125, + "loss": 0.5001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9520134925842285, + "rewards/margins": 0.6992291808128357, + "rewards/rejected": -1.6512426137924194, + "step": 1040 + }, + { + "epoch": 0.2519193857965451, + "grad_norm": 8.94451963423609, + "learning_rate": 4.6568161128537354e-07, + "logits/chosen": -0.8420774340629578, + "logits/rejected": -0.9481871724128723, + "logps/chosen": -380.89874267578125, + "logps/rejected": -393.58160400390625, + "loss": 0.5246, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2700251340866089, + "rewards/margins": 0.634412407875061, + "rewards/rejected": -1.9044376611709595, + "step": 1050 + }, + { + "epoch": 0.2543186180422265, + "grad_norm": 14.034213989285222, + "learning_rate": 4.6461526670288877e-07, + "logits/chosen": -0.8429055213928223, + "logits/rejected": -0.8352988958358765, + "logps/chosen": -404.8660888671875, + "logps/rejected": -438.3995056152344, + "loss": 0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1988664865493774, + "rewards/margins": 0.6478247046470642, + "rewards/rejected": -1.8466911315917969, + "step": 1060 + }, + { + "epoch": 0.2567178502879079, + "grad_norm": 13.373585055843934, + "learning_rate": 4.635338677120994e-07, + "logits/chosen": -0.9964144825935364, + "logits/rejected": -0.9940506815910339, + "logps/chosen": -377.0820617675781, + "logps/rejected": -486.6973571777344, + "loss": 0.4774, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1816045045852661, + "rewards/margins": 0.8956181406974792, + "rewards/rejected": -2.0772225856781006, + "step": 1070 + }, + { + "epoch": 0.2591170825335892, + "grad_norm": 11.284998984427544, + "learning_rate": 4.6243749016884835e-07, + "logits/chosen": -0.7764107584953308, + "logits/rejected": -0.8370550870895386, + "logps/chosen": -418.0027770996094, + "logps/rejected": -593.8638916015625, + "loss": 0.5251, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.493062138557434, + "rewards/margins": 1.221855640411377, + "rewards/rejected": -2.7149176597595215, + "step": 1080 + }, + { + "epoch": 0.2615163147792706, + "grad_norm": 16.100264142741796, + "learning_rate": 4.613262109796645e-07, + "logits/chosen": -0.9073816537857056, + "logits/rejected": -0.8367312550544739, + "logps/chosen": -432.54095458984375, + "logps/rejected": -573.1912841796875, + "loss": 0.5319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6614210605621338, + "rewards/margins": 0.9968196749687195, + "rewards/rejected": -2.658240556716919, + "step": 1090 + }, + { + "epoch": 0.263915547024952, + "grad_norm": 12.598181058336907, + "learning_rate": 4.602001080963678e-07, + "logits/chosen": -0.8620105981826782, + "logits/rejected": -0.8818934559822083, + "logps/chosen": -452.8822326660156, + "logps/rejected": -499.30694580078125, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6771061420440674, + "rewards/margins": 0.7576761841773987, + "rewards/rejected": -2.4347822666168213, + "step": 1100 + }, + { + "epoch": 0.2663147792706334, + "grad_norm": 11.020616319405283, + "learning_rate": 4.590592605106017e-07, + "logits/chosen": -0.925918698310852, + "logits/rejected": -0.9431027173995972, + "logps/chosen": -427.30279541015625, + "logps/rejected": -476.481201171875, + "loss": 0.5515, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2021554708480835, + "rewards/margins": 0.7562609910964966, + "rewards/rejected": -1.9584165811538696, + "step": 1110 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 11.43283460001136, + "learning_rate": 4.5790374824829165e-07, + "logits/chosen": -0.8499002456665039, + "logits/rejected": -0.8679935336112976, + "logps/chosen": -310.17327880859375, + "logps/rejected": -387.49249267578125, + "loss": 0.5063, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2178689241409302, + "rewards/margins": 0.6209184527397156, + "rewards/rejected": -1.8387874364852905, + "step": 1120 + }, + { + "epoch": 0.27111324376199614, + "grad_norm": 10.510361605616744, + "learning_rate": 4.5673365236403216e-07, + "logits/chosen": -0.882469654083252, + "logits/rejected": -0.9395925402641296, + "logps/chosen": -300.57720947265625, + "logps/rejected": -434.6649475097656, + "loss": 0.5097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0626921653747559, + "rewards/margins": 1.0293641090393066, + "rewards/rejected": -2.0920560359954834, + "step": 1130 + }, + { + "epoch": 0.27351247600767753, + "grad_norm": 11.418966211240742, + "learning_rate": 4.5554905493540075e-07, + "logits/chosen": -0.9051049947738647, + "logits/rejected": -0.880601704120636, + "logps/chosen": -340.77911376953125, + "logps/rejected": -454.90252685546875, + "loss": 0.4885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.225037932395935, + "rewards/margins": 1.0688270330429077, + "rewards/rejected": -2.2938647270202637, + "step": 1140 + }, + { + "epoch": 0.2759117082533589, + "grad_norm": 9.72460464054391, + "learning_rate": 4.5435003905720074e-07, + "logits/chosen": -0.8102267980575562, + "logits/rejected": -0.8643985986709595, + "logps/chosen": -390.0484619140625, + "logps/rejected": -450.08868408203125, + "loss": 0.5012, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0651195049285889, + "rewards/margins": 0.8835436701774597, + "rewards/rejected": -1.9486631155014038, + "step": 1150 + }, + { + "epoch": 0.2783109404990403, + "grad_norm": 13.360087004422779, + "learning_rate": 4.531366888356324e-07, + "logits/chosen": -0.8684479594230652, + "logits/rejected": -0.8134763836860657, + "logps/chosen": -311.7840881347656, + "logps/rejected": -458.7945861816406, + "loss": 0.4901, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3148356676101685, + "rewards/margins": 1.0154473781585693, + "rewards/rejected": -2.3302829265594482, + "step": 1160 + }, + { + "epoch": 0.2807101727447217, + "grad_norm": 17.680025948725728, + "learning_rate": 4.519090893823931e-07, + "logits/chosen": -0.830313503742218, + "logits/rejected": -0.8483401536941528, + "logps/chosen": -398.6609802246094, + "logps/rejected": -483.01348876953125, + "loss": 0.4966, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.591963291168213, + "rewards/margins": 0.8999356031417847, + "rewards/rejected": -2.491899013519287, + "step": 1170 + }, + { + "epoch": 0.28310940499040305, + "grad_norm": 10.953334824928836, + "learning_rate": 4.5066732680870734e-07, + "logits/chosen": -0.8117620348930359, + "logits/rejected": -0.855734646320343, + "logps/chosen": -401.9914245605469, + "logps/rejected": -469.42218017578125, + "loss": 0.5003, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4848546981811523, + "rewards/margins": 1.069037914276123, + "rewards/rejected": -2.5538926124572754, + "step": 1180 + }, + { + "epoch": 0.28550863723608444, + "grad_norm": 12.957936678071963, + "learning_rate": 4.494114882192862e-07, + "logits/chosen": -0.8582413792610168, + "logits/rejected": -0.8433802723884583, + "logps/chosen": -400.0009460449219, + "logps/rejected": -485.3779296875, + "loss": 0.4816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.388290524482727, + "rewards/margins": 1.1432268619537354, + "rewards/rejected": -2.531517267227173, + "step": 1190 + }, + { + "epoch": 0.28790786948176583, + "grad_norm": 12.79986874331055, + "learning_rate": 4.4814166170621735e-07, + "logits/chosen": -0.8589996099472046, + "logits/rejected": -0.8760782480239868, + "logps/chosen": -367.97552490234375, + "logps/rejected": -463.1073303222656, + "loss": 0.5092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2390401363372803, + "rewards/margins": 1.1767470836639404, + "rewards/rejected": -2.4157872200012207, + "step": 1200 + }, + { + "epoch": 0.2903071017274472, + "grad_norm": 12.234631147028441, + "learning_rate": 4.468579363427858e-07, + "logits/chosen": -0.8185877799987793, + "logits/rejected": -0.8393834829330444, + "logps/chosen": -394.2176513671875, + "logps/rejected": -454.900146484375, + "loss": 0.5058, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4075071811676025, + "rewards/margins": 0.8511323928833008, + "rewards/rejected": -2.2586395740509033, + "step": 1210 + }, + { + "epoch": 0.2927063339731286, + "grad_norm": 17.553407135322153, + "learning_rate": 4.4556040217722555e-07, + "logits/chosen": -0.8887416124343872, + "logits/rejected": -0.860831618309021, + "logps/chosen": -352.8416748046875, + "logps/rejected": -510.03546142578125, + "loss": 0.4867, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1481072902679443, + "rewards/margins": 1.2108467817306519, + "rewards/rejected": -2.3589539527893066, + "step": 1220 + }, + { + "epoch": 0.29510556621880996, + "grad_norm": 11.868156097873015, + "learning_rate": 4.442491502264033e-07, + "logits/chosen": -0.8076246976852417, + "logits/rejected": -0.8352873921394348, + "logps/chosen": -364.78826904296875, + "logps/rejected": -407.4501953125, + "loss": 0.5063, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4375666379928589, + "rewards/margins": 0.5911107659339905, + "rewards/rejected": -2.028677463531494, + "step": 1230 + }, + { + "epoch": 0.29750479846449135, + "grad_norm": 11.089930522236687, + "learning_rate": 4.429242724694338e-07, + "logits/chosen": -0.8699033856391907, + "logits/rejected": -0.8417544364929199, + "logps/chosen": -404.20159912109375, + "logps/rejected": -516.6590576171875, + "loss": 0.5064, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.583553433418274, + "rewards/margins": 0.9518542289733887, + "rewards/rejected": -2.535407543182373, + "step": 1240 + }, + { + "epoch": 0.29990403071017274, + "grad_norm": 11.987389579927841, + "learning_rate": 4.4158586184122817e-07, + "logits/chosen": -0.8027983903884888, + "logits/rejected": -0.8380182981491089, + "logps/chosen": -424.4278259277344, + "logps/rejected": -504.82476806640625, + "loss": 0.4858, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.338322401046753, + "rewards/margins": 1.1540087461471558, + "rewards/rejected": -2.4923312664031982, + "step": 1250 + }, + { + "epoch": 0.30230326295585414, + "grad_norm": 11.726927303441652, + "learning_rate": 4.4023401222597443e-07, + "logits/chosen": -0.7710140943527222, + "logits/rejected": -0.8672100901603699, + "logps/chosen": -440.61566162109375, + "logps/rejected": -509.686767578125, + "loss": 0.4782, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5568013191223145, + "rewards/margins": 0.9064668416976929, + "rewards/rejected": -2.463268518447876, + "step": 1260 + }, + { + "epoch": 0.30470249520153553, + "grad_norm": 13.87871055626238, + "learning_rate": 4.3886881845055235e-07, + "logits/chosen": -0.8198641538619995, + "logits/rejected": -0.8790807723999023, + "logps/chosen": -392.24658203125, + "logps/rejected": -504.7674255371094, + "loss": 0.4534, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4889254570007324, + "rewards/margins": 1.196481466293335, + "rewards/rejected": -2.6854069232940674, + "step": 1270 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 10.998393295453823, + "learning_rate": 4.374903762778814e-07, + "logits/chosen": -0.8656896352767944, + "logits/rejected": -0.8914599418640137, + "logps/chosen": -444.3971252441406, + "logps/rejected": -520.9356689453125, + "loss": 0.4696, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8613601922988892, + "rewards/margins": 1.0016160011291504, + "rewards/rejected": -2.862975835800171, + "step": 1280 + }, + { + "epoch": 0.30950095969289826, + "grad_norm": 12.50531025670152, + "learning_rate": 4.3609878240020356e-07, + "logits/chosen": -0.8085900545120239, + "logits/rejected": -0.8679038882255554, + "logps/chosen": -484.76251220703125, + "logps/rejected": -533.459716796875, + "loss": 0.4863, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8432880640029907, + "rewards/margins": 1.1199102401733398, + "rewards/rejected": -2.963197946548462, + "step": 1290 + }, + { + "epoch": 0.31190019193857965, + "grad_norm": 11.511964627048417, + "learning_rate": 4.346941344323005e-07, + "logits/chosen": -0.8386822938919067, + "logits/rejected": -0.9056866765022278, + "logps/chosen": -432.3689880371094, + "logps/rejected": -450.2909240722656, + "loss": 0.5441, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9001528024673462, + "rewards/margins": 0.6564325094223022, + "rewards/rejected": -2.5565853118896484, + "step": 1300 + }, + { + "epoch": 0.31429942418426104, + "grad_norm": 11.097528659174904, + "learning_rate": 4.332765309046467e-07, + "logits/chosen": -0.7923992276191711, + "logits/rejected": -0.7886919379234314, + "logps/chosen": -419.7078552246094, + "logps/rejected": -483.94049072265625, + "loss": 0.5005, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4908955097198486, + "rewards/margins": 1.0034749507904053, + "rewards/rejected": -2.494370460510254, + "step": 1310 + }, + { + "epoch": 0.31669865642994244, + "grad_norm": 15.474416362716356, + "learning_rate": 4.3184607125649754e-07, + "logits/chosen": -0.8138014078140259, + "logits/rejected": -0.8160893321037292, + "logps/chosen": -396.61572265625, + "logps/rejected": -518.4467163085938, + "loss": 0.5069, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2408325672149658, + "rewards/margins": 1.0694630146026611, + "rewards/rejected": -2.310295581817627, + "step": 1320 + }, + { + "epoch": 0.3190978886756238, + "grad_norm": 10.27191089173125, + "learning_rate": 4.304028558289141e-07, + "logits/chosen": -0.8450434803962708, + "logits/rejected": -0.8716680407524109, + "logps/chosen": -392.87164306640625, + "logps/rejected": -477.4569396972656, + "loss": 0.4715, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1086870431900024, + "rewards/margins": 1.0651975870132446, + "rewards/rejected": -2.173884630203247, + "step": 1330 + }, + { + "epoch": 0.32149712092130517, + "grad_norm": 9.036616057089963, + "learning_rate": 4.28946985857725e-07, + "logits/chosen": -0.7608897686004639, + "logits/rejected": -0.7740424871444702, + "logps/chosen": -430.8748474121094, + "logps/rejected": -552.3256225585938, + "loss": 0.463, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5485337972640991, + "rewards/margins": 1.306312084197998, + "rewards/rejected": -2.8548457622528076, + "step": 1340 + }, + { + "epoch": 0.32389635316698656, + "grad_norm": 11.634362526668829, + "learning_rate": 4.2747856346642445e-07, + "logits/chosen": -0.8262165188789368, + "logits/rejected": -0.8555091023445129, + "logps/chosen": -375.15472412109375, + "logps/rejected": -471.82293701171875, + "loss": 0.4726, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4888992309570312, + "rewards/margins": 1.016867995262146, + "rewards/rejected": -2.505767345428467, + "step": 1350 + }, + { + "epoch": 0.32629558541266795, + "grad_norm": 13.398876290373195, + "learning_rate": 4.2599769165900933e-07, + "logits/chosen": -0.7887164950370789, + "logits/rejected": -0.8400223851203918, + "logps/chosen": -464.3109436035156, + "logps/rejected": -523.8682861328125, + "loss": 0.5048, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0703892707824707, + "rewards/margins": 0.833997905254364, + "rewards/rejected": -2.9043872356414795, + "step": 1360 + }, + { + "epoch": 0.32869481765834935, + "grad_norm": 10.664824987437187, + "learning_rate": 4.245044743127535e-07, + "logits/chosen": -0.9320866465568542, + "logits/rejected": -0.9028736352920532, + "logps/chosen": -407.45880126953125, + "logps/rejected": -506.1048889160156, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5945441722869873, + "rewards/margins": 0.8470728993415833, + "rewards/rejected": -2.441617250442505, + "step": 1370 + }, + { + "epoch": 0.3310940499040307, + "grad_norm": 14.46708847771426, + "learning_rate": 4.229990161709214e-07, + "logits/chosen": -0.7717675566673279, + "logits/rejected": -0.7377297282218933, + "logps/chosen": -367.9638977050781, + "logps/rejected": -525.7390747070312, + "loss": 0.5272, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.408815622329712, + "rewards/margins": 1.3415130376815796, + "rewards/rejected": -2.750328540802002, + "step": 1380 + }, + { + "epoch": 0.3334932821497121, + "grad_norm": 8.175303654043583, + "learning_rate": 4.214814228354204e-07, + "logits/chosen": -0.7827272415161133, + "logits/rejected": -0.7984440326690674, + "logps/chosen": -451.130126953125, + "logps/rejected": -563.998046875, + "loss": 0.4767, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7949268817901611, + "rewards/margins": 1.3159399032592773, + "rewards/rejected": -3.1108667850494385, + "step": 1390 + }, + { + "epoch": 0.33589251439539347, + "grad_norm": 12.456155197274827, + "learning_rate": 4.1995180075939375e-07, + "logits/chosen": -0.8594837188720703, + "logits/rejected": -0.8565725088119507, + "logps/chosen": -457.4839782714844, + "logps/rejected": -527.7620239257812, + "loss": 0.4705, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6926393508911133, + "rewards/margins": 0.9544218182563782, + "rewards/rejected": -2.647061347961426, + "step": 1400 + }, + { + "epoch": 0.33829174664107486, + "grad_norm": 10.492615269241377, + "learning_rate": 4.1841025723975297e-07, + "logits/chosen": -0.8039811849594116, + "logits/rejected": -0.809655487537384, + "logps/chosen": -395.67999267578125, + "logps/rejected": -496.8030700683594, + "loss": 0.4625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0631120204925537, + "rewards/margins": 1.2070002555847168, + "rewards/rejected": -2.2701122760772705, + "step": 1410 + }, + { + "epoch": 0.34069097888675626, + "grad_norm": 11.628088236483801, + "learning_rate": 4.168569004096516e-07, + "logits/chosen": -0.7779537439346313, + "logits/rejected": -0.732982337474823, + "logps/chosen": -365.4278259277344, + "logps/rejected": -518.9385986328125, + "loss": 0.4622, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3053802251815796, + "rewards/margins": 1.3282335996627808, + "rewards/rejected": -2.6336138248443604, + "step": 1420 + }, + { + "epoch": 0.3430902111324376, + "grad_norm": 9.810034255281902, + "learning_rate": 4.152918392308997e-07, + "logits/chosen": -0.9322064518928528, + "logits/rejected": -0.9035415649414062, + "logps/chosen": -429.0191345214844, + "logps/rejected": -481.86846923828125, + "loss": 0.4737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8905305862426758, + "rewards/margins": 0.6541526317596436, + "rewards/rejected": -2.5446829795837402, + "step": 1430 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 16.302876025790795, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": -0.8372869491577148, + "logits/rejected": -0.7931715250015259, + "logps/chosen": -425.23651123046875, + "logps/rejected": -571.8636474609375, + "loss": 0.5414, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9143438339233398, + "rewards/margins": 1.09660804271698, + "rewards/rejected": -3.0109522342681885, + "step": 1440 + }, + { + "epoch": 0.3478886756238004, + "grad_norm": 14.266445268878716, + "learning_rate": 4.121270437720526e-07, + "logits/chosen": -0.7531959414482117, + "logits/rejected": -0.7015701532363892, + "logps/chosen": -415.9129943847656, + "logps/rejected": -544.4853515625, + "loss": 0.4963, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1056931018829346, + "rewards/margins": 0.7997118234634399, + "rewards/rejected": -2.905405044555664, + "step": 1450 + }, + { + "epoch": 0.3502879078694818, + "grad_norm": 8.666123707311465, + "learning_rate": 4.105275314897852e-07, + "logits/chosen": -0.8094059228897095, + "logits/rejected": -0.8042441606521606, + "logps/chosen": -387.4685363769531, + "logps/rejected": -575.4444580078125, + "loss": 0.497, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7935253381729126, + "rewards/margins": 1.4926183223724365, + "rewards/rejected": -3.2861435413360596, + "step": 1460 + }, + { + "epoch": 0.35268714011516317, + "grad_norm": 10.639348735955451, + "learning_rate": 4.089167588389508e-07, + "logits/chosen": -0.7043929100036621, + "logits/rejected": -0.7534819841384888, + "logps/chosen": -515.1029663085938, + "logps/rejected": -574.6419067382812, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7564353942871094, + "rewards/margins": 1.080705165863037, + "rewards/rejected": -2.8371405601501465, + "step": 1470 + }, + { + "epoch": 0.3550863723608445, + "grad_norm": 13.106435290020489, + "learning_rate": 4.072948388088515e-07, + "logits/chosen": -0.6628540754318237, + "logits/rejected": -0.6470414400100708, + "logps/chosen": -432.12005615234375, + "logps/rejected": -549.7205200195312, + "loss": 0.4851, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.650691270828247, + "rewards/margins": 1.125057578086853, + "rewards/rejected": -2.7757484912872314, + "step": 1480 + }, + { + "epoch": 0.3574856046065259, + "grad_norm": 13.06259828621992, + "learning_rate": 4.056618851707334e-07, + "logits/chosen": -0.6907030344009399, + "logits/rejected": -0.7093620300292969, + "logps/chosen": -397.7687683105469, + "logps/rejected": -526.349853515625, + "loss": 0.4663, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3325411081314087, + "rewards/margins": 1.221077561378479, + "rewards/rejected": -2.553618907928467, + "step": 1490 + }, + { + "epoch": 0.3598848368522073, + "grad_norm": 12.917502818953556, + "learning_rate": 4.0401801246980675e-07, + "logits/chosen": -0.8259037137031555, + "logits/rejected": -0.8357691764831543, + "logps/chosen": -384.6356506347656, + "logps/rejected": -453.1322326660156, + "loss": 0.4882, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7201900482177734, + "rewards/margins": 0.8327716588973999, + "rewards/rejected": -2.552961587905884, + "step": 1500 + }, + { + "epoch": 0.3622840690978887, + "grad_norm": 12.433094406122816, + "learning_rate": 4.0236333601721043e-07, + "logits/chosen": -0.7449339628219604, + "logits/rejected": -0.7388188242912292, + "logps/chosen": -460.6793518066406, + "logps/rejected": -544.5335693359375, + "loss": 0.5044, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6877784729003906, + "rewards/margins": 0.7297952175140381, + "rewards/rejected": -2.417573928833008, + "step": 1510 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 13.800192452791908, + "learning_rate": 4.0069797188192364e-07, + "logits/chosen": -0.7582114934921265, + "logits/rejected": -0.742210328578949, + "logps/chosen": -439.11114501953125, + "logps/rejected": -521.8330688476562, + "loss": 0.4944, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5508705377578735, + "rewards/margins": 1.0782378911972046, + "rewards/rejected": -2.6291086673736572, + "step": 1520 + }, + { + "epoch": 0.3670825335892514, + "grad_norm": 13.571364318064191, + "learning_rate": 3.9902203688262417e-07, + "logits/chosen": -0.721932590007782, + "logits/rejected": -0.7364694476127625, + "logps/chosen": -405.0948791503906, + "logps/rejected": -501.7353515625, + "loss": 0.4661, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3923845291137695, + "rewards/margins": 1.094702959060669, + "rewards/rejected": -2.4870872497558594, + "step": 1530 + }, + { + "epoch": 0.3694817658349328, + "grad_norm": 11.011970777974243, + "learning_rate": 3.9733564857949365e-07, + "logits/chosen": -0.6986292600631714, + "logits/rejected": -0.6912825703620911, + "logps/chosen": -500.9569396972656, + "logps/rejected": -553.8840942382812, + "loss": 0.4869, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.857325553894043, + "rewards/margins": 0.9566340446472168, + "rewards/rejected": -2.8139595985412598, + "step": 1540 + }, + { + "epoch": 0.3718809980806142, + "grad_norm": 10.905693632178256, + "learning_rate": 3.9563892526597177e-07, + "logits/chosen": -0.7152280807495117, + "logits/rejected": -0.6881515383720398, + "logps/chosen": -388.0267333984375, + "logps/rejected": -521.2376708984375, + "loss": 0.4572, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6515610218048096, + "rewards/margins": 0.8464974164962769, + "rewards/rejected": -2.498058319091797, + "step": 1550 + }, + { + "epoch": 0.3742802303262956, + "grad_norm": 10.08743827300553, + "learning_rate": 3.9393198596045795e-07, + "logits/chosen": -0.7806371450424194, + "logits/rejected": -0.76411372423172, + "logps/chosen": -399.4765930175781, + "logps/rejected": -516.7181396484375, + "loss": 0.5182, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7115662097930908, + "rewards/margins": 0.941506028175354, + "rewards/rejected": -2.6530721187591553, + "step": 1560 + }, + { + "epoch": 0.376679462571977, + "grad_norm": 8.868869870998214, + "learning_rate": 3.922149503979628e-07, + "logits/chosen": -0.6893107295036316, + "logits/rejected": -0.7296844124794006, + "logps/chosen": -438.3675231933594, + "logps/rejected": -609.4227905273438, + "loss": 0.4809, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.628042221069336, + "rewards/margins": 1.6386429071426392, + "rewards/rejected": -3.2666850090026855, + "step": 1570 + }, + { + "epoch": 0.3790786948176583, + "grad_norm": 13.078715942181777, + "learning_rate": 3.904879390217095e-07, + "logits/chosen": -0.799870491027832, + "logits/rejected": -0.8303624987602234, + "logps/chosen": -410.641845703125, + "logps/rejected": -479.2076721191406, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6100763082504272, + "rewards/margins": 0.8601690530776978, + "rewards/rejected": -2.470245361328125, + "step": 1580 + }, + { + "epoch": 0.3814779270633397, + "grad_norm": 12.71197235549074, + "learning_rate": 3.8875107297468463e-07, + "logits/chosen": -0.7600913643836975, + "logits/rejected": -0.7607609033584595, + "logps/chosen": -388.62042236328125, + "logps/rejected": -583.7432250976562, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4480386972427368, + "rewards/margins": 1.5103285312652588, + "rewards/rejected": -2.958367109298706, + "step": 1590 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 13.466303880551026, + "learning_rate": 3.87004474091141e-07, + "logits/chosen": -0.6408634781837463, + "logits/rejected": -0.6452223062515259, + "logps/chosen": -373.9280700683594, + "logps/rejected": -497.0530700683594, + "loss": 0.4874, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4634227752685547, + "rewards/margins": 1.0529407262802124, + "rewards/rejected": -2.5163636207580566, + "step": 1600 + }, + { + "epoch": 0.3862763915547025, + "grad_norm": 12.405090587714351, + "learning_rate": 3.8524826488805114e-07, + "logits/chosen": -0.812592625617981, + "logits/rejected": -0.774621844291687, + "logps/chosen": -457.66961669921875, + "logps/rejected": -521.51904296875, + "loss": 0.5016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.722423791885376, + "rewards/margins": 1.091522455215454, + "rewards/rejected": -2.81394624710083, + "step": 1610 + }, + { + "epoch": 0.3886756238003839, + "grad_norm": 10.910380188178692, + "learning_rate": 3.834825685565133e-07, + "logits/chosen": -0.8149593472480774, + "logits/rejected": -0.8616162538528442, + "logps/chosen": -388.2657165527344, + "logps/rejected": -452.9178161621094, + "loss": 0.4461, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5048949718475342, + "rewards/margins": 1.0155197381973267, + "rewards/rejected": -2.5204145908355713, + "step": 1620 + }, + { + "epoch": 0.39107485604606523, + "grad_norm": 12.461150034624684, + "learning_rate": 3.8170750895311007e-07, + "logits/chosen": -0.7702925205230713, + "logits/rejected": -0.7936859726905823, + "logps/chosen": -419.08270263671875, + "logps/rejected": -508.16571044921875, + "loss": 0.4739, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3406755924224854, + "rewards/margins": 1.148717999458313, + "rewards/rejected": -2.489393711090088, + "step": 1630 + }, + { + "epoch": 0.3934740882917466, + "grad_norm": 10.482680213963254, + "learning_rate": 3.7992321059122045e-07, + "logits/chosen": -0.7163397073745728, + "logits/rejected": -0.7596901059150696, + "logps/chosen": -418.07147216796875, + "logps/rejected": -498.05078125, + "loss": 0.4946, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8541247844696045, + "rewards/margins": 0.9185699224472046, + "rewards/rejected": -2.7726948261260986, + "step": 1640 + }, + { + "epoch": 0.395873320537428, + "grad_norm": 12.021180525729404, + "learning_rate": 3.7812979863228576e-07, + "logits/chosen": -0.8531166911125183, + "logits/rejected": -0.8554477691650391, + "logps/chosen": -381.1147155761719, + "logps/rejected": -508.91668701171875, + "loss": 0.4537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7566955089569092, + "rewards/margins": 1.1585729122161865, + "rewards/rejected": -2.9152684211730957, + "step": 1650 + }, + { + "epoch": 0.3982725527831094, + "grad_norm": 12.979308903163362, + "learning_rate": 3.763273988770296e-07, + "logits/chosen": -0.7346752285957336, + "logits/rejected": -0.7792466878890991, + "logps/chosen": -394.4084167480469, + "logps/rejected": -528.5557861328125, + "loss": 0.455, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4199392795562744, + "rewards/margins": 1.3389031887054443, + "rewards/rejected": -2.7588424682617188, + "step": 1660 + }, + { + "epoch": 0.4006717850287908, + "grad_norm": 12.280364894112322, + "learning_rate": 3.7451613775663405e-07, + "logits/chosen": -0.7985413670539856, + "logits/rejected": -0.7544962763786316, + "logps/chosen": -410.6094665527344, + "logps/rejected": -567.0599365234375, + "loss": 0.4986, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7071869373321533, + "rewards/margins": 1.444684624671936, + "rewards/rejected": -3.1518714427948, + "step": 1670 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 11.808414350786867, + "learning_rate": 3.726961423238706e-07, + "logits/chosen": -0.8868053555488586, + "logits/rejected": -0.9050809741020203, + "logps/chosen": -378.8218078613281, + "logps/rejected": -536.3219604492188, + "loss": 0.4686, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5447314977645874, + "rewards/margins": 1.3333966732025146, + "rewards/rejected": -2.8781278133392334, + "step": 1680 + }, + { + "epoch": 0.40547024952015354, + "grad_norm": 12.787716674975018, + "learning_rate": 3.708675402441882e-07, + "logits/chosen": -0.7429116368293762, + "logits/rejected": -0.7805765867233276, + "logps/chosen": -458.7632751464844, + "logps/rejected": -516.9869384765625, + "loss": 0.4928, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7429218292236328, + "rewards/margins": 0.9082300066947937, + "rewards/rejected": -2.6511518955230713, + "step": 1690 + }, + { + "epoch": 0.40786948176583493, + "grad_norm": 10.598422088340676, + "learning_rate": 3.6903045978675775e-07, + "logits/chosen": -0.7389672994613647, + "logits/rejected": -0.7964872717857361, + "logps/chosen": -411.5867614746094, + "logps/rejected": -550.23876953125, + "loss": 0.4801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7794177532196045, + "rewards/margins": 1.554722785949707, + "rewards/rejected": -3.3341403007507324, + "step": 1700 + }, + { + "epoch": 0.4102687140115163, + "grad_norm": 10.761090409994697, + "learning_rate": 3.6718502981547474e-07, + "logits/chosen": -0.7715443968772888, + "logits/rejected": -0.7918425798416138, + "logps/chosen": -430.85003662109375, + "logps/rejected": -561.6787109375, + "loss": 0.4899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.676509141921997, + "rewards/margins": 0.8913475871086121, + "rewards/rejected": -2.567856788635254, + "step": 1710 + }, + { + "epoch": 0.4126679462571977, + "grad_norm": 9.729723638899472, + "learning_rate": 3.6533137977991986e-07, + "logits/chosen": -0.755111575126648, + "logits/rejected": -0.7594307065010071, + "logps/chosen": -430.3711853027344, + "logps/rejected": -534.5823974609375, + "loss": 0.5115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5598688125610352, + "rewards/margins": 0.8007798194885254, + "rewards/rejected": -2.3606486320495605, + "step": 1720 + }, + { + "epoch": 0.41506717850287905, + "grad_norm": 9.372402487840775, + "learning_rate": 3.6346963970627865e-07, + "logits/chosen": -0.6953638195991516, + "logits/rejected": -0.6692907214164734, + "logps/chosen": -390.3316955566406, + "logps/rejected": -514.3018798828125, + "loss": 0.4468, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3828855752944946, + "rewards/margins": 1.066463828086853, + "rewards/rejected": -2.4493489265441895, + "step": 1730 + }, + { + "epoch": 0.41746641074856045, + "grad_norm": 11.954691591503995, + "learning_rate": 3.615999401882207e-07, + "logits/chosen": -0.8805049657821655, + "logits/rejected": -0.8518358469009399, + "logps/chosen": -412.022216796875, + "logps/rejected": -552.1925659179688, + "loss": 0.4814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9677274227142334, + "rewards/margins": 1.194000005722046, + "rewards/rejected": -3.1617274284362793, + "step": 1740 + }, + { + "epoch": 0.41986564299424184, + "grad_norm": 11.021704456258432, + "learning_rate": 3.597224123777389e-07, + "logits/chosen": -0.7318686246871948, + "logits/rejected": -0.7280600666999817, + "logps/chosen": -430.3202209472656, + "logps/rejected": -572.8101806640625, + "loss": 0.4749, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8212677240371704, + "rewards/margins": 1.2428979873657227, + "rewards/rejected": -3.0641655921936035, + "step": 1750 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 10.015819519130572, + "learning_rate": 3.5783718797595e-07, + "logits/chosen": -0.8521868586540222, + "logits/rejected": -0.8556682467460632, + "logps/chosen": -473.8487243652344, + "logps/rejected": -544.9535522460938, + "loss": 0.4797, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8864997625350952, + "rewards/margins": 1.0899362564086914, + "rewards/rejected": -2.976435661315918, + "step": 1760 + }, + { + "epoch": 0.4246641074856046, + "grad_norm": 11.86534189148174, + "learning_rate": 3.559443992238558e-07, + "logits/chosen": -0.7805435061454773, + "logits/rejected": -0.8231045007705688, + "logps/chosen": -400.96697998046875, + "logps/rejected": -577.6618041992188, + "loss": 0.4997, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5335006713867188, + "rewards/margins": 1.4942717552185059, + "rewards/rejected": -3.0277724266052246, + "step": 1770 + }, + { + "epoch": 0.42706333973128596, + "grad_norm": 10.141163144169631, + "learning_rate": 3.540441788930673e-07, + "logits/chosen": -0.7176542282104492, + "logits/rejected": -0.7344351410865784, + "logps/chosen": -456.525146484375, + "logps/rejected": -563.7665405273438, + "loss": 0.4548, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6193135976791382, + "rewards/margins": 1.4220505952835083, + "rewards/rejected": -3.0413641929626465, + "step": 1780 + }, + { + "epoch": 0.42946257197696736, + "grad_norm": 12.961054235399837, + "learning_rate": 3.5213666027649123e-07, + "logits/chosen": -0.7940319180488586, + "logits/rejected": -0.8246362805366516, + "logps/chosen": -489.11004638671875, + "logps/rejected": -525.5428466796875, + "loss": 0.4772, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1909642219543457, + "rewards/margins": 0.7601931691169739, + "rewards/rejected": -2.951157808303833, + "step": 1790 + }, + { + "epoch": 0.43186180422264875, + "grad_norm": 11.84758063846055, + "learning_rate": 3.5022197717898017e-07, + "logits/chosen": -0.8181630969047546, + "logits/rejected": -0.8500107526779175, + "logps/chosen": -393.4739685058594, + "logps/rejected": -486.96917724609375, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7599443197250366, + "rewards/margins": 1.216341257095337, + "rewards/rejected": -2.976285457611084, + "step": 1800 + }, + { + "epoch": 0.43426103646833014, + "grad_norm": 10.86372215583825, + "learning_rate": 3.4830026390794633e-07, + "logits/chosen": -0.8050792813301086, + "logits/rejected": -0.8329674005508423, + "logps/chosen": -507.2084045410156, + "logps/rejected": -583.702880859375, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1328043937683105, + "rewards/margins": 1.2732837200164795, + "rewards/rejected": -3.406088352203369, + "step": 1810 + }, + { + "epoch": 0.43666026871401153, + "grad_norm": 7.738248672410445, + "learning_rate": 3.4637165526394104e-07, + "logits/chosen": -0.7997580766677856, + "logits/rejected": -0.8064180612564087, + "logps/chosen": -423.777587890625, + "logps/rejected": -542.863525390625, + "loss": 0.4725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9629684686660767, + "rewards/margins": 1.0524226427078247, + "rewards/rejected": -3.0153908729553223, + "step": 1820 + }, + { + "epoch": 0.43905950095969287, + "grad_norm": 11.736843534921078, + "learning_rate": 3.4443628653119814e-07, + "logits/chosen": -0.7079404592514038, + "logits/rejected": -0.7036377191543579, + "logps/chosen": -483.62896728515625, + "logps/rejected": -678.022216796875, + "loss": 0.4814, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1326420307159424, + "rewards/margins": 1.5004966259002686, + "rewards/rejected": -3.633139133453369, + "step": 1830 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 10.785415108021422, + "learning_rate": 3.424942934681453e-07, + "logits/chosen": -0.8217443227767944, + "logits/rejected": -0.8721915483474731, + "logps/chosen": -385.87860107421875, + "logps/rejected": -556.9625854492188, + "loss": 0.4584, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4244273900985718, + "rewards/margins": 1.7936569452285767, + "rewards/rejected": -3.2180843353271484, + "step": 1840 + }, + { + "epoch": 0.44385796545105566, + "grad_norm": 14.588516416025033, + "learning_rate": 3.405458122978804e-07, + "logits/chosen": -0.8775800466537476, + "logits/rejected": -0.8566424250602722, + "logps/chosen": -458.1707458496094, + "logps/rejected": -545.3057861328125, + "loss": 0.4726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.7237602472305298, + "rewards/margins": 1.154767632484436, + "rewards/rejected": -2.878527879714966, + "step": 1850 + }, + { + "epoch": 0.44625719769673705, + "grad_norm": 13.706269723531113, + "learning_rate": 3.3859097969861633e-07, + "logits/chosen": -0.7905477285385132, + "logits/rejected": -0.7530331015586853, + "logps/chosen": -460.21728515625, + "logps/rejected": -553.216796875, + "loss": 0.4504, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8379026651382446, + "rewards/margins": 1.2619093656539917, + "rewards/rejected": -3.0998120307922363, + "step": 1860 + }, + { + "epoch": 0.44865642994241844, + "grad_norm": 13.3769074850146, + "learning_rate": 3.366299327940936e-07, + "logits/chosen": -0.8109074831008911, + "logits/rejected": -0.773389458656311, + "logps/chosen": -470.64947509765625, + "logps/rejected": -587.9104614257812, + "loss": 0.4834, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7903703451156616, + "rewards/margins": 0.9450393915176392, + "rewards/rejected": -2.7354094982147217, + "step": 1870 + }, + { + "epoch": 0.4510556621880998, + "grad_norm": 11.753551278940575, + "learning_rate": 3.3466280914396117e-07, + "logits/chosen": -0.773266077041626, + "logits/rejected": -0.7779923677444458, + "logps/chosen": -402.0985412597656, + "logps/rejected": -570.60693359375, + "loss": 0.4764, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6223961114883423, + "rewards/margins": 1.4883135557174683, + "rewards/rejected": -3.1107096672058105, + "step": 1880 + }, + { + "epoch": 0.4534548944337812, + "grad_norm": 14.233604576877488, + "learning_rate": 3.326897467341281e-07, + "logits/chosen": -0.8112883567810059, + "logits/rejected": -0.8336831331253052, + "logps/chosen": -351.8514404296875, + "logps/rejected": -480.0999450683594, + "loss": 0.474, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.529261827468872, + "rewards/margins": 1.1229501962661743, + "rewards/rejected": -2.652211904525757, + "step": 1890 + }, + { + "epoch": 0.45585412667946257, + "grad_norm": 13.300598272317474, + "learning_rate": 3.3071088396708335e-07, + "logits/chosen": -0.8645914196968079, + "logits/rejected": -0.8331616520881653, + "logps/chosen": -363.18206787109375, + "logps/rejected": -533.6971435546875, + "loss": 0.4819, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6752821207046509, + "rewards/margins": 1.4778271913528442, + "rewards/rejected": -3.153109550476074, + "step": 1900 + }, + { + "epoch": 0.45825335892514396, + "grad_norm": 12.640086996532164, + "learning_rate": 3.2872635965218824e-07, + "logits/chosen": -0.6304786205291748, + "logits/rejected": -0.6618056297302246, + "logps/chosen": -448.3641052246094, + "logps/rejected": -589.9913330078125, + "loss": 0.5109, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9804298877716064, + "rewards/margins": 1.1321611404418945, + "rewards/rejected": -3.112590789794922, + "step": 1910 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 9.986496934257348, + "learning_rate": 3.2673631299593905e-07, + "logits/chosen": -0.7311594486236572, + "logits/rejected": -0.8009797930717468, + "logps/chosen": -461.6552734375, + "logps/rejected": -577.480712890625, + "loss": 0.4668, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8996919393539429, + "rewards/margins": 1.237914800643921, + "rewards/rejected": -3.1376068592071533, + "step": 1920 + }, + { + "epoch": 0.4630518234165067, + "grad_norm": 13.218071975981442, + "learning_rate": 3.247408835922024e-07, + "logits/chosen": -0.7413262724876404, + "logits/rejected": -0.7506011724472046, + "logps/chosen": -488.389404296875, + "logps/rejected": -615.5206298828125, + "loss": 0.4835, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.845880150794983, + "rewards/margins": 1.2004640102386475, + "rewards/rejected": -3.046344041824341, + "step": 1930 + }, + { + "epoch": 0.4654510556621881, + "grad_norm": 12.635690431262567, + "learning_rate": 3.2274021141242306e-07, + "logits/chosen": -0.6942049860954285, + "logits/rejected": -0.7160819172859192, + "logps/chosen": -423.4578552246094, + "logps/rejected": -537.9940795898438, + "loss": 0.4495, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6584323644638062, + "rewards/margins": 1.0953803062438965, + "rewards/rejected": -2.753812551498413, + "step": 1940 + }, + { + "epoch": 0.4678502879078695, + "grad_norm": 15.405837310763367, + "learning_rate": 3.2073443679580613e-07, + "logits/chosen": -0.7654497027397156, + "logits/rejected": -0.7917548418045044, + "logps/chosen": -448.06378173828125, + "logps/rejected": -538.6315307617188, + "loss": 0.4705, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.71183180809021, + "rewards/margins": 0.9023053050041199, + "rewards/rejected": -2.614137649536133, + "step": 1950 + }, + { + "epoch": 0.47024952015355087, + "grad_norm": 9.857507863821315, + "learning_rate": 3.1872370043947194e-07, + "logits/chosen": -0.8800600171089172, + "logits/rejected": -0.888513445854187, + "logps/chosen": -407.2889709472656, + "logps/rejected": -570.4630126953125, + "loss": 0.4494, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3753950595855713, + "rewards/margins": 1.693584680557251, + "rewards/rejected": -3.0689799785614014, + "step": 1960 + }, + { + "epoch": 0.47264875239923226, + "grad_norm": 11.49826557648692, + "learning_rate": 3.167081433885874e-07, + "logits/chosen": -0.6018909215927124, + "logits/rejected": -0.6167675852775574, + "logps/chosen": -508.90655517578125, + "logps/rejected": -644.7164916992188, + "loss": 0.4066, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8501713275909424, + "rewards/margins": 0.9607030749320984, + "rewards/rejected": -2.8108744621276855, + "step": 1970 + }, + { + "epoch": 0.4750479846449136, + "grad_norm": 13.29052258173837, + "learning_rate": 3.14687907026472e-07, + "logits/chosen": -0.6973208785057068, + "logits/rejected": -0.7304965257644653, + "logps/chosen": -395.8529357910156, + "logps/rejected": -551.4783935546875, + "loss": 0.4563, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6641004085540771, + "rewards/margins": 1.4298467636108398, + "rewards/rejected": -3.093946933746338, + "step": 1980 + }, + { + "epoch": 0.477447216890595, + "grad_norm": 12.240475995337231, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -0.699920117855072, + "logits/rejected": -0.7062256932258606, + "logps/chosen": -519.5510864257812, + "logps/rejected": -607.6595458984375, + "loss": 0.4849, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1099679470062256, + "rewards/margins": 0.9010626077651978, + "rewards/rejected": -3.011030673980713, + "step": 1990 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 13.241550075194965, + "learning_rate": 3.1063396353306097e-07, + "logits/chosen": -0.726953387260437, + "logits/rejected": -0.788418173789978, + "logps/chosen": -436.1697692871094, + "logps/rejected": -513.4632568359375, + "loss": 0.4416, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6597273349761963, + "rewards/margins": 1.236422061920166, + "rewards/rejected": -2.896149158477783, + "step": 2000 + }, + { + "epoch": 0.4798464491362764, + "eval_logits/chosen": -0.7618333697319031, + "eval_logits/rejected": -0.7765002846717834, + "eval_logps/chosen": -424.0602111816406, + "eval_logps/rejected": -575.2555541992188, + "eval_loss": 0.45312055945396423, + "eval_rewards/accuracies": 0.8160714507102966, + "eval_rewards/chosen": -1.7141555547714233, + "eval_rewards/margins": 1.430633306503296, + "eval_rewards/rejected": -3.144789457321167, + "eval_runtime": 233.581, + "eval_samples_per_second": 19.098, + "eval_steps_per_second": 0.3, + "step": 2000 + }, + { + "epoch": 0.4822456813819578, + "grad_norm": 11.1357246980386, + "learning_rate": 3.0860054076979535e-07, + "logits/chosen": -0.7470555901527405, + "logits/rejected": -0.74172443151474, + "logps/chosen": -460.82867431640625, + "logps/rejected": -552.6365356445312, + "loss": 0.4743, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9067308902740479, + "rewards/margins": 1.165321946144104, + "rewards/rejected": -3.0720529556274414, + "step": 2010 + }, + { + "epoch": 0.4846449136276392, + "grad_norm": 17.785449989220815, + "learning_rate": 3.065630074114115e-07, + "logits/chosen": -0.7443466782569885, + "logits/rejected": -0.762556254863739, + "logps/chosen": -465.6786193847656, + "logps/rejected": -580.1879272460938, + "loss": 0.4669, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7899072170257568, + "rewards/margins": 1.538434624671936, + "rewards/rejected": -3.3283417224884033, + "step": 2020 + }, + { + "epoch": 0.4870441458733205, + "grad_norm": 16.171594176924803, + "learning_rate": 3.0452150638277947e-07, + "logits/chosen": -0.6747657060623169, + "logits/rejected": -0.6558694839477539, + "logps/chosen": -392.1339416503906, + "logps/rejected": -500.20025634765625, + "loss": 0.5029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.720434546470642, + "rewards/margins": 0.9752155542373657, + "rewards/rejected": -2.695650339126587, + "step": 2030 + }, + { + "epoch": 0.4894433781190019, + "grad_norm": 8.227849718874795, + "learning_rate": 3.024761808870856e-07, + "logits/chosen": -0.7772229313850403, + "logits/rejected": -0.7858240008354187, + "logps/chosen": -381.36236572265625, + "logps/rejected": -597.6624145507812, + "loss": 0.4504, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.510353684425354, + "rewards/margins": 2.1284565925598145, + "rewards/rejected": -3.6388099193573, + "step": 2040 + }, + { + "epoch": 0.4918426103646833, + "grad_norm": 20.051647299898132, + "learning_rate": 3.004271743957875e-07, + "logits/chosen": -0.6749522089958191, + "logits/rejected": -0.6905564069747925, + "logps/chosen": -484.854736328125, + "logps/rejected": -604.3311767578125, + "loss": 0.4945, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.175485134124756, + "rewards/margins": 0.9892854690551758, + "rewards/rejected": -3.1647706031799316, + "step": 2050 + }, + { + "epoch": 0.4942418426103647, + "grad_norm": 12.004150798678578, + "learning_rate": 2.983746306385499e-07, + "logits/chosen": -0.8129485845565796, + "logits/rejected": -0.7655819654464722, + "logps/chosen": -414.5586853027344, + "logps/rejected": -578.7470703125, + "loss": 0.4453, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7315165996551514, + "rewards/margins": 1.4258909225463867, + "rewards/rejected": -3.1574079990386963, + "step": 2060 + }, + { + "epoch": 0.4966410748560461, + "grad_norm": 11.308824836556166, + "learning_rate": 2.963186935931628e-07, + "logits/chosen": -0.7485088109970093, + "logits/rejected": -0.7199539542198181, + "logps/chosen": -450.9239196777344, + "logps/rejected": -583.173583984375, + "loss": 0.4482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.732980728149414, + "rewards/margins": 1.3427022695541382, + "rewards/rejected": -3.075683116912842, + "step": 2070 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 9.373923594426847, + "learning_rate": 2.9425950747544176e-07, + "logits/chosen": -0.6613216400146484, + "logits/rejected": -0.7013477683067322, + "logps/chosen": -505.81573486328125, + "logps/rejected": -642.0211181640625, + "loss": 0.4343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.986654281616211, + "rewards/margins": 1.6596873998641968, + "rewards/rejected": -3.646341323852539, + "step": 2080 + }, + { + "epoch": 0.5014395393474088, + "grad_norm": 15.749741167722831, + "learning_rate": 2.921972167291119e-07, + "logits/chosen": -0.7112385630607605, + "logits/rejected": -0.7526477575302124, + "logps/chosen": -436.2796325683594, + "logps/rejected": -586.3831787109375, + "loss": 0.4399, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5150210857391357, + "rewards/margins": 1.2452566623687744, + "rewards/rejected": -2.760277271270752, + "step": 2090 + }, + { + "epoch": 0.5038387715930902, + "grad_norm": 13.419312414930658, + "learning_rate": 2.9013196601567567e-07, + "logits/chosen": -0.7128900289535522, + "logits/rejected": -0.730756402015686, + "logps/chosen": -419.7554626464844, + "logps/rejected": -551.4823608398438, + "loss": 0.5299, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.718276023864746, + "rewards/margins": 1.1899365186691284, + "rewards/rejected": -2.908212661743164, + "step": 2100 + }, + { + "epoch": 0.5062380038387716, + "grad_norm": 9.651741347788393, + "learning_rate": 2.8806390020426555e-07, + "logits/chosen": -0.7773910760879517, + "logits/rejected": -0.7731062173843384, + "logps/chosen": -431.44317626953125, + "logps/rejected": -574.38720703125, + "loss": 0.4257, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.687414526939392, + "rewards/margins": 1.4306485652923584, + "rewards/rejected": -3.11806321144104, + "step": 2110 + }, + { + "epoch": 0.508637236084453, + "grad_norm": 13.48177862842598, + "learning_rate": 2.8599316436148187e-07, + "logits/chosen": -0.6940504312515259, + "logits/rejected": -0.6836844682693481, + "logps/chosen": -445.3380432128906, + "logps/rejected": -537.2091064453125, + "loss": 0.4344, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9495445489883423, + "rewards/margins": 0.9152109026908875, + "rewards/rejected": -2.864755153656006, + "step": 2120 + }, + { + "epoch": 0.5110364683301344, + "grad_norm": 13.289305185609427, + "learning_rate": 2.8391990374121723e-07, + "logits/chosen": -0.726272463798523, + "logits/rejected": -0.7124502062797546, + "logps/chosen": -440.59832763671875, + "logps/rejected": -601.8394775390625, + "loss": 0.4771, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0392353534698486, + "rewards/margins": 1.3474972248077393, + "rewards/rejected": -3.386732816696167, + "step": 2130 + }, + { + "epoch": 0.5134357005758158, + "grad_norm": 12.7036271351969, + "learning_rate": 2.818442637744669e-07, + "logits/chosen": -0.7679746747016907, + "logits/rejected": -0.7643837332725525, + "logps/chosen": -455.5375061035156, + "logps/rejected": -597.4730224609375, + "loss": 0.4634, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0901169776916504, + "rewards/margins": 1.3490521907806396, + "rewards/rejected": -3.439169406890869, + "step": 2140 + }, + { + "epoch": 0.5158349328214972, + "grad_norm": 10.384572302471735, + "learning_rate": 2.797663900591284e-07, + "logits/chosen": -0.7398999333381653, + "logits/rejected": -0.7659087181091309, + "logps/chosen": -447.51593017578125, + "logps/rejected": -535.8583984375, + "loss": 0.4256, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8267303705215454, + "rewards/margins": 1.21419358253479, + "rewards/rejected": -3.040923833847046, + "step": 2150 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 12.512282425185987, + "learning_rate": 2.776864283497874e-07, + "logits/chosen": -0.7339873909950256, + "logits/rejected": -0.7800690531730652, + "logps/chosen": -417.85107421875, + "logps/rejected": -591.3509521484375, + "loss": 0.4684, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8667776584625244, + "rewards/margins": 1.7389957904815674, + "rewards/rejected": -3.6057732105255127, + "step": 2160 + }, + { + "epoch": 0.5206333973128598, + "grad_norm": 9.865835212510957, + "learning_rate": 2.756045245474943e-07, + "logits/chosen": -0.6862331628799438, + "logits/rejected": -0.6869142651557922, + "logps/chosen": -441.952880859375, + "logps/rejected": -546.9207153320312, + "loss": 0.4634, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6532140970230103, + "rewards/margins": 0.8384655714035034, + "rewards/rejected": -2.4916796684265137, + "step": 2170 + }, + { + "epoch": 0.5230326295585412, + "grad_norm": 16.184183395491957, + "learning_rate": 2.7352082468952977e-07, + "logits/chosen": -0.7555044889450073, + "logits/rejected": -0.7893722057342529, + "logps/chosen": -431.03448486328125, + "logps/rejected": -618.13427734375, + "loss": 0.4812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9443515539169312, + "rewards/margins": 1.6091206073760986, + "rewards/rejected": -3.5534720420837402, + "step": 2180 + }, + { + "epoch": 0.5254318618042226, + "grad_norm": 14.810315270160304, + "learning_rate": 2.7143547493916e-07, + "logits/chosen": -0.7784820795059204, + "logits/rejected": -0.7843117117881775, + "logps/chosen": -398.50506591796875, + "logps/rejected": -604.5455322265625, + "loss": 0.4501, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4673595428466797, + "rewards/margins": 2.0520424842834473, + "rewards/rejected": -3.519402027130127, + "step": 2190 + }, + { + "epoch": 0.527831094049904, + "grad_norm": 14.870480901797459, + "learning_rate": 2.693486215753853e-07, + "logits/chosen": -0.7710849046707153, + "logits/rejected": -0.7755874395370483, + "logps/chosen": -432.92803955078125, + "logps/rejected": -596.2999877929688, + "loss": 0.467, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9311811923980713, + "rewards/margins": 1.8095871210098267, + "rewards/rejected": -3.7407684326171875, + "step": 2200 + }, + { + "epoch": 0.5302303262955854, + "grad_norm": 15.695210321976734, + "learning_rate": 2.6726041098267805e-07, + "logits/chosen": -0.8125902414321899, + "logits/rejected": -0.8185180425643921, + "logps/chosen": -507.8814392089844, + "logps/rejected": -558.718994140625, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2443294525146484, + "rewards/margins": 0.8388055562973022, + "rewards/rejected": -3.083134889602661, + "step": 2210 + }, + { + "epoch": 0.5326295585412668, + "grad_norm": 16.11416540421591, + "learning_rate": 2.6517098964071507e-07, + "logits/chosen": -0.7117936015129089, + "logits/rejected": -0.7254031896591187, + "logps/chosen": -454.80242919921875, + "logps/rejected": -549.0224609375, + "loss": 0.511, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9632365703582764, + "rewards/margins": 0.8028262257575989, + "rewards/rejected": -2.7660632133483887, + "step": 2220 + }, + { + "epoch": 0.5350287907869482, + "grad_norm": 13.538871078171768, + "learning_rate": 2.630805041141023e-07, + "logits/chosen": -0.7859424352645874, + "logits/rejected": -0.7693944573402405, + "logps/chosen": -402.9466857910156, + "logps/rejected": -582.7056884765625, + "loss": 0.4555, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7725486755371094, + "rewards/margins": 1.6191837787628174, + "rewards/rejected": -3.3917324542999268, + "step": 2230 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 14.707908117003884, + "learning_rate": 2.609891010420941e-07, + "logits/chosen": -0.8051595687866211, + "logits/rejected": -0.7927287817001343, + "logps/chosen": -450.76519775390625, + "logps/rejected": -590.6575317382812, + "loss": 0.4335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.906309723854065, + "rewards/margins": 1.394263744354248, + "rewards/rejected": -3.3005733489990234, + "step": 2240 + }, + { + "epoch": 0.539827255278311, + "grad_norm": 13.589410553380741, + "learning_rate": 2.5889692712830674e-07, + "logits/chosen": -0.7646986246109009, + "logits/rejected": -0.7824346423149109, + "logps/chosen": -391.0314025878906, + "logps/rejected": -513.9915771484375, + "loss": 0.4367, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.665148377418518, + "rewards/margins": 1.241813063621521, + "rewards/rejected": -2.90696120262146, + "step": 2250 + }, + { + "epoch": 0.5422264875239923, + "grad_norm": 13.492840980385337, + "learning_rate": 2.5680412913042843e-07, + "logits/chosen": -0.7536166906356812, + "logits/rejected": -0.7371748685836792, + "logps/chosen": -442.3362731933594, + "logps/rejected": -595.1715087890625, + "loss": 0.4245, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0182454586029053, + "rewards/margins": 1.4872602224349976, + "rewards/rejected": -3.5055058002471924, + "step": 2260 + }, + { + "epoch": 0.5446257197696737, + "grad_norm": 14.894715683363456, + "learning_rate": 2.5471085384992404e-07, + "logits/chosen": -0.8191806674003601, + "logits/rejected": -0.806193470954895, + "logps/chosen": -411.95147705078125, + "logps/rejected": -647.9240112304688, + "loss": 0.421, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7768102884292603, + "rewards/margins": 2.2000908851623535, + "rewards/rejected": -3.9769012928009033, + "step": 2270 + }, + { + "epoch": 0.5470249520153551, + "grad_norm": 11.287950370775208, + "learning_rate": 2.526172481217381e-07, + "logits/chosen": -0.7717296481132507, + "logits/rejected": -0.7522517442703247, + "logps/chosen": -446.72607421875, + "logps/rejected": -596.8885498046875, + "loss": 0.4738, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3675601482391357, + "rewards/margins": 1.335943579673767, + "rewards/rejected": -3.703503370285034, + "step": 2280 + }, + { + "epoch": 0.5494241842610365, + "grad_norm": 17.56101375995895, + "learning_rate": 2.5052345880399456e-07, + "logits/chosen": -0.8398829698562622, + "logits/rejected": -0.8727308511734009, + "logps/chosen": -449.156982421875, + "logps/rejected": -570.5628662109375, + "loss": 0.434, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.23502779006958, + "rewards/margins": 1.1653441190719604, + "rewards/rejected": -3.400371551513672, + "step": 2290 + }, + { + "epoch": 0.5518234165067178, + "grad_norm": 11.302567518557918, + "learning_rate": 2.4842963276769555e-07, + "logits/chosen": -0.7016631364822388, + "logits/rejected": -0.6639502048492432, + "logps/chosen": -437.28399658203125, + "logps/rejected": -608.0369873046875, + "loss": 0.4484, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.180513858795166, + "rewards/margins": 1.2660033702850342, + "rewards/rejected": -3.446516752243042, + "step": 2300 + }, + { + "epoch": 0.5542226487523992, + "grad_norm": 11.742895780710647, + "learning_rate": 2.463359168864189e-07, + "logits/chosen": -0.6675876379013062, + "logits/rejected": -0.7527247667312622, + "logps/chosen": -516.6282958984375, + "logps/rejected": -603.8510131835938, + "loss": 0.4765, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1180880069732666, + "rewards/margins": 1.259250283241272, + "rewards/rejected": -3.377338409423828, + "step": 2310 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 15.611988191868031, + "learning_rate": 2.4424245802601555e-07, + "logits/chosen": -0.7548837065696716, + "logits/rejected": -0.7677526473999023, + "logps/chosen": -436.63763427734375, + "logps/rejected": -580.12109375, + "loss": 0.448, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0357186794281006, + "rewards/margins": 0.8878978490829468, + "rewards/rejected": -2.923617124557495, + "step": 2320 + }, + { + "epoch": 0.559021113243762, + "grad_norm": 11.963131637466097, + "learning_rate": 2.421494030343072e-07, + "logits/chosen": -0.6403541564941406, + "logits/rejected": -0.6971467137336731, + "logps/chosen": -463.41668701171875, + "logps/rejected": -511.3497009277344, + "loss": 0.5057, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.029705047607422, + "rewards/margins": 0.9956240653991699, + "rewards/rejected": -3.025329113006592, + "step": 2330 + }, + { + "epoch": 0.5614203454894434, + "grad_norm": 12.457563977370029, + "learning_rate": 2.400568987307861e-07, + "logits/chosen": -0.6719120740890503, + "logits/rejected": -0.6879553198814392, + "logps/chosen": -428.35992431640625, + "logps/rejected": -484.6227111816406, + "loss": 0.4224, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0458950996398926, + "rewards/margins": 0.6281440258026123, + "rewards/rejected": -2.674039363861084, + "step": 2340 + }, + { + "epoch": 0.5638195777351248, + "grad_norm": 11.92177172958945, + "learning_rate": 2.379650918963156e-07, + "logits/chosen": -0.7526946663856506, + "logits/rejected": -0.7413941025733948, + "logps/chosen": -406.4283447265625, + "logps/rejected": -570.8126220703125, + "loss": 0.4205, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.06895112991333, + "rewards/margins": 1.500595211982727, + "rewards/rejected": -3.5695462226867676, + "step": 2350 + }, + { + "epoch": 0.5662188099808061, + "grad_norm": 16.624039363219666, + "learning_rate": 2.3587412926283438e-07, + "logits/chosen": -0.7837602496147156, + "logits/rejected": -0.7445356249809265, + "logps/chosen": -491.05059814453125, + "logps/rejected": -638.3920288085938, + "loss": 0.4727, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9546314477920532, + "rewards/margins": 1.7906033992767334, + "rewards/rejected": -3.745234966278076, + "step": 2360 + }, + { + "epoch": 0.5686180422264875, + "grad_norm": 9.978773646984067, + "learning_rate": 2.337841575030642e-07, + "logits/chosen": -0.6353663802146912, + "logits/rejected": -0.670727014541626, + "logps/chosen": -482.7936096191406, + "logps/rejected": -629.0820922851562, + "loss": 0.4658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9853990077972412, + "rewards/margins": 1.31783127784729, + "rewards/rejected": -3.3032302856445312, + "step": 2370 + }, + { + "epoch": 0.5710172744721689, + "grad_norm": 10.175503774195834, + "learning_rate": 2.316953232202206e-07, + "logits/chosen": -0.6700790524482727, + "logits/rejected": -0.7611835598945618, + "logps/chosen": -429.82666015625, + "logps/rejected": -465.277099609375, + "loss": 0.423, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9541041851043701, + "rewards/margins": 0.9658061861991882, + "rewards/rejected": -2.9199106693267822, + "step": 2380 + }, + { + "epoch": 0.5734165067178503, + "grad_norm": 11.211484923712767, + "learning_rate": 2.2960777293772958e-07, + "logits/chosen": -0.6513696908950806, + "logits/rejected": -0.6979095339775085, + "logps/chosen": -405.17120361328125, + "logps/rejected": -547.6156005859375, + "loss": 0.4611, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.898032546043396, + "rewards/margins": 1.5655709505081177, + "rewards/rejected": -3.4636034965515137, + "step": 2390 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 10.432280326904475, + "learning_rate": 2.2752165308894974e-07, + "logits/chosen": -0.6997479200363159, + "logits/rejected": -0.6795819401741028, + "logps/chosen": -405.59454345703125, + "logps/rejected": -530.2125244140625, + "loss": 0.4443, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1597695350646973, + "rewards/margins": 1.3249900341033936, + "rewards/rejected": -3.484759569168091, + "step": 2400 + }, + { + "epoch": 0.5782149712092131, + "grad_norm": 16.8329792009058, + "learning_rate": 2.254371100069005e-07, + "logits/chosen": -0.6621764898300171, + "logits/rejected": -0.612916111946106, + "logps/chosen": -440.20538330078125, + "logps/rejected": -585.4799194335938, + "loss": 0.4638, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.897138237953186, + "rewards/margins": 1.18193781375885, + "rewards/rejected": -3.0790762901306152, + "step": 2410 + }, + { + "epoch": 0.5806142034548945, + "grad_norm": 16.7833808451815, + "learning_rate": 2.2335428991399725e-07, + "logits/chosen": -0.6124377846717834, + "logits/rejected": -0.6342424154281616, + "logps/chosen": -424.91595458984375, + "logps/rejected": -682.1309814453125, + "loss": 0.4579, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3315539360046387, + "rewards/margins": 2.402623414993286, + "rewards/rejected": -4.734177112579346, + "step": 2420 + }, + { + "epoch": 0.5830134357005758, + "grad_norm": 11.37683798430042, + "learning_rate": 2.2127333891179458e-07, + "logits/chosen": -0.6946436166763306, + "logits/rejected": -0.7337637543678284, + "logps/chosen": -401.97235107421875, + "logps/rejected": -590.7584228515625, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.964185118675232, + "rewards/margins": 1.5734838247299194, + "rewards/rejected": -3.5376694202423096, + "step": 2430 + }, + { + "epoch": 0.5854126679462572, + "grad_norm": 14.137637654420558, + "learning_rate": 2.1919440297073782e-07, + "logits/chosen": -0.6848565340042114, + "logits/rejected": -0.7031614184379578, + "logps/chosen": -389.65863037109375, + "logps/rejected": -572.2056884765625, + "loss": 0.4726, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.85574471950531, + "rewards/margins": 1.6927335262298584, + "rewards/rejected": -3.548478364944458, + "step": 2440 + }, + { + "epoch": 0.5878119001919386, + "grad_norm": 11.976818733935065, + "learning_rate": 2.1711762791992368e-07, + "logits/chosen": -0.6271128058433533, + "logits/rejected": -0.6209608316421509, + "logps/chosen": -453.7582092285156, + "logps/rejected": -558.2598876953125, + "loss": 0.464, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6763954162597656, + "rewards/margins": 1.3264000415802002, + "rewards/rejected": -3.002795696258545, + "step": 2450 + }, + { + "epoch": 0.5902111324376199, + "grad_norm": 12.353087005836548, + "learning_rate": 2.1504315943687114e-07, + "logits/chosen": -0.7643033862113953, + "logits/rejected": -0.736054539680481, + "logps/chosen": -416.31512451171875, + "logps/rejected": -612.2337646484375, + "loss": 0.4454, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.881116509437561, + "rewards/margins": 1.4656778573989868, + "rewards/rejected": -3.346794605255127, + "step": 2460 + }, + { + "epoch": 0.5926103646833013, + "grad_norm": 15.676634180170838, + "learning_rate": 2.1297114303730248e-07, + "logits/chosen": -0.6767653226852417, + "logits/rejected": -0.6186730265617371, + "logps/chosen": -407.5095520019531, + "logps/rejected": -585.6527709960938, + "loss": 0.4968, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.816948652267456, + "rewards/margins": 1.2787069082260132, + "rewards/rejected": -3.0956554412841797, + "step": 2470 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 12.693263479126406, + "learning_rate": 2.1090172406493616e-07, + "logits/chosen": -0.6466863751411438, + "logits/rejected": -0.6392595171928406, + "logps/chosen": -390.559326171875, + "logps/rejected": -549.0421142578125, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5861185789108276, + "rewards/margins": 1.4114429950714111, + "rewards/rejected": -2.997561454772949, + "step": 2480 + }, + { + "epoch": 0.5974088291746641, + "grad_norm": 14.530343733322098, + "learning_rate": 2.0883504768129146e-07, + "logits/chosen": -0.7057468295097351, + "logits/rejected": -0.7234060764312744, + "logps/chosen": -470.16229248046875, + "logps/rejected": -603.843994140625, + "loss": 0.478, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9950097799301147, + "rewards/margins": 1.3909389972686768, + "rewards/rejected": -3.385948896408081, + "step": 2490 + }, + { + "epoch": 0.5998080614203455, + "grad_norm": 13.287779534586859, + "learning_rate": 2.0677125885550571e-07, + "logits/chosen": -0.5642179846763611, + "logits/rejected": -0.6275255084037781, + "logps/chosen": -422.0602111816406, + "logps/rejected": -509.85833740234375, + "loss": 0.4554, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8453181982040405, + "rewards/margins": 1.3027547597885132, + "rewards/rejected": -3.1480727195739746, + "step": 2500 + }, + { + "epoch": 0.6022072936660269, + "grad_norm": 15.517152991224327, + "learning_rate": 2.0471050235416587e-07, + "logits/chosen": -0.6314137578010559, + "logits/rejected": -0.7092536091804504, + "logps/chosen": -456.7703552246094, + "logps/rejected": -560.7614135742188, + "loss": 0.4175, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.005941390991211, + "rewards/margins": 1.4926408529281616, + "rewards/rejected": -3.498582363128662, + "step": 2510 + }, + { + "epoch": 0.6046065259117083, + "grad_norm": 15.430444876775148, + "learning_rate": 2.026529227311532e-07, + "logits/chosen": -0.696445107460022, + "logits/rejected": -0.6817110776901245, + "logps/chosen": -419.6112365722656, + "logps/rejected": -545.1299438476562, + "loss": 0.4725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.074556827545166, + "rewards/margins": 1.122429609298706, + "rewards/rejected": -3.196986436843872, + "step": 2520 + }, + { + "epoch": 0.6070057581573897, + "grad_norm": 12.135680251339695, + "learning_rate": 2.005986643175036e-07, + "logits/chosen": -0.6033408045768738, + "logits/rejected": -0.5708281993865967, + "logps/chosen": -444.126220703125, + "logps/rejected": -621.4119873046875, + "loss": 0.3872, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6977285146713257, + "rewards/margins": 1.8118762969970703, + "rewards/rejected": -3.5096049308776855, + "step": 2530 + }, + { + "epoch": 0.6094049904030711, + "grad_norm": 16.900656042802993, + "learning_rate": 1.9854787121128328e-07, + "logits/chosen": -0.6752146482467651, + "logits/rejected": -0.6918280124664307, + "logps/chosen": -413.39178466796875, + "logps/rejected": -453.151123046875, + "loss": 0.4863, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9585962295532227, + "rewards/margins": 0.8429145812988281, + "rewards/rejected": -2.8015105724334717, + "step": 2540 + }, + { + "epoch": 0.6118042226487524, + "grad_norm": 14.460169065470863, + "learning_rate": 1.9650068726748106e-07, + "logits/chosen": -0.6060270667076111, + "logits/rejected": -0.6441822052001953, + "logps/chosen": -460.1835021972656, + "logps/rejected": -609.755126953125, + "loss": 0.4705, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.047046184539795, + "rewards/margins": 1.4443639516830444, + "rewards/rejected": -3.4914097785949707, + "step": 2550 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 14.47008564939435, + "learning_rate": 1.9445725608791718e-07, + "logits/chosen": -0.6000035405158997, + "logits/rejected": -0.6268490552902222, + "logps/chosen": -412.12322998046875, + "logps/rejected": -622.6077270507812, + "loss": 0.4549, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5705702304840088, + "rewards/margins": 2.0281901359558105, + "rewards/rejected": -3.5987606048583984, + "step": 2560 + }, + { + "epoch": 0.6166026871401151, + "grad_norm": 16.15106890491746, + "learning_rate": 1.924177210111705e-07, + "logits/chosen": -0.6954480409622192, + "logits/rejected": -0.7049099206924438, + "logps/chosen": -394.63970947265625, + "logps/rejected": -582.3255004882812, + "loss": 0.4508, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.777197241783142, + "rewards/margins": 1.7624857425689697, + "rewards/rejected": -3.5396828651428223, + "step": 2570 + }, + { + "epoch": 0.6190019193857965, + "grad_norm": 11.273315658642467, + "learning_rate": 1.9038222510252364e-07, + "logits/chosen": -0.6852430105209351, + "logits/rejected": -0.6738708019256592, + "logps/chosen": -441.40020751953125, + "logps/rejected": -517.69921875, + "loss": 0.4767, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9201513528823853, + "rewards/margins": 0.8821185231208801, + "rewards/rejected": -2.8022701740264893, + "step": 2580 + }, + { + "epoch": 0.6214011516314779, + "grad_norm": 13.419885609170674, + "learning_rate": 1.883509111439277e-07, + "logits/chosen": -0.5949097275733948, + "logits/rejected": -0.5894945859909058, + "logps/chosen": -434.9554138183594, + "logps/rejected": -670.6376953125, + "loss": 0.4836, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0458106994628906, + "rewards/margins": 1.706055998802185, + "rewards/rejected": -3.7518672943115234, + "step": 2590 + }, + { + "epoch": 0.6238003838771593, + "grad_norm": 10.25337511089072, + "learning_rate": 1.8632392162398665e-07, + "logits/chosen": -0.6396509408950806, + "logits/rejected": -0.6442984342575073, + "logps/chosen": -447.4217834472656, + "logps/rejected": -649.7928466796875, + "loss": 0.4354, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6608057022094727, + "rewards/margins": 2.017603874206543, + "rewards/rejected": -3.6784095764160156, + "step": 2600 + }, + { + "epoch": 0.6261996161228407, + "grad_norm": 12.615964200890309, + "learning_rate": 1.84301398727962e-07, + "logits/chosen": -0.6086243391036987, + "logits/rejected": -0.5457442998886108, + "logps/chosen": -363.9615173339844, + "logps/rejected": -625.5221557617188, + "loss": 0.4274, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8102554082870483, + "rewards/margins": 2.2328672409057617, + "rewards/rejected": -4.0431227684021, + "step": 2610 + }, + { + "epoch": 0.6285988483685221, + "grad_norm": 13.561387993292696, + "learning_rate": 1.8228348432779966e-07, + "logits/chosen": -0.6876164078712463, + "logits/rejected": -0.6831247210502625, + "logps/chosen": -427.48388671875, + "logps/rejected": -554.5697631835938, + "loss": 0.5102, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0380465984344482, + "rewards/margins": 1.3742176294326782, + "rewards/rejected": -3.412264347076416, + "step": 2620 + }, + { + "epoch": 0.6309980806142035, + "grad_norm": 11.884741198970012, + "learning_rate": 1.8027031997217773e-07, + "logits/chosen": -0.7005245089530945, + "logits/rejected": -0.7127174139022827, + "logps/chosen": -418.88031005859375, + "logps/rejected": -650.9946899414062, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1531500816345215, + "rewards/margins": 2.098936080932617, + "rewards/rejected": -4.2520856857299805, + "step": 2630 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 13.756259667587228, + "learning_rate": 1.7826204687657758e-07, + "logits/chosen": -0.6128356456756592, + "logits/rejected": -0.5883530378341675, + "logps/chosen": -479.2039489746094, + "logps/rejected": -541.1580200195312, + "loss": 0.4189, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0355639457702637, + "rewards/margins": 1.0127185583114624, + "rewards/rejected": -3.0482823848724365, + "step": 2640 + }, + { + "epoch": 0.6357965451055663, + "grad_norm": 16.334230649896075, + "learning_rate": 1.762588059133781e-07, + "logits/chosen": -0.5741305947303772, + "logits/rejected": -0.5979640483856201, + "logps/chosen": -492.15985107421875, + "logps/rejected": -616.077880859375, + "loss": 0.4344, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.001828670501709, + "rewards/margins": 1.6204664707183838, + "rewards/rejected": -3.6222949028015137, + "step": 2650 + }, + { + "epoch": 0.6381957773512476, + "grad_norm": 12.818149694157945, + "learning_rate": 1.7426073760197406e-07, + "logits/chosen": -0.7115119099617004, + "logits/rejected": -0.7030835151672363, + "logps/chosen": -436.45440673828125, + "logps/rejected": -656.4097900390625, + "loss": 0.464, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0036306381225586, + "rewards/margins": 1.860400915145874, + "rewards/rejected": -3.864032030105591, + "step": 2660 + }, + { + "epoch": 0.6405950095969289, + "grad_norm": 10.772055196711287, + "learning_rate": 1.7226798209891935e-07, + "logits/chosen": -0.5705487132072449, + "logits/rejected": -0.6100784540176392, + "logps/chosen": -454.169677734375, + "logps/rejected": -567.9891967773438, + "loss": 0.4243, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.027036666870117, + "rewards/margins": 1.6898279190063477, + "rewards/rejected": -3.716864824295044, + "step": 2670 + }, + { + "epoch": 0.6429942418426103, + "grad_norm": 12.732993242920942, + "learning_rate": 1.7028067918809535e-07, + "logits/chosen": -0.6443219184875488, + "logits/rejected": -0.6607564687728882, + "logps/chosen": -408.63385009765625, + "logps/rejected": -678.4671630859375, + "loss": 0.4429, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9784246683120728, + "rewards/margins": 2.1958975791931152, + "rewards/rejected": -4.174322605133057, + "step": 2680 + }, + { + "epoch": 0.6453934740882917, + "grad_norm": 16.43098937212258, + "learning_rate": 1.6829896827090584e-07, + "logits/chosen": -0.6939103007316589, + "logits/rejected": -0.6823415756225586, + "logps/chosen": -444.9854431152344, + "logps/rejected": -528.9821166992188, + "loss": 0.4604, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0313303470611572, + "rewards/margins": 1.1062291860580444, + "rewards/rejected": -3.137559652328491, + "step": 2690 + }, + { + "epoch": 0.6477927063339731, + "grad_norm": 9.363672145947863, + "learning_rate": 1.6632298835649844e-07, + "logits/chosen": -0.5836836099624634, + "logits/rejected": -0.5799709558486938, + "logps/chosen": -470.08197021484375, + "logps/rejected": -686.9669799804688, + "loss": 0.4069, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0098516941070557, + "rewards/margins": 1.7816604375839233, + "rewards/rejected": -3.7915122509002686, + "step": 2700 + }, + { + "epoch": 0.6501919385796545, + "grad_norm": 17.2324020009346, + "learning_rate": 1.6435287805201364e-07, + "logits/chosen": -0.5617779493331909, + "logits/rejected": -0.5524694919586182, + "logps/chosen": -467.46075439453125, + "logps/rejected": -561.6852416992188, + "loss": 0.4668, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1388306617736816, + "rewards/margins": 1.0130404233932495, + "rewards/rejected": -3.1518714427948, + "step": 2710 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 12.016483428799015, + "learning_rate": 1.6238877555286207e-07, + "logits/chosen": -0.6310284733772278, + "logits/rejected": -0.6076905727386475, + "logps/chosen": -451.049560546875, + "logps/rejected": -625.4998779296875, + "loss": 0.4317, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.715829849243164, + "rewards/margins": 1.5645663738250732, + "rewards/rejected": -3.280395984649658, + "step": 2720 + }, + { + "epoch": 0.6549904030710173, + "grad_norm": 13.942118191847904, + "learning_rate": 1.60430818633031e-07, + "logits/chosen": -0.6367970705032349, + "logits/rejected": -0.6443176865577698, + "logps/chosen": -442.45428466796875, + "logps/rejected": -603.2288818359375, + "loss": 0.4292, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.833968162536621, + "rewards/margins": 1.6198402643203735, + "rewards/rejected": -3.453808546066284, + "step": 2730 + }, + { + "epoch": 0.6573896353166987, + "grad_norm": 12.960755123491403, + "learning_rate": 1.5847914463541939e-07, + "logits/chosen": -0.6094954013824463, + "logits/rejected": -0.616841197013855, + "logps/chosen": -387.78448486328125, + "logps/rejected": -546.4832763671875, + "loss": 0.4165, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.933266043663025, + "rewards/margins": 1.3234989643096924, + "rewards/rejected": -3.2567648887634277, + "step": 2740 + }, + { + "epoch": 0.6597888675623801, + "grad_norm": 11.216331431155993, + "learning_rate": 1.5653389046220427e-07, + "logits/chosen": -0.558444619178772, + "logits/rejected": -0.5738928318023682, + "logps/chosen": -416.5931091308594, + "logps/rejected": -565.6954345703125, + "loss": 0.4206, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8503406047821045, + "rewards/margins": 1.2912790775299072, + "rewards/rejected": -3.141619920730591, + "step": 2750 + }, + { + "epoch": 0.6621880998080614, + "grad_norm": 14.796243391579123, + "learning_rate": 1.545951925652375e-07, + "logits/chosen": -0.5394322872161865, + "logits/rejected": -0.5567634105682373, + "logps/chosen": -502.76947021484375, + "logps/rejected": -626.8363037109375, + "loss": 0.4198, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9490293264389038, + "rewards/margins": 1.7876237630844116, + "rewards/rejected": -3.7366535663604736, + "step": 2760 + }, + { + "epoch": 0.6645873320537428, + "grad_norm": 13.413386618717803, + "learning_rate": 1.5266318693647423e-07, + "logits/chosen": -0.5417942404747009, + "logits/rejected": -0.5404913425445557, + "logps/chosen": -455.65264892578125, + "logps/rejected": -571.8460693359375, + "loss": 0.4333, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9460630416870117, + "rewards/margins": 1.299930453300476, + "rewards/rejected": -3.2459938526153564, + "step": 2770 + }, + { + "epoch": 0.6669865642994242, + "grad_norm": 16.515416936082715, + "learning_rate": 1.5073800909843353e-07, + "logits/chosen": -0.5896440744400024, + "logits/rejected": -0.6135233640670776, + "logps/chosen": -448.375732421875, + "logps/rejected": -550.3060302734375, + "loss": 0.4321, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8330185413360596, + "rewards/margins": 1.5391408205032349, + "rewards/rejected": -3.372159481048584, + "step": 2780 + }, + { + "epoch": 0.6693857965451055, + "grad_norm": 16.63287415870903, + "learning_rate": 1.488197940946922e-07, + "logits/chosen": -0.5957229733467102, + "logits/rejected": -0.5872025489807129, + "logps/chosen": -433.13287353515625, + "logps/rejected": -544.17724609375, + "loss": 0.4116, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5587198734283447, + "rewards/margins": 1.6922566890716553, + "rewards/rejected": -3.250977039337158, + "step": 2790 + }, + { + "epoch": 0.6717850287907869, + "grad_norm": 17.943322401731898, + "learning_rate": 1.4690867648041167e-07, + "logits/chosen": -0.5465134978294373, + "logits/rejected": -0.5925148725509644, + "logps/chosen": -427.1700134277344, + "logps/rejected": -597.7564697265625, + "loss": 0.463, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7293113470077515, + "rewards/margins": 1.9395873546600342, + "rewards/rejected": -3.668898820877075, + "step": 2800 + }, + { + "epoch": 0.6741842610364683, + "grad_norm": 13.110879243171492, + "learning_rate": 1.4500479031289987e-07, + "logits/chosen": -0.5818893313407898, + "logits/rejected": -0.6302607655525208, + "logps/chosen": -445.3763732910156, + "logps/rejected": -590.7268676757812, + "loss": 0.4776, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6836143732070923, + "rewards/margins": 1.5271230936050415, + "rewards/rejected": -3.2107372283935547, + "step": 2810 + }, + { + "epoch": 0.6765834932821497, + "grad_norm": 12.058976212342188, + "learning_rate": 1.4310826914220747e-07, + "logits/chosen": -0.6128555536270142, + "logits/rejected": -0.6189436912536621, + "logps/chosen": -497.07183837890625, + "logps/rejected": -601.4428100585938, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8764116764068604, + "rewards/margins": 1.2488231658935547, + "rewards/rejected": -3.125235080718994, + "step": 2820 + }, + { + "epoch": 0.6789827255278311, + "grad_norm": 17.97354363119042, + "learning_rate": 1.412192460017597e-07, + "logits/chosen": -0.6210779547691345, + "logits/rejected": -0.6108576655387878, + "logps/chosen": -455.32391357421875, + "logps/rejected": -592.3138427734375, + "loss": 0.4641, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1085453033447266, + "rewards/margins": 1.3274444341659546, + "rewards/rejected": -3.4359898567199707, + "step": 2830 + }, + { + "epoch": 0.6813819577735125, + "grad_norm": 11.08581581164283, + "learning_rate": 1.3933785339902504e-07, + "logits/chosen": -0.6120859384536743, + "logits/rejected": -0.5777018666267395, + "logps/chosen": -392.5328674316406, + "logps/rejected": -571.60986328125, + "loss": 0.4874, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9550358057022095, + "rewards/margins": 1.3144079446792603, + "rewards/rejected": -3.2694435119628906, + "step": 2840 + }, + { + "epoch": 0.6837811900191939, + "grad_norm": 11.419675800311689, + "learning_rate": 1.374642233062197e-07, + "logits/chosen": -0.5789315104484558, + "logits/rejected": -0.5940367579460144, + "logps/chosen": -489.6767578125, + "logps/rejected": -603.5349731445312, + "loss": 0.4489, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0280773639678955, + "rewards/margins": 1.4480621814727783, + "rewards/rejected": -3.476139545440674, + "step": 2850 + }, + { + "epoch": 0.6861804222648752, + "grad_norm": 12.571278512714647, + "learning_rate": 1.355984871510511e-07, + "logits/chosen": -0.5551937818527222, + "logits/rejected": -0.526736319065094, + "logps/chosen": -485.8904724121094, + "logps/rejected": -637.25341796875, + "loss": 0.425, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9234853982925415, + "rewards/margins": 1.3963334560394287, + "rewards/rejected": -3.3198189735412598, + "step": 2860 + }, + { + "epoch": 0.6885796545105566, + "grad_norm": 10.614013070716824, + "learning_rate": 1.3374077580749783e-07, + "logits/chosen": -0.576995313167572, + "logits/rejected": -0.5995679497718811, + "logps/chosen": -382.17498779296875, + "logps/rejected": -547.46484375, + "loss": 0.4393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.897216796875, + "rewards/margins": 1.4525885581970215, + "rewards/rejected": -3.3498051166534424, + "step": 2870 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 18.09913751507019, + "learning_rate": 1.3189121958663024e-07, + "logits/chosen": -0.5638588666915894, + "logits/rejected": -0.6245552897453308, + "logps/chosen": -522.0628662109375, + "logps/rejected": -575.4417724609375, + "loss": 0.4336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.396254062652588, + "rewards/margins": 0.8919731974601746, + "rewards/rejected": -3.2882275581359863, + "step": 2880 + }, + { + "epoch": 0.6933781190019194, + "grad_norm": 15.80771289819406, + "learning_rate": 1.3004994822746895e-07, + "logits/chosen": -0.7042198181152344, + "logits/rejected": -0.697306752204895, + "logps/chosen": -436.90386962890625, + "logps/rejected": -577.8507080078125, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9693387746810913, + "rewards/margins": 1.3294403553009033, + "rewards/rejected": -3.298779010772705, + "step": 2890 + }, + { + "epoch": 0.6957773512476008, + "grad_norm": 11.587138387929464, + "learning_rate": 1.2821709088788434e-07, + "logits/chosen": -0.5107399821281433, + "logits/rejected": -0.5425523519515991, + "logps/chosen": -380.4593505859375, + "logps/rejected": -541.3460693359375, + "loss": 0.4398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7053035497665405, + "rewards/margins": 1.585318922996521, + "rewards/rejected": -3.2906222343444824, + "step": 2900 + }, + { + "epoch": 0.6981765834932822, + "grad_norm": 15.327291081862692, + "learning_rate": 1.2639277613553736e-07, + "logits/chosen": -0.5802925825119019, + "logits/rejected": -0.5720899105072021, + "logps/chosen": -379.57318115234375, + "logps/rejected": -496.7958068847656, + "loss": 0.4473, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8436704874038696, + "rewards/margins": 1.1200520992279053, + "rewards/rejected": -2.9637222290039062, + "step": 2910 + }, + { + "epoch": 0.7005758157389635, + "grad_norm": 11.468692622260464, + "learning_rate": 1.2457713193885975e-07, + "logits/chosen": -0.5771138072013855, + "logits/rejected": -0.5807372331619263, + "logps/chosen": -359.72613525390625, + "logps/rejected": -550.9989013671875, + "loss": 0.4176, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8891617059707642, + "rewards/margins": 1.5464417934417725, + "rewards/rejected": -3.435603618621826, + "step": 2920 + }, + { + "epoch": 0.7029750479846449, + "grad_norm": 17.115987201031448, + "learning_rate": 1.2277028565807838e-07, + "logits/chosen": -0.5637086629867554, + "logits/rejected": -0.584968090057373, + "logps/chosen": -446.92767333984375, + "logps/rejected": -574.7530517578125, + "loss": 0.468, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9414465427398682, + "rewards/margins": 1.3546481132507324, + "rewards/rejected": -3.2960944175720215, + "step": 2930 + }, + { + "epoch": 0.7053742802303263, + "grad_norm": 17.441141869897656, + "learning_rate": 1.209723640362815e-07, + "logits/chosen": -0.5792838335037231, + "logits/rejected": -0.5826687216758728, + "logps/chosen": -456.9283142089844, + "logps/rejected": -620.7843627929688, + "loss": 0.4813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8888061046600342, + "rewards/margins": 1.7339365482330322, + "rewards/rejected": -3.6227424144744873, + "step": 2940 + }, + { + "epoch": 0.7077735124760077, + "grad_norm": 11.312151269139996, + "learning_rate": 1.191834931905277e-07, + "logits/chosen": -0.5471521615982056, + "logits/rejected": -0.5616979598999023, + "logps/chosen": -510.0934143066406, + "logps/rejected": -647.6993408203125, + "loss": 0.4134, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.116403341293335, + "rewards/margins": 1.3659651279449463, + "rewards/rejected": -3.482367992401123, + "step": 2950 + }, + { + "epoch": 0.710172744721689, + "grad_norm": 13.959291203129078, + "learning_rate": 1.1740379860299988e-07, + "logits/chosen": -0.5202777981758118, + "logits/rejected": -0.5581659078598022, + "logps/chosen": -476.20635986328125, + "logps/rejected": -612.1878662109375, + "loss": 0.4634, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9924306869506836, + "rewards/margins": 1.1773386001586914, + "rewards/rejected": -3.169769287109375, + "step": 2960 + }, + { + "epoch": 0.7125719769673704, + "grad_norm": 13.163034374925202, + "learning_rate": 1.1563340511220254e-07, + "logits/chosen": -0.5559359788894653, + "logits/rejected": -0.5668517351150513, + "logps/chosen": -511.17333984375, + "logps/rejected": -623.8465576171875, + "loss": 0.4817, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2064290046691895, + "rewards/margins": 1.2941501140594482, + "rewards/rejected": -3.5005791187286377, + "step": 2970 + }, + { + "epoch": 0.7149712092130518, + "grad_norm": 12.726063519299634, + "learning_rate": 1.1387243690420556e-07, + "logits/chosen": -0.5109056234359741, + "logits/rejected": -0.5152195692062378, + "logps/chosen": -488.728759765625, + "logps/rejected": -659.3638305664062, + "loss": 0.4383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.865386724472046, + "rewards/margins": 1.8146740198135376, + "rewards/rejected": -3.680060863494873, + "step": 2980 + }, + { + "epoch": 0.7173704414587332, + "grad_norm": 15.060517596878292, + "learning_rate": 1.1212101750393235e-07, + "logits/chosen": -0.5651146173477173, + "logits/rejected": -0.5633836984634399, + "logps/chosen": -457.1798400878906, + "logps/rejected": -594.5474853515625, + "loss": 0.4016, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1187145709991455, + "rewards/margins": 1.5976879596710205, + "rewards/rejected": -3.716402530670166, + "step": 2990 + }, + { + "epoch": 0.7197696737044146, + "grad_norm": 10.840848096813268, + "learning_rate": 1.1037926976649562e-07, + "logits/chosen": -0.6062291860580444, + "logits/rejected": -0.5924742817878723, + "logps/chosen": -467.4085388183594, + "logps/rejected": -654.9591674804688, + "loss": 0.4878, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1459498405456543, + "rewards/margins": 1.5907325744628906, + "rewards/rejected": -3.736682415008545, + "step": 3000 + }, + { + "epoch": 0.722168905950096, + "grad_norm": 15.042187958894935, + "learning_rate": 1.0864731586857936e-07, + "logits/chosen": -0.4485263228416443, + "logits/rejected": -0.44067448377609253, + "logps/chosen": -470.13055419921875, + "logps/rejected": -602.8482666015625, + "loss": 0.4195, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9293149709701538, + "rewards/margins": 1.6746339797973633, + "rewards/rejected": -3.6039490699768066, + "step": 3010 + }, + { + "epoch": 0.7245681381957774, + "grad_norm": 12.194246416479364, + "learning_rate": 1.0692527729986839e-07, + "logits/chosen": -0.5851191282272339, + "logits/rejected": -0.592607855796814, + "logps/chosen": -451.42376708984375, + "logps/rejected": -582.010498046875, + "loss": 0.3994, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9533573389053345, + "rewards/margins": 1.4484539031982422, + "rewards/rejected": -3.401811122894287, + "step": 3020 + }, + { + "epoch": 0.7269673704414588, + "grad_norm": 14.340146970439987, + "learning_rate": 1.0521327485452692e-07, + "logits/chosen": -0.5049649477005005, + "logits/rejected": -0.5048503875732422, + "logps/chosen": -444.41015625, + "logps/rejected": -574.8997802734375, + "loss": 0.4438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.013583183288574, + "rewards/margins": 1.4341424703598022, + "rewards/rejected": -3.447725296020508, + "step": 3030 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 17.094188348902286, + "learning_rate": 1.0351142862272468e-07, + "logits/chosen": -0.4877733290195465, + "logits/rejected": -0.542160153388977, + "logps/chosen": -409.5499267578125, + "logps/rejected": -598.6688232421875, + "loss": 0.4483, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.956335425376892, + "rewards/margins": 1.96954345703125, + "rewards/rejected": -3.9258790016174316, + "step": 3040 + }, + { + "epoch": 0.7317658349328215, + "grad_norm": 16.384363468476412, + "learning_rate": 1.0181985798221343e-07, + "logits/chosen": -0.44771862030029297, + "logits/rejected": -0.48205646872520447, + "logps/chosen": -469.80364990234375, + "logps/rejected": -642.3453979492188, + "loss": 0.4837, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.060243844985962, + "rewards/margins": 1.60220468044281, + "rewards/rejected": -3.6624481678009033, + "step": 3050 + }, + { + "epoch": 0.7341650671785028, + "grad_norm": 15.159551813260798, + "learning_rate": 1.0013868158995329e-07, + "logits/chosen": -0.4246044158935547, + "logits/rejected": -0.45489010214805603, + "logps/chosen": -471.446533203125, + "logps/rejected": -587.3192138671875, + "loss": 0.4687, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.233337879180908, + "rewards/margins": 1.3818399906158447, + "rewards/rejected": -3.615177869796753, + "step": 3060 + }, + { + "epoch": 0.7365642994241842, + "grad_norm": 13.463524340329029, + "learning_rate": 9.84680173737887e-08, + "logits/chosen": -0.5496365427970886, + "logits/rejected": -0.569757878780365, + "logps/chosen": -493.57843017578125, + "logps/rejected": -592.6453247070312, + "loss": 0.4307, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.2892556190490723, + "rewards/margins": 1.3507412672042847, + "rewards/rejected": -3.6399970054626465, + "step": 3070 + }, + { + "epoch": 0.7389635316698656, + "grad_norm": 12.455106651157736, + "learning_rate": 9.680798252417713e-08, + "logits/chosen": -0.5762359499931335, + "logits/rejected": -0.5943504571914673, + "logps/chosen": -403.1676025390625, + "logps/rejected": -588.2984619140625, + "loss": 0.4171, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.9747021198272705, + "rewards/margins": 1.486598014831543, + "rewards/rejected": -3.4612998962402344, + "step": 3080 + }, + { + "epoch": 0.741362763915547, + "grad_norm": 14.204008950257876, + "learning_rate": 9.515869348596808e-08, + "logits/chosen": -0.5742790699005127, + "logits/rejected": -0.6299481987953186, + "logps/chosen": -498.3915100097656, + "logps/rejected": -616.3846435546875, + "loss": 0.453, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0285322666168213, + "rewards/margins": 1.541265845298767, + "rewards/rejected": -3.569798231124878, + "step": 3090 + }, + { + "epoch": 0.7437619961612284, + "grad_norm": 11.674082631607327, + "learning_rate": 9.352026595023493e-08, + "logits/chosen": -0.6226130723953247, + "logits/rejected": -0.6168379783630371, + "logps/chosen": -476.732666015625, + "logps/rejected": -552.8656005859375, + "loss": 0.4441, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.860538125038147, + "rewards/margins": 1.0402759313583374, + "rewards/rejected": -2.9008140563964844, + "step": 3100 + }, + { + "epoch": 0.7461612284069098, + "grad_norm": 13.589284320232862, + "learning_rate": 9.189281484616004e-08, + "logits/chosen": -0.5403670072555542, + "logits/rejected": -0.5388067960739136, + "logps/chosen": -406.925048828125, + "logps/rejected": -575.8323974609375, + "loss": 0.4779, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1276307106018066, + "rewards/margins": 1.1783473491668701, + "rewards/rejected": -3.3059780597686768, + "step": 3110 + }, + { + "epoch": 0.7485604606525912, + "grad_norm": 14.083306203022303, + "learning_rate": 9.027645433297249e-08, + "logits/chosen": -0.5009843707084656, + "logits/rejected": -0.5172004699707031, + "logps/chosen": -545.912109375, + "logps/rejected": -643.9588012695312, + "loss": 0.4964, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4063868522644043, + "rewards/margins": 1.3331215381622314, + "rewards/rejected": -3.7395083904266357, + "step": 3120 + }, + { + "epoch": 0.7509596928982726, + "grad_norm": 15.054060697910124, + "learning_rate": 8.867129779194066e-08, + "logits/chosen": -0.6103423833847046, + "logits/rejected": -0.6305662393569946, + "logps/chosen": -362.4765930175781, + "logps/rejected": -533.6757202148438, + "loss": 0.4347, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5669844150543213, + "rewards/margins": 1.7321140766143799, + "rewards/rejected": -3.2990989685058594, + "step": 3130 + }, + { + "epoch": 0.753358925143954, + "grad_norm": 15.171972092803582, + "learning_rate": 8.707745781841866e-08, + "logits/chosen": -0.5541486144065857, + "logits/rejected": -0.5569981932640076, + "logps/chosen": -399.5630187988281, + "logps/rejected": -563.7500610351562, + "loss": 0.4655, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.850064992904663, + "rewards/margins": 1.5996869802474976, + "rewards/rejected": -3.44975209236145, + "step": 3140 + }, + { + "epoch": 0.7557581573896354, + "grad_norm": 7.987545135931887, + "learning_rate": 8.549504621394831e-08, + "logits/chosen": -0.6387466192245483, + "logits/rejected": -0.629570484161377, + "logps/chosen": -387.7325134277344, + "logps/rejected": -584.460693359375, + "loss": 0.3641, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.571352243423462, + "rewards/margins": 1.9252641201019287, + "rewards/rejected": -3.4966163635253906, + "step": 3150 + }, + { + "epoch": 0.7581573896353166, + "grad_norm": 13.9921540642964, + "learning_rate": 8.392417397841703e-08, + "logits/chosen": -0.5311389565467834, + "logits/rejected": -0.563139796257019, + "logps/chosen": -412.6419982910156, + "logps/rejected": -557.9671630859375, + "loss": 0.4553, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.664912223815918, + "rewards/margins": 1.2914505004882812, + "rewards/rejected": -2.956362724304199, + "step": 3160 + }, + { + "epoch": 0.760556621880998, + "grad_norm": 10.54509313617014, + "learning_rate": 8.236495130227083e-08, + "logits/chosen": -0.507122278213501, + "logits/rejected": -0.5326481461524963, + "logps/chosen": -449.63214111328125, + "logps/rejected": -611.7938232421875, + "loss": 0.4536, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.660041093826294, + "rewards/margins": 1.9526357650756836, + "rewards/rejected": -3.6126770973205566, + "step": 3170 + }, + { + "epoch": 0.7629558541266794, + "grad_norm": 19.060642084344988, + "learning_rate": 8.081748755878612e-08, + "logits/chosen": -0.5622953176498413, + "logits/rejected": -0.6007119417190552, + "logps/chosen": -473.05029296875, + "logps/rejected": -541.9755249023438, + "loss": 0.4346, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0736916065216064, + "rewards/margins": 1.2077754735946655, + "rewards/rejected": -3.2814669609069824, + "step": 3180 + }, + { + "epoch": 0.7653550863723608, + "grad_norm": 12.641909967755108, + "learning_rate": 7.928189129639632e-08, + "logits/chosen": -0.5046022534370422, + "logits/rejected": -0.4738716185092926, + "logps/chosen": -424.58990478515625, + "logps/rejected": -587.5633544921875, + "loss": 0.4025, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.909470796585083, + "rewards/margins": 1.5023930072784424, + "rewards/rejected": -3.411863327026367, + "step": 3190 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 17.284721871893858, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -0.5278437733650208, + "logits/rejected": -0.5451136827468872, + "logps/chosen": -438.02001953125, + "logps/rejected": -594.689453125, + "loss": 0.4761, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9781577587127686, + "rewards/margins": 1.3622440099716187, + "rewards/rejected": -3.340402126312256, + "step": 3200 + }, + { + "epoch": 0.7701535508637236, + "grad_norm": 16.225895751875765, + "learning_rate": 7.624673123879682e-08, + "logits/chosen": -0.6049574017524719, + "logits/rejected": -0.6234583854675293, + "logps/chosen": -408.36663818359375, + "logps/rejected": -536.5324096679688, + "loss": 0.4467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7185170650482178, + "rewards/margins": 1.4202756881713867, + "rewards/rejected": -3.1387927532196045, + "step": 3210 + }, + { + "epoch": 0.772552783109405, + "grad_norm": 11.145879658096279, + "learning_rate": 7.474738034800663e-08, + "logits/chosen": -0.6624782085418701, + "logits/rejected": -0.6585075259208679, + "logps/chosen": -374.86956787109375, + "logps/rejected": -557.4089965820312, + "loss": 0.4768, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.858591079711914, + "rewards/margins": 1.8989025354385376, + "rewards/rejected": -3.7574939727783203, + "step": 3220 + }, + { + "epoch": 0.7749520153550864, + "grad_norm": 12.542053839469856, + "learning_rate": 7.326032273221606e-08, + "logits/chosen": -0.5727890133857727, + "logits/rejected": -0.5879526138305664, + "logps/chosen": -486.57806396484375, + "logps/rejected": -597.93115234375, + "loss": 0.41, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0051982402801514, + "rewards/margins": 1.297196626663208, + "rewards/rejected": -3.3023948669433594, + "step": 3230 + }, + { + "epoch": 0.7773512476007678, + "grad_norm": 14.33501525091917, + "learning_rate": 7.178566270260872e-08, + "logits/chosen": -0.5633407235145569, + "logits/rejected": -0.5676406621932983, + "logps/chosen": -460.4888610839844, + "logps/rejected": -608.435791015625, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.092958927154541, + "rewards/margins": 1.1691919565200806, + "rewards/rejected": -3.262151002883911, + "step": 3240 + }, + { + "epoch": 0.7797504798464492, + "grad_norm": 12.819177195288049, + "learning_rate": 7.032350370072709e-08, + "logits/chosen": -0.5324596166610718, + "logits/rejected": -0.5585157871246338, + "logps/chosen": -442.884765625, + "logps/rejected": -604.0455932617188, + "loss": 0.4107, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7358335256576538, + "rewards/margins": 1.707724928855896, + "rewards/rejected": -3.44355845451355, + "step": 3250 + }, + { + "epoch": 0.7821497120921305, + "grad_norm": 12.690278052375158, + "learning_rate": 6.887394829121596e-08, + "logits/chosen": -0.5658280849456787, + "logits/rejected": -0.6075069308280945, + "logps/chosen": -454.86376953125, + "logps/rejected": -674.8836059570312, + "loss": 0.4234, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9878448247909546, + "rewards/margins": 2.3019330501556396, + "rewards/rejected": -4.289777755737305, + "step": 3260 + }, + { + "epoch": 0.7845489443378119, + "grad_norm": 13.632469694204937, + "learning_rate": 6.743709815462833e-08, + "logits/chosen": -0.6113773584365845, + "logits/rejected": -0.62415611743927, + "logps/chosen": -446.94708251953125, + "logps/rejected": -559.8250732421875, + "loss": 0.4336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9104185104370117, + "rewards/margins": 1.419710397720337, + "rewards/rejected": -3.3301289081573486, + "step": 3270 + }, + { + "epoch": 0.7869481765834933, + "grad_norm": 11.823636613615504, + "learning_rate": 6.601305408029287e-08, + "logits/chosen": -0.47927188873291016, + "logits/rejected": -0.48892560601234436, + "logps/chosen": -441.11334228515625, + "logps/rejected": -583.4063720703125, + "loss": 0.4456, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.027625560760498, + "rewards/margins": 1.3957432508468628, + "rewards/rejected": -3.4233689308166504, + "step": 3280 + }, + { + "epoch": 0.7893474088291746, + "grad_norm": 16.011451751603392, + "learning_rate": 6.460191595924366e-08, + "logits/chosen": -0.5301553010940552, + "logits/rejected": -0.5367687940597534, + "logps/chosen": -456.830322265625, + "logps/rejected": -578.0042724609375, + "loss": 0.4282, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9470264911651611, + "rewards/margins": 1.198061227798462, + "rewards/rejected": -3.145087718963623, + "step": 3290 + }, + { + "epoch": 0.791746641074856, + "grad_norm": 12.691724390773901, + "learning_rate": 6.320378277721342e-08, + "logits/chosen": -0.496354877948761, + "logits/rejected": -0.5095658898353577, + "logps/chosen": -461.4742126464844, + "logps/rejected": -546.1395874023438, + "loss": 0.452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.140702724456787, + "rewards/margins": 0.8940545320510864, + "rewards/rejected": -3.034757137298584, + "step": 3300 + }, + { + "epoch": 0.7941458733205374, + "grad_norm": 16.823392530566938, + "learning_rate": 6.181875260769032e-08, + "logits/chosen": -0.560473620891571, + "logits/rejected": -0.5936623811721802, + "logps/chosen": -451.93463134765625, + "logps/rejected": -540.117431640625, + "loss": 0.4733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6345758438110352, + "rewards/margins": 1.5807870626449585, + "rewards/rejected": -3.215363025665283, + "step": 3310 + }, + { + "epoch": 0.7965451055662188, + "grad_norm": 14.520894739599417, + "learning_rate": 6.044692260503797e-08, + "logits/chosen": -0.5175925493240356, + "logits/rejected": -0.5312203764915466, + "logps/chosen": -503.57965087890625, + "logps/rejected": -643.9119873046875, + "loss": 0.3855, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1000008583068848, + "rewards/margins": 1.6387672424316406, + "rewards/rejected": -3.7387681007385254, + "step": 3320 + }, + { + "epoch": 0.7989443378119002, + "grad_norm": 14.229849309645635, + "learning_rate": 5.9088388997680984e-08, + "logits/chosen": -0.5600963830947876, + "logits/rejected": -0.5659655332565308, + "logps/chosen": -531.041259765625, + "logps/rejected": -604.8890380859375, + "loss": 0.4186, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.036707639694214, + "rewards/margins": 1.4657360315322876, + "rewards/rejected": -3.502443313598633, + "step": 3330 + }, + { + "epoch": 0.8013435700575816, + "grad_norm": 14.73962835970847, + "learning_rate": 5.774324708135439e-08, + "logits/chosen": -0.6263202428817749, + "logits/rejected": -0.6441248059272766, + "logps/chosen": -392.24835205078125, + "logps/rejected": -516.92236328125, + "loss": 0.4425, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7824525833129883, + "rewards/margins": 1.451863169670105, + "rewards/rejected": -3.2343153953552246, + "step": 3340 + }, + { + "epoch": 0.803742802303263, + "grad_norm": 10.763534598750553, + "learning_rate": 5.641159121241953e-08, + "logits/chosen": -0.5910140872001648, + "logits/rejected": -0.5627475380897522, + "logps/chosen": -398.29547119140625, + "logps/rejected": -605.0794067382812, + "loss": 0.453, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8737194538116455, + "rewards/margins": 1.7353298664093018, + "rewards/rejected": -3.609048366546631, + "step": 3350 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 13.4623185493143, + "learning_rate": 5.5093514801245106e-08, + "logits/chosen": -0.497117817401886, + "logits/rejected": -0.5237521529197693, + "logps/chosen": -448.5010681152344, + "logps/rejected": -618.0789794921875, + "loss": 0.4387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0547759532928467, + "rewards/margins": 1.4181969165802002, + "rewards/rejected": -3.472972869873047, + "step": 3360 + }, + { + "epoch": 0.8085412667946257, + "grad_norm": 13.694316296216712, + "learning_rate": 5.378911030565453e-08, + "logits/chosen": -0.44851231575012207, + "logits/rejected": -0.4383707046508789, + "logps/chosen": -530.1419677734375, + "logps/rejected": -675.80224609375, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4019455909729004, + "rewards/margins": 1.1452901363372803, + "rewards/rejected": -3.5472354888916016, + "step": 3370 + }, + { + "epoch": 0.8109404990403071, + "grad_norm": 10.528600910072944, + "learning_rate": 5.249846922444101e-08, + "logits/chosen": -0.5667535066604614, + "logits/rejected": -0.5957349538803101, + "logps/chosen": -412.9205017089844, + "logps/rejected": -626.1097412109375, + "loss": 0.4203, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0233278274536133, + "rewards/margins": 2.25299334526062, + "rewards/rejected": -4.276320934295654, + "step": 3380 + }, + { + "epoch": 0.8133397312859885, + "grad_norm": 14.951755734068287, + "learning_rate": 5.122168209094865e-08, + "logits/chosen": -0.49277129769325256, + "logits/rejected": -0.5049806833267212, + "logps/chosen": -425.42291259765625, + "logps/rejected": -525.0523071289062, + "loss": 0.4278, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1365087032318115, + "rewards/margins": 1.0094716548919678, + "rewards/rejected": -3.1459803581237793, + "step": 3390 + }, + { + "epoch": 0.8157389635316699, + "grad_norm": 10.733513837258595, + "learning_rate": 4.995883846672222e-08, + "logits/chosen": -0.5534166693687439, + "logits/rejected": -0.5597847700119019, + "logps/chosen": -573.193115234375, + "logps/rejected": -633.9830322265625, + "loss": 0.4259, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0612425804138184, + "rewards/margins": 1.392027735710144, + "rewards/rejected": -3.453270435333252, + "step": 3400 + }, + { + "epoch": 0.8181381957773513, + "grad_norm": 14.785275906962475, + "learning_rate": 4.871002693522486e-08, + "logits/chosen": -0.601963222026825, + "logits/rejected": -0.6202664971351624, + "logps/chosen": -458.74700927734375, + "logps/rejected": -539.7118530273438, + "loss": 0.4571, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.000706434249878, + "rewards/margins": 1.0372394323349, + "rewards/rejected": -3.0379462242126465, + "step": 3410 + }, + { + "epoch": 0.8205374280230326, + "grad_norm": 10.774672959999748, + "learning_rate": 4.7475335095623956e-08, + "logits/chosen": -0.5421626567840576, + "logits/rejected": -0.5486319661140442, + "logps/chosen": -448.543212890625, + "logps/rejected": -592.2151489257812, + "loss": 0.4437, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.940687894821167, + "rewards/margins": 1.5618858337402344, + "rewards/rejected": -3.5025742053985596, + "step": 3420 + }, + { + "epoch": 0.822936660268714, + "grad_norm": 18.59487101118735, + "learning_rate": 4.6254849556646714e-08, + "logits/chosen": -0.4963017404079437, + "logits/rejected": -0.4984667897224426, + "logps/chosen": -487.8688049316406, + "logps/rejected": -633.2571411132812, + "loss": 0.4242, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9285959005355835, + "rewards/margins": 1.74213445186615, + "rewards/rejected": -3.6707305908203125, + "step": 3430 + }, + { + "epoch": 0.8253358925143954, + "grad_norm": 14.44038111770707, + "learning_rate": 4.504865593050483e-08, + "logits/chosen": -0.5637535452842712, + "logits/rejected": -0.5861309170722961, + "logps/chosen": -481.045654296875, + "logps/rejected": -599.7122192382812, + "loss": 0.4611, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.163588047027588, + "rewards/margins": 1.1554136276245117, + "rewards/rejected": -3.3190014362335205, + "step": 3440 + }, + { + "epoch": 0.8277351247600768, + "grad_norm": 15.606897326164194, + "learning_rate": 4.385683882688895e-08, + "logits/chosen": -0.5581148862838745, + "logits/rejected": -0.5503061413764954, + "logps/chosen": -502.58038330078125, + "logps/rejected": -550.5048828125, + "loss": 0.5052, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2417547702789307, + "rewards/margins": 0.969860851764679, + "rewards/rejected": -3.2116153240203857, + "step": 3450 + }, + { + "epoch": 0.8301343570057581, + "grad_norm": 14.412578218384434, + "learning_rate": 4.2679481847033985e-08, + "logits/chosen": -0.5228904485702515, + "logits/rejected": -0.5363970994949341, + "logps/chosen": -470.89544677734375, + "logps/rejected": -635.0361328125, + "loss": 0.4608, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.150740623474121, + "rewards/margins": 1.5161142349243164, + "rewards/rejected": -3.6668548583984375, + "step": 3460 + }, + { + "epoch": 0.8325335892514395, + "grad_norm": 12.138527821387553, + "learning_rate": 4.151666757785435e-08, + "logits/chosen": -0.6146914958953857, + "logits/rejected": -0.6073625087738037, + "logps/chosen": -406.7055969238281, + "logps/rejected": -636.0123291015625, + "loss": 0.4199, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6833385229110718, + "rewards/margins": 2.205997943878174, + "rewards/rejected": -3.889336347579956, + "step": 3470 + }, + { + "epoch": 0.8349328214971209, + "grad_norm": 14.790830386193752, + "learning_rate": 4.036847758615136e-08, + "logits/chosen": -0.4863740801811218, + "logits/rejected": -0.5530039668083191, + "logps/chosen": -498.92742919921875, + "logps/rejected": -627.8193359375, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.640145778656006, + "rewards/margins": 1.157325029373169, + "rewards/rejected": -3.797470808029175, + "step": 3480 + }, + { + "epoch": 0.8373320537428023, + "grad_norm": 10.798032597651305, + "learning_rate": 3.923499241289113e-08, + "logits/chosen": -0.6010452508926392, + "logits/rejected": -0.6536823511123657, + "logps/chosen": -528.5853881835938, + "logps/rejected": -596.8575439453125, + "loss": 0.4873, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2441864013671875, + "rewards/margins": 1.2915928363800049, + "rewards/rejected": -3.5357794761657715, + "step": 3490 + }, + { + "epoch": 0.8397312859884837, + "grad_norm": 10.865503254373744, + "learning_rate": 3.811629156755541e-08, + "logits/chosen": -0.5816788077354431, + "logits/rejected": -0.6040675640106201, + "logps/chosen": -496.4483947753906, + "logps/rejected": -629.3803100585938, + "loss": 0.4407, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0880205631256104, + "rewards/margins": 1.3838069438934326, + "rewards/rejected": -3.471827268600464, + "step": 3500 + }, + { + "epoch": 0.8421305182341651, + "grad_norm": 11.278652918262004, + "learning_rate": 3.701245352256391e-08, + "logits/chosen": -0.5675192475318909, + "logits/rejected": -0.6156761050224304, + "logps/chosen": -493.8138732910156, + "logps/rejected": -581.7841796875, + "loss": 0.449, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.045945882797241, + "rewards/margins": 1.155053973197937, + "rewards/rejected": -3.2009997367858887, + "step": 3510 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 13.271574282231718, + "learning_rate": 3.592355570776984e-08, + "logits/chosen": -0.6514331102371216, + "logits/rejected": -0.652426540851593, + "logps/chosen": -390.1234436035156, + "logps/rejected": -560.892822265625, + "loss": 0.4027, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7292102575302124, + "rewards/margins": 1.5589096546173096, + "rewards/rejected": -3.2881197929382324, + "step": 3520 + }, + { + "epoch": 0.8469289827255279, + "grad_norm": 12.264631757796657, + "learning_rate": 3.484967450502904e-08, + "logits/chosen": -0.5596613883972168, + "logits/rejected": -0.5722562670707703, + "logps/chosen": -371.233154296875, + "logps/rejected": -591.3458862304688, + "loss": 0.4278, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6561084985733032, + "rewards/margins": 1.7522211074829102, + "rewards/rejected": -3.408329725265503, + "step": 3530 + }, + { + "epoch": 0.8493282149712092, + "grad_norm": 21.707760756412107, + "learning_rate": 3.3790885242841296e-08, + "logits/chosen": -0.5783206224441528, + "logits/rejected": -0.6104044318199158, + "logps/chosen": -453.5904235839844, + "logps/rejected": -656.5443115234375, + "loss": 0.4022, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1034021377563477, + "rewards/margins": 2.0261425971984863, + "rewards/rejected": -4.129544258117676, + "step": 3540 + }, + { + "epoch": 0.8517274472168906, + "grad_norm": 17.547573796246525, + "learning_rate": 3.274726219106677e-08, + "logits/chosen": -0.5989304780960083, + "logits/rejected": -0.6391880512237549, + "logps/chosen": -502.286865234375, + "logps/rejected": -645.5471801757812, + "loss": 0.465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.106433868408203, + "rewards/margins": 1.479160189628601, + "rewards/rejected": -3.5855941772460938, + "step": 3550 + }, + { + "epoch": 0.8541266794625719, + "grad_norm": 12.613105176272517, + "learning_rate": 3.171887855571642e-08, + "logits/chosen": -0.5413884520530701, + "logits/rejected": -0.4987201690673828, + "logps/chosen": -401.0578918457031, + "logps/rejected": -514.5476684570312, + "loss": 0.4373, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.801924467086792, + "rewards/margins": 1.2051035165786743, + "rewards/rejected": -3.007028102874756, + "step": 3560 + }, + { + "epoch": 0.8565259117082533, + "grad_norm": 14.359058138666166, + "learning_rate": 3.070580647381643e-08, + "logits/chosen": -0.5530554056167603, + "logits/rejected": -0.5961068272590637, + "logps/chosen": -413.8770446777344, + "logps/rejected": -572.0676879882812, + "loss": 0.4628, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.856571912765503, + "rewards/margins": 1.5779253244400024, + "rewards/rejected": -3.434497356414795, + "step": 3570 + }, + { + "epoch": 0.8589251439539347, + "grad_norm": 13.165867775542413, + "learning_rate": 2.9708117008348576e-08, + "logits/chosen": -0.5645931959152222, + "logits/rejected": -0.5597985982894897, + "logps/chosen": -502.2027893066406, + "logps/rejected": -563.4842529296875, + "loss": 0.4342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9971221685409546, + "rewards/margins": 1.1243679523468018, + "rewards/rejected": -3.121490240097046, + "step": 3580 + }, + { + "epoch": 0.8613243761996161, + "grad_norm": 12.288962805449156, + "learning_rate": 2.8725880143264992e-08, + "logits/chosen": -0.6008769869804382, + "logits/rejected": -0.6012517213821411, + "logps/chosen": -473.57110595703125, + "logps/rejected": -637.85546875, + "loss": 0.4957, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3291287422180176, + "rewards/margins": 1.2044109106063843, + "rewards/rejected": -3.5335395336151123, + "step": 3590 + }, + { + "epoch": 0.8637236084452975, + "grad_norm": 20.135961770884553, + "learning_rate": 2.775916477857948e-08, + "logits/chosen": -0.5295973420143127, + "logits/rejected": -0.5415645837783813, + "logps/chosen": -432.3699645996094, + "logps/rejected": -547.469970703125, + "loss": 0.4302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.17122220993042, + "rewards/margins": 1.1380698680877686, + "rewards/rejected": -3.3092925548553467, + "step": 3600 + }, + { + "epoch": 0.8661228406909789, + "grad_norm": 14.482337367851747, + "learning_rate": 2.680803872553408e-08, + "logits/chosen": -0.5756790637969971, + "logits/rejected": -0.651614248752594, + "logps/chosen": -415.51275634765625, + "logps/rejected": -632.2459716796875, + "loss": 0.4528, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7809759378433228, + "rewards/margins": 2.2642104625701904, + "rewards/rejected": -4.045186519622803, + "step": 3610 + }, + { + "epoch": 0.8685220729366603, + "grad_norm": 17.202988883981018, + "learning_rate": 2.5872568701842706e-08, + "logits/chosen": -0.5497530698776245, + "logits/rejected": -0.566763162612915, + "logps/chosen": -387.01171875, + "logps/rejected": -564.3225708007812, + "loss": 0.4966, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8862674236297607, + "rewards/margins": 1.5258787870407104, + "rewards/rejected": -3.4121460914611816, + "step": 3620 + }, + { + "epoch": 0.8709213051823417, + "grad_norm": 16.587596939503936, + "learning_rate": 2.495282032701096e-08, + "logits/chosen": -0.5850919485092163, + "logits/rejected": -0.6418455839157104, + "logps/chosen": -336.73431396484375, + "logps/rejected": -482.5956115722656, + "loss": 0.4306, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7132467031478882, + "rewards/margins": 1.6492410898208618, + "rewards/rejected": -3.36248779296875, + "step": 3630 + }, + { + "epoch": 0.8733205374280231, + "grad_norm": 17.480004730447185, + "learning_rate": 2.4048858117733133e-08, + "logits/chosen": -0.6369383931159973, + "logits/rejected": -0.6253448724746704, + "logps/chosen": -445.79388427734375, + "logps/rejected": -605.3549194335938, + "loss": 0.4271, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9298584461212158, + "rewards/margins": 2.020317554473877, + "rewards/rejected": -3.9501757621765137, + "step": 3640 + }, + { + "epoch": 0.8757197696737045, + "grad_norm": 15.836939219932262, + "learning_rate": 2.3160745483366938e-08, + "logits/chosen": -0.5421168208122253, + "logits/rejected": -0.5399103760719299, + "logps/chosen": -435.59417724609375, + "logps/rejected": -604.5552978515625, + "loss": 0.432, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.094374179840088, + "rewards/margins": 1.313892126083374, + "rewards/rejected": -3.408266067504883, + "step": 3650 + }, + { + "epoch": 0.8781190019193857, + "grad_norm": 15.232342004495118, + "learning_rate": 2.2288544721485197e-08, + "logits/chosen": -0.6265963912010193, + "logits/rejected": -0.6633044481277466, + "logps/chosen": -367.7839660644531, + "logps/rejected": -573.351318359375, + "loss": 0.4105, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6053600311279297, + "rewards/margins": 1.8461459875106812, + "rewards/rejected": -3.4515061378479004, + "step": 3660 + }, + { + "epoch": 0.8805182341650671, + "grad_norm": 14.172769513689536, + "learning_rate": 2.1432317013506117e-08, + "logits/chosen": -0.65406334400177, + "logits/rejected": -0.6587377190589905, + "logps/chosen": -456.6194763183594, + "logps/rejected": -549.1373291015625, + "loss": 0.4751, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.102041721343994, + "rewards/margins": 1.3290612697601318, + "rewards/rejected": -3.431102752685547, + "step": 3670 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 14.701070733746729, + "learning_rate": 2.0592122420401704e-08, + "logits/chosen": -0.4700722098350525, + "logits/rejected": -0.46845799684524536, + "logps/chosen": -396.937255859375, + "logps/rejected": -518.0514526367188, + "loss": 0.4567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.834855318069458, + "rewards/margins": 1.0623095035552979, + "rewards/rejected": -2.8971645832061768, + "step": 3680 + }, + { + "epoch": 0.8853166986564299, + "grad_norm": 12.846048547846815, + "learning_rate": 1.976801987848459e-08, + "logits/chosen": -0.6387890577316284, + "logits/rejected": -0.6454359292984009, + "logps/chosen": -442.06005859375, + "logps/rejected": -623.1151733398438, + "loss": 0.4364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8863179683685303, + "rewards/margins": 1.654547095298767, + "rewards/rejected": -3.540865421295166, + "step": 3690 + }, + { + "epoch": 0.8877159309021113, + "grad_norm": 13.872418259324444, + "learning_rate": 1.8960067195273987e-08, + "logits/chosen": -0.6006834506988525, + "logits/rejected": -0.6487979888916016, + "logps/chosen": -395.2511901855469, + "logps/rejected": -565.4385986328125, + "loss": 0.4271, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9122810363769531, + "rewards/margins": 1.7357158660888672, + "rewards/rejected": -3.647996425628662, + "step": 3700 + }, + { + "epoch": 0.8901151631477927, + "grad_norm": 14.265685542528358, + "learning_rate": 1.816832104544072e-08, + "logits/chosen": -0.47106099128723145, + "logits/rejected": -0.5008233189582825, + "logps/chosen": -471.165283203125, + "logps/rejected": -570.4010620117188, + "loss": 0.4474, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.060793876647949, + "rewards/margins": 1.2439154386520386, + "rewards/rejected": -3.3047091960906982, + "step": 3710 + }, + { + "epoch": 0.8925143953934741, + "grad_norm": 12.43063163239937, + "learning_rate": 1.7392836966831553e-08, + "logits/chosen": -0.5043891668319702, + "logits/rejected": -0.541654646396637, + "logps/chosen": -437.30902099609375, + "logps/rejected": -612.6980590820312, + "loss": 0.4117, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.7850959300994873, + "rewards/margins": 2.040393352508545, + "rewards/rejected": -3.825489044189453, + "step": 3720 + }, + { + "epoch": 0.8949136276391555, + "grad_norm": 15.428500593517235, + "learning_rate": 1.663366935657373e-08, + "logits/chosen": -0.6029775738716125, + "logits/rejected": -0.6049376726150513, + "logps/chosen": -399.92132568359375, + "logps/rejected": -564.7398681640625, + "loss": 0.4785, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9421736001968384, + "rewards/margins": 1.540740966796875, + "rewards/rejected": -3.482914447784424, + "step": 3730 + }, + { + "epoch": 0.8973128598848369, + "grad_norm": 15.901745967879851, + "learning_rate": 1.5890871467258898e-08, + "logits/chosen": -0.5083228945732117, + "logits/rejected": -0.5067955851554871, + "logps/chosen": -519.1453857421875, + "logps/rejected": -613.5746459960938, + "loss": 0.4358, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.022681474685669, + "rewards/margins": 1.2650549411773682, + "rewards/rejected": -3.287736415863037, + "step": 3740 + }, + { + "epoch": 0.8997120921305183, + "grad_norm": 12.288479611102789, + "learning_rate": 1.5164495403207967e-08, + "logits/chosen": -0.6138381958007812, + "logits/rejected": -0.648627758026123, + "logps/chosen": -492.9188537597656, + "logps/rejected": -688.1244506835938, + "loss": 0.4429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.270721673965454, + "rewards/margins": 1.671618103981018, + "rewards/rejected": -3.9423396587371826, + "step": 3750 + }, + { + "epoch": 0.9021113243761996, + "grad_norm": 13.881705443551352, + "learning_rate": 1.4454592116815962e-08, + "logits/chosen": -0.5159580707550049, + "logits/rejected": -0.5592847466468811, + "logps/chosen": -446.95257568359375, + "logps/rejected": -608.6639404296875, + "loss": 0.4088, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.826229453086853, + "rewards/margins": 1.4294321537017822, + "rewards/rejected": -3.255661725997925, + "step": 3760 + }, + { + "epoch": 0.904510556621881, + "grad_norm": 9.457402626337869, + "learning_rate": 1.3761211404977934e-08, + "logits/chosen": -0.6075922250747681, + "logits/rejected": -0.6064502596855164, + "logps/chosen": -420.793701171875, + "logps/rejected": -630.1043701171875, + "loss": 0.3875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.974437952041626, + "rewards/margins": 2.114816427230835, + "rewards/rejected": -4.089253902435303, + "step": 3770 + }, + { + "epoch": 0.9069097888675623, + "grad_norm": 15.273283970687197, + "learning_rate": 1.3084401905596177e-08, + "logits/chosen": -0.5783820152282715, + "logits/rejected": -0.640872597694397, + "logps/chosen": -474.72100830078125, + "logps/rejected": -551.1171875, + "loss": 0.4556, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9363771677017212, + "rewards/margins": 1.1951727867126465, + "rewards/rejected": -3.1315500736236572, + "step": 3780 + }, + { + "epoch": 0.9093090211132437, + "grad_norm": 15.207248331238338, + "learning_rate": 1.2424211094168053e-08, + "logits/chosen": -0.44624510407447815, + "logits/rejected": -0.45766526460647583, + "logps/chosen": -504.5450134277344, + "logps/rejected": -639.1659545898438, + "loss": 0.4296, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8555368185043335, + "rewards/margins": 1.4501540660858154, + "rewards/rejected": -3.3056907653808594, + "step": 3790 + }, + { + "epoch": 0.9117082533589251, + "grad_norm": 12.766030520120715, + "learning_rate": 1.1780685280456143e-08, + "logits/chosen": -0.561526894569397, + "logits/rejected": -0.5872783064842224, + "logps/chosen": -519.5202026367188, + "logps/rejected": -713.41357421875, + "loss": 0.4758, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.273921251296997, + "rewards/margins": 1.8192304372787476, + "rewards/rejected": -4.093151569366455, + "step": 3800 + }, + { + "epoch": 0.9141074856046065, + "grad_norm": 14.582108396435707, + "learning_rate": 1.1153869605239564e-08, + "logits/chosen": -0.5614346265792847, + "logits/rejected": -0.5825585722923279, + "logps/chosen": -472.6640625, + "logps/rejected": -527.7772216796875, + "loss": 0.441, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9882261753082275, + "rewards/margins": 0.9708881378173828, + "rewards/rejected": -2.9591145515441895, + "step": 3810 + }, + { + "epoch": 0.9165067178502879, + "grad_norm": 13.50064006592574, + "learning_rate": 1.0543808037147606e-08, + "logits/chosen": -0.6324799060821533, + "logits/rejected": -0.6540195345878601, + "logps/chosen": -440.49188232421875, + "logps/rejected": -644.3251342773438, + "loss": 0.4312, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.86978018283844, + "rewards/margins": 2.010561227798462, + "rewards/rejected": -3.8803412914276123, + "step": 3820 + }, + { + "epoch": 0.9189059500959693, + "grad_norm": 10.258131441990278, + "learning_rate": 9.95054336957557e-09, + "logits/chosen": -0.5800519585609436, + "logits/rejected": -0.6037092208862305, + "logps/chosen": -437.89251708984375, + "logps/rejected": -572.6655883789062, + "loss": 0.3879, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8630110025405884, + "rewards/margins": 1.2645984888076782, + "rewards/rejected": -3.1276094913482666, + "step": 3830 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 14.86589440455775, + "learning_rate": 9.37411721768286e-09, + "logits/chosen": -0.530808687210083, + "logits/rejected": -0.5798245668411255, + "logps/chosen": -473.449951171875, + "logps/rejected": -675.644775390625, + "loss": 0.4163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.131751298904419, + "rewards/margins": 1.583280324935913, + "rewards/rejected": -3.715031385421753, + "step": 3840 + }, + { + "epoch": 0.9237044145873321, + "grad_norm": 13.05727695798275, + "learning_rate": 8.81457001547392e-09, + "logits/chosen": -0.5503281354904175, + "logits/rejected": -0.533139169216156, + "logps/chosen": -491.100341796875, + "logps/rejected": -626.0603637695312, + "loss": 0.4468, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2812390327453613, + "rewards/margins": 1.2485569715499878, + "rewards/rejected": -3.5297958850860596, + "step": 3850 + }, + { + "epoch": 0.9261036468330134, + "grad_norm": 14.54576174319674, + "learning_rate": 8.271941012961942e-09, + "logits/chosen": -0.48925477266311646, + "logits/rejected": -0.4920194149017334, + "logps/chosen": -403.09722900390625, + "logps/rejected": -654.4439086914062, + "loss": 0.4432, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.928261399269104, + "rewards/margins": 1.9341392517089844, + "rewards/rejected": -3.862400531768799, + "step": 3860 + }, + { + "epoch": 0.9285028790786948, + "grad_norm": 13.945738623677995, + "learning_rate": 7.746268273415568e-09, + "logits/chosen": -0.5374631285667419, + "logits/rejected": -0.5382106900215149, + "logps/chosen": -466.7080993652344, + "logps/rejected": -586.3713989257812, + "loss": 0.4421, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0114636421203613, + "rewards/margins": 0.7434648871421814, + "rewards/rejected": -2.7549285888671875, + "step": 3870 + }, + { + "epoch": 0.9309021113243762, + "grad_norm": 12.078781330743034, + "learning_rate": 7.237588670689076e-09, + "logits/chosen": -0.6346238255500793, + "logits/rejected": -0.696995198726654, + "logps/chosen": -439.1622619628906, + "logps/rejected": -595.7288818359375, + "loss": 0.4174, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9709396362304688, + "rewards/margins": 1.8637230396270752, + "rewards/rejected": -3.834662675857544, + "step": 3880 + }, + { + "epoch": 0.9333013435700576, + "grad_norm": 12.482916834754162, + "learning_rate": 6.745937886635606e-09, + "logits/chosen": -0.6023680567741394, + "logits/rejected": -0.597190797328949, + "logps/chosen": -462.8761291503906, + "logps/rejected": -663.2772827148438, + "loss": 0.42, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8327722549438477, + "rewards/margins": 1.956402063369751, + "rewards/rejected": -3.7891743183135986, + "step": 3890 + }, + { + "epoch": 0.935700575815739, + "grad_norm": 14.24690648354876, + "learning_rate": 6.271350408604409e-09, + "logits/chosen": -0.5814956426620483, + "logits/rejected": -0.5836547613143921, + "logps/chosen": -371.41156005859375, + "logps/rejected": -554.6315307617188, + "loss": 0.419, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.471068024635315, + "rewards/margins": 1.6207103729248047, + "rewards/rejected": -3.09177827835083, + "step": 3900 + }, + { + "epoch": 0.9380998080614203, + "grad_norm": 13.651182373973166, + "learning_rate": 5.813859527021487e-09, + "logits/chosen": -0.5507432222366333, + "logits/rejected": -0.5853307843208313, + "logps/chosen": -442.532470703125, + "logps/rejected": -599.6609497070312, + "loss": 0.4181, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9738194942474365, + "rewards/margins": 1.8410329818725586, + "rewards/rejected": -3.814852476119995, + "step": 3910 + }, + { + "epoch": 0.9404990403071017, + "grad_norm": 12.561972537735913, + "learning_rate": 5.373497333054616e-09, + "logits/chosen": -0.6120710968971252, + "logits/rejected": -0.620995044708252, + "logps/chosen": -499.99139404296875, + "logps/rejected": -590.181396484375, + "loss": 0.4722, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.298414945602417, + "rewards/margins": 0.9743332862854004, + "rewards/rejected": -3.2727482318878174, + "step": 3920 + }, + { + "epoch": 0.9428982725527831, + "grad_norm": 16.011643842899986, + "learning_rate": 4.950294716362213e-09, + "logits/chosen": -0.5541747212409973, + "logits/rejected": -0.5821543335914612, + "logps/chosen": -511.50970458984375, + "logps/rejected": -614.6409301757812, + "loss": 0.4314, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2209866046905518, + "rewards/margins": 1.0603018999099731, + "rewards/rejected": -3.2812886238098145, + "step": 3930 + }, + { + "epoch": 0.9452975047984645, + "grad_norm": 10.553547990487088, + "learning_rate": 4.544281362926422e-09, + "logits/chosen": -0.6090785264968872, + "logits/rejected": -0.6422208547592163, + "logps/chosen": -493.9991760253906, + "logps/rejected": -632.8602294921875, + "loss": 0.4246, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9454662799835205, + "rewards/margins": 1.474827527999878, + "rewards/rejected": -3.4202942848205566, + "step": 3940 + }, + { + "epoch": 0.9476967370441459, + "grad_norm": 12.9216379203511, + "learning_rate": 4.15548575297095e-09, + "logits/chosen": -0.6255580186843872, + "logits/rejected": -0.6386123895645142, + "logps/chosen": -431.75634765625, + "logps/rejected": -608.4300537109375, + "loss": 0.4087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9927335977554321, + "rewards/margins": 1.8527963161468506, + "rewards/rejected": -3.8455300331115723, + "step": 3950 + }, + { + "epoch": 0.9500959692898272, + "grad_norm": 9.979083154735912, + "learning_rate": 3.7839351589631366e-09, + "logits/chosen": -0.5646272301673889, + "logits/rejected": -0.5582197308540344, + "logps/chosen": -406.14178466796875, + "logps/rejected": -604.2752075195312, + "loss": 0.415, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8224601745605469, + "rewards/margins": 1.4527934789657593, + "rewards/rejected": -3.275254011154175, + "step": 3960 + }, + { + "epoch": 0.9524952015355086, + "grad_norm": 14.72280065382829, + "learning_rate": 3.4296556437010405e-09, + "logits/chosen": -0.6492162942886353, + "logits/rejected": -0.6573163866996765, + "logps/chosen": -392.5443420410156, + "logps/rejected": -550.0157470703125, + "loss": 0.4284, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9494428634643555, + "rewards/margins": 1.5496604442596436, + "rewards/rejected": -3.499103546142578, + "step": 3970 + }, + { + "epoch": 0.95489443378119, + "grad_norm": 19.52331839763403, + "learning_rate": 3.092672058485124e-09, + "logits/chosen": -0.5937837362289429, + "logits/rejected": -0.5867224931716919, + "logps/chosen": -420.0135192871094, + "logps/rejected": -637.5535888671875, + "loss": 0.4764, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9328027963638306, + "rewards/margins": 1.9535869359970093, + "rewards/rejected": -3.886389970779419, + "step": 3980 + }, + { + "epoch": 0.9572936660268714, + "grad_norm": 15.465373415357526, + "learning_rate": 2.7730080413750356e-09, + "logits/chosen": -0.5144689083099365, + "logits/rejected": -0.5366243720054626, + "logps/chosen": -462.44580078125, + "logps/rejected": -614.26904296875, + "loss": 0.4304, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9767814874649048, + "rewards/margins": 1.4872825145721436, + "rewards/rejected": -3.464064121246338, + "step": 3990 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 12.554501395226785, + "learning_rate": 2.4706860155316033e-09, + "logits/chosen": -0.5792466402053833, + "logits/rejected": -0.5846326947212219, + "logps/chosen": -533.231201171875, + "logps/rejected": -670.0733642578125, + "loss": 0.46, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0717945098876953, + "rewards/margins": 1.349689245223999, + "rewards/rejected": -3.4214844703674316, + "step": 4000 + }, + { + "epoch": 0.9596928982725528, + "eval_logits/chosen": -0.5574566721916199, + "eval_logits/rejected": -0.5789428949356079, + "eval_logps/chosen": -453.795654296875, + "eval_logps/rejected": -623.4196166992188, + "eval_loss": 0.42671090364456177, + "eval_rewards/accuracies": 0.8446428775787354, + "eval_rewards/chosen": -2.0115108489990234, + "eval_rewards/margins": 1.6149202585220337, + "eval_rewards/rejected": -3.6264309883117676, + "eval_runtime": 208.7971, + "eval_samples_per_second": 21.365, + "eval_steps_per_second": 0.335, + "step": 4000 + }, + { + "epoch": 0.9620921305182342, + "grad_norm": 14.306928956925486, + "learning_rate": 2.185727187643843e-09, + "logits/chosen": -0.5993860363960266, + "logits/rejected": -0.6200038194656372, + "logps/chosen": -403.6116943359375, + "logps/rejected": -601.7921752929688, + "loss": 0.4649, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9371687173843384, + "rewards/margins": 1.8979822397232056, + "rewards/rejected": -3.835150957107544, + "step": 4010 + }, + { + "epoch": 0.9644913627639156, + "grad_norm": 16.135414053750264, + "learning_rate": 1.9181515464413434e-09, + "logits/chosen": -0.5625258088111877, + "logits/rejected": -0.5911010503768921, + "logps/chosen": -544.6771240234375, + "logps/rejected": -725.96142578125, + "loss": 0.3909, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8679109811782837, + "rewards/margins": 1.8002150058746338, + "rewards/rejected": -3.668125867843628, + "step": 4020 + }, + { + "epoch": 0.966890595009597, + "grad_norm": 15.134608783297615, + "learning_rate": 1.6679778612923302e-09, + "logits/chosen": -0.5708626508712769, + "logits/rejected": -0.5663528442382812, + "logps/chosen": -496.3414001464844, + "logps/rejected": -607.8493041992188, + "loss": 0.403, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.051764965057373, + "rewards/margins": 1.1176114082336426, + "rewards/rejected": -3.1693766117095947, + "step": 4030 + }, + { + "epoch": 0.9692898272552783, + "grad_norm": 15.867680764731526, + "learning_rate": 1.43522368088686e-09, + "logits/chosen": -0.5292009115219116, + "logits/rejected": -0.5536502003669739, + "logps/chosen": -493.24053955078125, + "logps/rejected": -687.3652954101562, + "loss": 0.4896, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3322641849517822, + "rewards/margins": 1.9033256769180298, + "rewards/rejected": -4.235589504241943, + "step": 4040 + }, + { + "epoch": 0.9716890595009597, + "grad_norm": 18.145967539331632, + "learning_rate": 1.2199053320059993e-09, + "logits/chosen": -0.5495906472206116, + "logits/rejected": -0.5624712109565735, + "logps/chosen": -464.6747131347656, + "logps/rejected": -626.3593139648438, + "loss": 0.4444, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9679105281829834, + "rewards/margins": 1.4822068214416504, + "rewards/rejected": -3.4501171112060547, + "step": 4050 + }, + { + "epoch": 0.974088291746641, + "grad_norm": 11.736148609957327, + "learning_rate": 1.0220379183764338e-09, + "logits/chosen": -0.6397042870521545, + "logits/rejected": -0.6401645541191101, + "logps/chosen": -382.80255126953125, + "logps/rejected": -585.4246826171875, + "loss": 0.4335, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8114726543426514, + "rewards/margins": 1.9243446588516235, + "rewards/rejected": -3.7358174324035645, + "step": 4060 + }, + { + "epoch": 0.9764875239923224, + "grad_norm": 13.40993494403174, + "learning_rate": 8.416353196111503e-10, + "logits/chosen": -0.531282901763916, + "logits/rejected": -0.5239174365997314, + "logps/chosen": -447.2433166503906, + "logps/rejected": -588.00341796875, + "loss": 0.4917, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.130341053009033, + "rewards/margins": 1.5175530910491943, + "rewards/rejected": -3.6478943824768066, + "step": 4070 + }, + { + "epoch": 0.9788867562380038, + "grad_norm": 14.008572116854983, + "learning_rate": 6.787101902356873e-10, + "logits/chosen": -0.5458533763885498, + "logits/rejected": -0.530588686466217, + "logps/chosen": -478.8675842285156, + "logps/rejected": -640.27783203125, + "loss": 0.4184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1983494758605957, + "rewards/margins": 1.3467283248901367, + "rewards/rejected": -3.5450775623321533, + "step": 4080 + }, + { + "epoch": 0.9812859884836852, + "grad_norm": 17.14797402633899, + "learning_rate": 5.332739588005953e-10, + "logits/chosen": -0.6215013861656189, + "logits/rejected": -0.6441851854324341, + "logps/chosen": -390.38421630859375, + "logps/rejected": -607.4275512695312, + "loss": 0.4348, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.911041498184204, + "rewards/margins": 1.8259137868881226, + "rewards/rejected": -3.736954927444458, + "step": 4090 + }, + { + "epoch": 0.9836852207293666, + "grad_norm": 14.786463692836161, + "learning_rate": 4.053368270797164e-10, + "logits/chosen": -0.5029186010360718, + "logits/rejected": -0.5265758037567139, + "logps/chosen": -459.8993225097656, + "logps/rejected": -595.8419189453125, + "loss": 0.4479, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2223143577575684, + "rewards/margins": 1.3676447868347168, + "rewards/rejected": -3.589958667755127, + "step": 4100 + }, + { + "epoch": 0.986084452975048, + "grad_norm": 10.752700482691958, + "learning_rate": 2.949077693545354e-10, + "logits/chosen": -0.46297192573547363, + "logits/rejected": -0.522523045539856, + "logps/chosen": -502.4552307128906, + "logps/rejected": -644.851318359375, + "loss": 0.4713, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2377562522888184, + "rewards/margins": 1.0717887878417969, + "rewards/rejected": -3.3095450401306152, + "step": 4110 + }, + { + "epoch": 0.9884836852207294, + "grad_norm": 13.109993080936734, + "learning_rate": 2.0199453178471047e-10, + "logits/chosen": -0.5104750394821167, + "logits/rejected": -0.5807961225509644, + "logps/chosen": -529.2632446289062, + "logps/rejected": -605.51416015625, + "loss": 0.4171, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1927380561828613, + "rewards/margins": 1.1302746534347534, + "rewards/rejected": -3.323012590408325, + "step": 4120 + }, + { + "epoch": 0.9908829174664108, + "grad_norm": 14.68142360061518, + "learning_rate": 1.266036318647301e-10, + "logits/chosen": -0.5655652284622192, + "logits/rejected": -0.5766940116882324, + "logps/chosen": -503.4109802246094, + "logps/rejected": -674.8923950195312, + "loss": 0.4159, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8927507400512695, + "rewards/margins": 1.9182952642440796, + "rewards/rejected": -3.8110461235046387, + "step": 4130 + }, + { + "epoch": 0.9932821497120922, + "grad_norm": 16.922904240171835, + "learning_rate": 6.874035796672339e-11, + "logits/chosen": -0.5960877537727356, + "logits/rejected": -0.5835133194923401, + "logps/chosen": -456.58172607421875, + "logps/rejected": -612.15087890625, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7158613204956055, + "rewards/margins": 2.088901996612549, + "rewards/rejected": -3.8047633171081543, + "step": 4140 + }, + { + "epoch": 0.9956813819577736, + "grad_norm": 16.596879517738124, + "learning_rate": 2.8408768969423458e-11, + "logits/chosen": -0.6005167961120605, + "logits/rejected": -0.6138831377029419, + "logps/chosen": -461.8202209472656, + "logps/rejected": -612.9940185546875, + "loss": 0.4126, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8562465906143188, + "rewards/margins": 1.3878483772277832, + "rewards/rejected": -3.2440948486328125, + "step": 4150 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 15.505557355450042, + "learning_rate": 5.611693973617271e-12, + "logits/chosen": -0.5526952743530273, + "logits/rejected": -0.5355725288391113, + "logps/chosen": -412.329345703125, + "logps/rejected": -580.66357421875, + "loss": 0.4472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9485257863998413, + "rewards/margins": 1.4926297664642334, + "rewards/rejected": -3.4411556720733643, + "step": 4160 + }, + { + "epoch": 1.0, + "step": 4168, + "total_flos": 0.0, + "train_loss": 0.49609584714538074, + "train_runtime": 16148.8615, + "train_samples_per_second": 8.259, + "train_steps_per_second": 0.258 + } + ], + "logging_steps": 10, + "max_steps": 4168, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}