{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 931, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010741138560687433, "grad_norm": 38.01819610595703, "learning_rate": 5.3191489361702125e-09, "logits/chosen": -1.7826814651489258, "logits/rejected": -1.9013216495513916, "logps/chosen": -277.2808837890625, "logps/rejected": -667.4992065429688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010741138560687433, "grad_norm": 34.09270477294922, "learning_rate": 5.3191489361702123e-08, "logits/chosen": -2.232851266860962, "logits/rejected": -2.225285530090332, "logps/chosen": -305.5078125, "logps/rejected": -258.5682067871094, "loss": 0.6934, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.00024179136380553246, "rewards/margins": -0.0007453116122633219, "rewards/rejected": 0.000987103208899498, "step": 10 }, { "epoch": 0.021482277121374866, "grad_norm": 33.920143127441406, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -2.0934910774230957, "logits/rejected": -2.0479633808135986, "logps/chosen": -278.1986389160156, "logps/rejected": -270.8542175292969, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.002228913828730583, "rewards/margins": 0.0013251047348603606, "rewards/rejected": 0.0009038090938702226, "step": 20 }, { "epoch": 0.0322234156820623, "grad_norm": 36.32479476928711, "learning_rate": 1.5957446808510638e-07, "logits/chosen": -2.2630741596221924, "logits/rejected": -2.158916473388672, "logps/chosen": -329.6264343261719, "logps/rejected": -238.6651153564453, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": 0.015065526589751244, "rewards/margins": 0.008328847587108612, "rewards/rejected": 0.006736679468303919, "step": 30 }, { "epoch": 0.04296455424274973, "grad_norm": 29.183805465698242, "learning_rate": 2.127659574468085e-07, "logits/chosen": -2.290919542312622, "logits/rejected": -2.1671340465545654, "logps/chosen": -298.9222412109375, "logps/rejected": -290.89227294921875, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04827628284692764, "rewards/margins": 0.017906121909618378, "rewards/rejected": 0.030370160937309265, "step": 40 }, { "epoch": 0.05370569280343716, "grad_norm": 29.736236572265625, "learning_rate": 2.659574468085106e-07, "logits/chosen": -2.400148868560791, "logits/rejected": -2.2399775981903076, "logps/chosen": -286.6379699707031, "logps/rejected": -248.4271697998047, "loss": 0.6748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08774404227733612, "rewards/margins": 0.037349071353673935, "rewards/rejected": 0.05039495974779129, "step": 50 }, { "epoch": 0.0644468313641246, "grad_norm": 29.81789207458496, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -2.3034303188323975, "logits/rejected": -2.119563102722168, "logps/chosen": -292.1720275878906, "logps/rejected": -246.50967407226562, "loss": 0.6684, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1263999193906784, "rewards/margins": 0.08450669795274734, "rewards/rejected": 0.04189322888851166, "step": 60 }, { "epoch": 0.07518796992481203, "grad_norm": 28.368366241455078, "learning_rate": 3.7234042553191484e-07, "logits/chosen": -2.194551944732666, "logits/rejected": -2.0943870544433594, "logps/chosen": -270.94085693359375, "logps/rejected": -222.0772705078125, "loss": 0.6529, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15133240818977356, "rewards/margins": 0.09613539278507233, "rewards/rejected": 0.05519700050354004, "step": 70 }, { "epoch": 0.08592910848549946, "grad_norm": 32.63807678222656, "learning_rate": 4.25531914893617e-07, "logits/chosen": -2.110665798187256, "logits/rejected": -2.0879478454589844, "logps/chosen": -302.55810546875, "logps/rejected": -276.37298583984375, "loss": 0.6364, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12729570269584656, "rewards/margins": 0.13354265689849854, "rewards/rejected": -0.006246985401958227, "step": 80 }, { "epoch": 0.0966702470461869, "grad_norm": 33.7596549987793, "learning_rate": 4.787234042553192e-07, "logits/chosen": -2.011441707611084, "logits/rejected": -1.953314185142517, "logps/chosen": -275.5764465332031, "logps/rejected": -278.00384521484375, "loss": 0.6135, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.013616062700748444, "rewards/margins": 0.2872251570224762, "rewards/rejected": -0.30084121227264404, "step": 90 }, { "epoch": 0.10741138560687433, "grad_norm": 49.362213134765625, "learning_rate": 4.999366067542832e-07, "logits/chosen": -2.1004366874694824, "logits/rejected": -1.8379312753677368, "logps/chosen": -325.45672607421875, "logps/rejected": -329.73748779296875, "loss": 0.6027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.26145023107528687, "rewards/margins": 0.2924392819404602, "rewards/rejected": -0.5538895726203918, "step": 100 }, { "epoch": 0.11815252416756176, "grad_norm": 53.218753814697266, "learning_rate": 4.995493200065303e-07, "logits/chosen": -2.0087809562683105, "logits/rejected": -1.840013861656189, "logps/chosen": -386.00592041015625, "logps/rejected": -363.0046081542969, "loss": 0.5916, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.4549340605735779, "rewards/margins": 0.5379987359046936, "rewards/rejected": -0.9929327964782715, "step": 110 }, { "epoch": 0.1288936627282492, "grad_norm": 47.64997482299805, "learning_rate": 4.988105098641859e-07, "logits/chosen": -1.9923064708709717, "logits/rejected": -1.8889319896697998, "logps/chosen": -368.4234313964844, "logps/rejected": -384.9602355957031, "loss": 0.5813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.44473686814308167, "rewards/margins": 0.4787873327732086, "rewards/rejected": -0.9235240817070007, "step": 120 }, { "epoch": 0.13963480128893663, "grad_norm": 38.56282424926758, "learning_rate": 4.977212170395597e-07, "logits/chosen": -1.8189750909805298, "logits/rejected": -1.6740334033966064, "logps/chosen": -328.65789794921875, "logps/rejected": -318.7656555175781, "loss": 0.5653, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4469381272792816, "rewards/margins": 0.42922893166542053, "rewards/rejected": -0.8761669993400574, "step": 130 }, { "epoch": 0.15037593984962405, "grad_norm": 60.210453033447266, "learning_rate": 4.962829759464157e-07, "logits/chosen": -1.896112084388733, "logits/rejected": -1.7687921524047852, "logps/chosen": -346.6222229003906, "logps/rejected": -360.77880859375, "loss": 0.6072, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5491372346878052, "rewards/margins": 0.2989022731781006, "rewards/rejected": -0.8480396270751953, "step": 140 }, { "epoch": 0.1611170784103115, "grad_norm": 48.47090148925781, "learning_rate": 4.944978125385465e-07, "logits/chosen": -2.036083698272705, "logits/rejected": -1.8509035110473633, "logps/chosen": -321.4471130371094, "logps/rejected": -355.3517150878906, "loss": 0.5702, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.34621065855026245, "rewards/margins": 0.6038306951522827, "rewards/rejected": -0.9500414133071899, "step": 150 }, { "epoch": 0.17185821697099893, "grad_norm": 43.58736801147461, "learning_rate": 4.923682414559481e-07, "logits/chosen": -1.8674886226654053, "logits/rejected": -1.689318299293518, "logps/chosen": -357.22833251953125, "logps/rejected": -359.64666748046875, "loss": 0.5438, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3814140856266022, "rewards/margins": 0.5234302282333374, "rewards/rejected": -0.9048442840576172, "step": 160 }, { "epoch": 0.18259935553168635, "grad_norm": 68.03955841064453, "learning_rate": 4.898972624826147e-07, "logits/chosen": -1.8476066589355469, "logits/rejected": -1.6870009899139404, "logps/chosen": -403.6802673339844, "logps/rejected": -395.5543212890625, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5655828714370728, "rewards/margins": 0.6453245282173157, "rewards/rejected": -1.210907220840454, "step": 170 }, { "epoch": 0.1933404940923738, "grad_norm": 54.94865036010742, "learning_rate": 4.870883563209449e-07, "logits/chosen": -1.838039755821228, "logits/rejected": -1.630350112915039, "logps/chosen": -387.94305419921875, "logps/rejected": -392.7327880859375, "loss": 0.5404, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5131097435951233, "rewards/margins": 0.5987601280212402, "rewards/rejected": -1.1118699312210083, "step": 180 }, { "epoch": 0.20408163265306123, "grad_norm": 57.38792037963867, "learning_rate": 4.839454796887089e-07, "logits/chosen": -2.1099352836608887, "logits/rejected": -1.886162519454956, "logps/chosen": -425.6485290527344, "logps/rejected": -410.21478271484375, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": -0.5665225982666016, "rewards/margins": 0.683864414691925, "rewards/rejected": -1.2503869533538818, "step": 190 }, { "epoch": 0.21482277121374865, "grad_norm": 44.05226135253906, "learning_rate": 4.804730597454859e-07, "logits/chosen": -2.0036187171936035, "logits/rejected": -1.8792438507080078, "logps/chosen": -399.13043212890625, "logps/rejected": -396.62396240234375, "loss": 0.5527, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4406675696372986, "rewards/margins": 0.7380279302597046, "rewards/rejected": -1.1786954402923584, "step": 200 }, { "epoch": 0.22556390977443608, "grad_norm": 51.885616302490234, "learning_rate": 4.7667598785642125e-07, "logits/chosen": -2.023136615753174, "logits/rejected": -1.8735549449920654, "logps/chosen": -310.3551025390625, "logps/rejected": -374.1414489746094, "loss": 0.5322, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.42302098870277405, "rewards/margins": 0.5779639482498169, "rewards/rejected": -1.0009849071502686, "step": 210 }, { "epoch": 0.23630504833512353, "grad_norm": 81.05691528320312, "learning_rate": 4.725596127020879e-07, "logits/chosen": -1.7004687786102295, "logits/rejected": -1.7193877696990967, "logps/chosen": -350.72308349609375, "logps/rejected": -415.49591064453125, "loss": 0.5508, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7310193777084351, "rewards/margins": 0.5848621129989624, "rewards/rejected": -1.315881609916687, "step": 220 }, { "epoch": 0.24704618689581095, "grad_norm": 53.66835021972656, "learning_rate": 4.6812973274415924e-07, "logits/chosen": -1.7762218713760376, "logits/rejected": -1.6072883605957031, "logps/chosen": -355.7483825683594, "logps/rejected": -385.47930908203125, "loss": 0.5362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.48422494530677795, "rewards/margins": 0.607512354850769, "rewards/rejected": -1.091737151145935, "step": 230 }, { "epoch": 0.2577873254564984, "grad_norm": 61.94645309448242, "learning_rate": 4.633925880575046e-07, "logits/chosen": -1.9013745784759521, "logits/rejected": -1.7701694965362549, "logps/chosen": -420.7286682128906, "logps/rejected": -435.3941955566406, "loss": 0.518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7990320920944214, "rewards/margins": 0.7329047918319702, "rewards/rejected": -1.5319368839263916, "step": 240 }, { "epoch": 0.26852846401718583, "grad_norm": 72.49921417236328, "learning_rate": 4.5835485154021437e-07, "logits/chosen": -1.7423961162567139, "logits/rejected": -1.6719386577606201, "logps/chosen": -402.57159423828125, "logps/rejected": -477.9698791503906, "loss": 0.5094, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6995307803153992, "rewards/margins": 0.7330148220062256, "rewards/rejected": -1.4325456619262695, "step": 250 }, { "epoch": 0.27926960257787325, "grad_norm": 65.41045379638672, "learning_rate": 4.530236195139358e-07, "logits/chosen": -1.8140243291854858, "logits/rejected": -1.6735155582427979, "logps/chosen": -393.9617919921875, "logps/rejected": -421.1630859375, "loss": 0.5377, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5548108816146851, "rewards/margins": 0.7069646716117859, "rewards/rejected": -1.2617756128311157, "step": 260 }, { "epoch": 0.2900107411385607, "grad_norm": 49.067054748535156, "learning_rate": 4.474064017277605e-07, "logits/chosen": -1.8181359767913818, "logits/rejected": -1.7128044366836548, "logps/chosen": -370.6304016113281, "logps/rejected": -398.3502502441406, "loss": 0.544, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7615430951118469, "rewards/margins": 0.4647931158542633, "rewards/rejected": -1.2263362407684326, "step": 270 }, { "epoch": 0.3007518796992481, "grad_norm": 67.5071792602539, "learning_rate": 4.415111107797445e-07, "logits/chosen": -1.7189642190933228, "logits/rejected": -1.7779271602630615, "logps/chosen": -348.5400695800781, "logps/rejected": -425.5625915527344, "loss": 0.5203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6641637682914734, "rewards/margins": 0.7238661050796509, "rewards/rejected": -1.3880298137664795, "step": 280 }, { "epoch": 0.31149301825993553, "grad_norm": 50.27863311767578, "learning_rate": 4.3534605097096176e-07, "logits/chosen": -1.651114821434021, "logits/rejected": -1.5353507995605469, "logps/chosen": -335.2391357421875, "logps/rejected": -443.3240661621094, "loss": 0.5236, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7241489291191101, "rewards/margins": 0.6447311639785767, "rewards/rejected": -1.368880033493042, "step": 290 }, { "epoch": 0.322234156820623, "grad_norm": 52.84365463256836, "learning_rate": 4.289199066077922e-07, "logits/chosen": -1.712467908859253, "logits/rejected": -1.4087601900100708, "logps/chosen": -359.71502685546875, "logps/rejected": -427.78045654296875, "loss": 0.5071, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7437978386878967, "rewards/margins": 0.8904324769973755, "rewards/rejected": -1.6342302560806274, "step": 300 }, { "epoch": 0.33297529538131043, "grad_norm": 73.85507202148438, "learning_rate": 4.2224172976892166e-07, "logits/chosen": -1.4589178562164307, "logits/rejected": -1.3919631242752075, "logps/chosen": -395.29888916015625, "logps/rejected": -469.9727478027344, "loss": 0.5085, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9644006490707397, "rewards/margins": 0.9210098385810852, "rewards/rejected": -1.8854105472564697, "step": 310 }, { "epoch": 0.34371643394199786, "grad_norm": 163.49395751953125, "learning_rate": 4.1532092755428525e-07, "logits/chosen": -1.453866958618164, "logits/rejected": -1.392270803451538, "logps/chosen": -352.23748779296875, "logps/rejected": -432.1480407714844, "loss": 0.5065, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8710242509841919, "rewards/margins": 0.7999363541603088, "rewards/rejected": -1.6709606647491455, "step": 320 }, { "epoch": 0.3544575725026853, "grad_norm": 87.29029846191406, "learning_rate": 4.0816724883391707e-07, "logits/chosen": -1.7530879974365234, "logits/rejected": -1.5067665576934814, "logps/chosen": -385.63702392578125, "logps/rejected": -428.7237854003906, "loss": 0.5205, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8045473098754883, "rewards/margins": 0.8592826128005981, "rewards/rejected": -1.663830041885376, "step": 330 }, { "epoch": 0.3651987110633727, "grad_norm": 59.684043884277344, "learning_rate": 4.007907705153699e-07, "logits/chosen": -1.6779248714447021, "logits/rejected": -1.6681525707244873, "logps/chosen": -394.14007568359375, "logps/rejected": -467.6216735839844, "loss": 0.543, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5750120878219604, "rewards/margins": 0.646149754524231, "rewards/rejected": -1.2211616039276123, "step": 340 }, { "epoch": 0.37593984962406013, "grad_norm": 53.85585403442383, "learning_rate": 3.932018833490518e-07, "logits/chosen": -1.727351427078247, "logits/rejected": -1.599186658859253, "logps/chosen": -391.8357849121094, "logps/rejected": -442.3279724121094, "loss": 0.5156, "rewards/accuracies": 0.75, "rewards/chosen": -0.6497470140457153, "rewards/margins": 0.7530845403671265, "rewards/rejected": -1.4028315544128418, "step": 350 }, { "epoch": 0.3866809881847476, "grad_norm": 55.50624084472656, "learning_rate": 3.854112772914723e-07, "logits/chosen": -1.479723572731018, "logits/rejected": -1.5374826192855835, "logps/chosen": -342.92791748046875, "logps/rejected": -446.47503662109375, "loss": 0.5091, "rewards/accuracies": 0.75, "rewards/chosen": -0.8904348611831665, "rewards/margins": 0.907945990562439, "rewards/rejected": -1.7983808517456055, "step": 360 }, { "epoch": 0.39742212674543503, "grad_norm": 53.516597747802734, "learning_rate": 3.774299264470177e-07, "logits/chosen": -1.506805181503296, "logits/rejected": -1.5588204860687256, "logps/chosen": -392.9142150878906, "logps/rejected": -493.38812255859375, "loss": 0.5008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9804698824882507, "rewards/margins": 0.8168279528617859, "rewards/rejected": -1.7972980737686157, "step": 370 }, { "epoch": 0.40816326530612246, "grad_norm": 57.87943649291992, "learning_rate": 3.6926907360946604e-07, "logits/chosen": -1.7936798334121704, "logits/rejected": -1.573922872543335, "logps/chosen": -382.94732666015625, "logps/rejected": -445.59423828125, "loss": 0.5065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.781761109828949, "rewards/margins": 0.822665810585022, "rewards/rejected": -1.6044269800186157, "step": 380 }, { "epoch": 0.4189044038668099, "grad_norm": 54.115848541259766, "learning_rate": 3.609402144250171e-07, "logits/chosen": -1.8267205953598022, "logits/rejected": -1.5302643775939941, "logps/chosen": -418.0675354003906, "logps/rejected": -446.8143005371094, "loss": 0.4838, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7378665208816528, "rewards/margins": 1.0708403587341309, "rewards/rejected": -1.8087068796157837, "step": 390 }, { "epoch": 0.4296455424274973, "grad_norm": 48.267066955566406, "learning_rate": 3.5245508119914683e-07, "logits/chosen": -1.695697546005249, "logits/rejected": -1.5788294076919556, "logps/chosen": -400.60382080078125, "logps/rejected": -472.8611755371094, "loss": 0.4735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0140430927276611, "rewards/margins": 0.6619123220443726, "rewards/rejected": -1.6759554147720337, "step": 400 }, { "epoch": 0.44038668098818473, "grad_norm": 54.982364654541016, "learning_rate": 3.4382562637009484e-07, "logits/chosen": -1.76909601688385, "logits/rejected": -1.6182836294174194, "logps/chosen": -387.72088623046875, "logps/rejected": -486.9130859375, "loss": 0.5232, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9676576852798462, "rewards/margins": 0.9475955963134766, "rewards/rejected": -1.9152532815933228, "step": 410 }, { "epoch": 0.45112781954887216, "grad_norm": 63.29111099243164, "learning_rate": 3.350640056722662e-07, "logits/chosen": -1.5833942890167236, "logits/rejected": -1.4724894762039185, "logps/chosen": -380.0611877441406, "logps/rejected": -419.60699462890625, "loss": 0.506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9589203596115112, "rewards/margins": 0.64564049243927, "rewards/rejected": -1.6045608520507812, "step": 420 }, { "epoch": 0.46186895810955964, "grad_norm": 54.812477111816406, "learning_rate": 3.261825610132634e-07, "logits/chosen": -1.825090765953064, "logits/rejected": -1.742057204246521, "logps/chosen": -434.72491455078125, "logps/rejected": -475.3501892089844, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": -0.6500536799430847, "rewards/margins": 0.9928058385848999, "rewards/rejected": -1.642859697341919, "step": 430 }, { "epoch": 0.47261009667024706, "grad_norm": 55.682682037353516, "learning_rate": 3.1719380308866924e-07, "logits/chosen": -1.8630441427230835, "logits/rejected": -1.7151546478271484, "logps/chosen": -378.4299011230469, "logps/rejected": -382.890869140625, "loss": 0.4969, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6695320010185242, "rewards/margins": 0.7351147532463074, "rewards/rejected": -1.404646873474121, "step": 440 }, { "epoch": 0.4833512352309345, "grad_norm": 55.0916862487793, "learning_rate": 3.0811039375906764e-07, "logits/chosen": -1.9064140319824219, "logits/rejected": -1.714078664779663, "logps/chosen": -397.0462951660156, "logps/rejected": -441.3729553222656, "loss": 0.5073, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7625714540481567, "rewards/margins": 0.8189951181411743, "rewards/rejected": -1.581566572189331, "step": 450 }, { "epoch": 0.4940923737916219, "grad_norm": 58.015785217285156, "learning_rate": 2.9894512821413075e-07, "logits/chosen": -1.7865148782730103, "logits/rejected": -1.5167757272720337, "logps/chosen": -416.95550537109375, "logps/rejected": -436.30450439453125, "loss": 0.4966, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9333486557006836, "rewards/margins": 0.8736823797225952, "rewards/rejected": -1.807031273841858, "step": 460 }, { "epoch": 0.5048335123523093, "grad_norm": 68.10478210449219, "learning_rate": 2.8971091694889264e-07, "logits/chosen": -1.7999794483184814, "logits/rejected": -1.7058521509170532, "logps/chosen": -399.33038330078125, "logps/rejected": -435.877197265625, "loss": 0.5401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9552398920059204, "rewards/margins": 0.6859758496284485, "rewards/rejected": -1.6412159204483032, "step": 470 }, { "epoch": 0.5155746509129968, "grad_norm": 72.0062255859375, "learning_rate": 2.8042076757760066e-07, "logits/chosen": -1.8483903408050537, "logits/rejected": -1.52974534034729, "logps/chosen": -399.46746826171875, "logps/rejected": -427.24981689453125, "loss": 0.5003, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8091894388198853, "rewards/margins": 0.6731428503990173, "rewards/rejected": -1.4823323488235474, "step": 480 }, { "epoch": 0.5263157894736842, "grad_norm": 61.39365768432617, "learning_rate": 2.7108776651076116e-07, "logits/chosen": -1.7545459270477295, "logits/rejected": -1.6814851760864258, "logps/chosen": -398.98699951171875, "logps/rejected": -491.4130859375, "loss": 0.4783, "rewards/accuracies": 0.75, "rewards/chosen": -0.6736662983894348, "rewards/margins": 0.9081080555915833, "rewards/rejected": -1.5817744731903076, "step": 490 }, { "epoch": 0.5370569280343717, "grad_norm": 54.34407424926758, "learning_rate": 2.6172506052119147e-07, "logits/chosen": -1.7606862783432007, "logits/rejected": -1.576406478881836, "logps/chosen": -404.3669128417969, "logps/rejected": -468.853515625, "loss": 0.5235, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9447774887084961, "rewards/margins": 0.9778174161911011, "rewards/rejected": -1.9225950241088867, "step": 500 }, { "epoch": 0.547798066595059, "grad_norm": 76.07147216796875, "learning_rate": 2.523458382250412e-07, "logits/chosen": -1.7519705295562744, "logits/rejected": -1.6435455083847046, "logps/chosen": -410.5835876464844, "logps/rejected": -490.4002380371094, "loss": 0.4922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7955667972564697, "rewards/margins": 0.8166013956069946, "rewards/rejected": -1.612168312072754, "step": 510 }, { "epoch": 0.5585392051557465, "grad_norm": 56.88628387451172, "learning_rate": 2.429633115038737e-07, "logits/chosen": -1.7606322765350342, "logits/rejected": -1.561812162399292, "logps/chosen": -399.27294921875, "logps/rejected": -455.17462158203125, "loss": 0.4751, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7019720673561096, "rewards/margins": 1.1464049816131592, "rewards/rejected": -1.8483772277832031, "step": 520 }, { "epoch": 0.569280343716434, "grad_norm": 108.08949279785156, "learning_rate": 2.3359069689397467e-07, "logits/chosen": -1.5748560428619385, "logits/rejected": -1.5276682376861572, "logps/chosen": -402.18023681640625, "logps/rejected": -493.3003845214844, "loss": 0.509, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0235780477523804, "rewards/margins": 0.8902117609977722, "rewards/rejected": -1.9137897491455078, "step": 530 }, { "epoch": 0.5800214822771214, "grad_norm": 81.13088989257812, "learning_rate": 2.2424119696910278e-07, "logits/chosen": -1.6776129007339478, "logits/rejected": -1.4775701761245728, "logps/chosen": -423.53778076171875, "logps/rejected": -458.8199768066406, "loss": 0.4854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8803874254226685, "rewards/margins": 0.8511655926704407, "rewards/rejected": -1.7315528392791748, "step": 540 }, { "epoch": 0.5907626208378088, "grad_norm": 55.828590393066406, "learning_rate": 2.1492798174291006e-07, "logits/chosen": -1.6470720767974854, "logits/rejected": -1.4030835628509521, "logps/chosen": -370.0120544433594, "logps/rejected": -398.987060546875, "loss": 0.4997, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6602271795272827, "rewards/margins": 0.8755744099617004, "rewards/rejected": -1.535801649093628, "step": 550 }, { "epoch": 0.6015037593984962, "grad_norm": 50.89604187011719, "learning_rate": 2.0566417011722652e-07, "logits/chosen": -1.7970349788665771, "logits/rejected": -1.5006736516952515, "logps/chosen": -410.7815856933594, "logps/rejected": -538.999267578125, "loss": 0.4874, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8133628964424133, "rewards/margins": 1.2357568740844727, "rewards/rejected": -2.049119710922241, "step": 560 }, { "epoch": 0.6122448979591837, "grad_norm": 61.7906494140625, "learning_rate": 1.9646281140234222e-07, "logits/chosen": -1.5956964492797852, "logits/rejected": -1.560612440109253, "logps/chosen": -411.87615966796875, "logps/rejected": -522.2528076171875, "loss": 0.4574, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0511554479599, "rewards/margins": 1.075621485710144, "rewards/rejected": -2.126776933670044, "step": 570 }, { "epoch": 0.6229860365198711, "grad_norm": 69.56195831298828, "learning_rate": 1.8733686693531982e-07, "logits/chosen": -1.601663589477539, "logits/rejected": -1.5559356212615967, "logps/chosen": -434.58880615234375, "logps/rejected": -539.5415649414062, "loss": 0.4856, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2848224639892578, "rewards/margins": 0.8390380144119263, "rewards/rejected": -2.1238605976104736, "step": 580 }, { "epoch": 0.6337271750805585, "grad_norm": 67.03250122070312, "learning_rate": 1.782991918222275e-07, "logits/chosen": -1.6579906940460205, "logits/rejected": -1.3883017301559448, "logps/chosen": -424.90643310546875, "logps/rejected": -479.0643005371094, "loss": 0.4948, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1206614971160889, "rewards/margins": 0.9874579310417175, "rewards/rejected": -2.108119487762451, "step": 590 }, { "epoch": 0.644468313641246, "grad_norm": 49.616844177246094, "learning_rate": 1.693625168300127e-07, "logits/chosen": -1.5930492877960205, "logits/rejected": -1.3797900676727295, "logps/chosen": -399.57135009765625, "logps/rejected": -451.79022216796875, "loss": 0.5333, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0476266145706177, "rewards/margins": 0.8083570599555969, "rewards/rejected": -1.8559834957122803, "step": 600 }, { "epoch": 0.6552094522019334, "grad_norm": 66.87409973144531, "learning_rate": 1.6053943045352516e-07, "logits/chosen": -1.5892466306686401, "logits/rejected": -1.5528477430343628, "logps/chosen": -399.47808837890625, "logps/rejected": -506.40374755859375, "loss": 0.4995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6764686107635498, "rewards/margins": 0.8436732292175293, "rewards/rejected": -1.5201416015625, "step": 610 }, { "epoch": 0.6659505907626209, "grad_norm": 55.837158203125, "learning_rate": 1.5184236118294787e-07, "logits/chosen": -1.5849072933197021, "logits/rejected": -1.4045408964157104, "logps/chosen": -453.03070068359375, "logps/rejected": -508.41705322265625, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2203357219696045, "rewards/margins": 0.8617902994155884, "rewards/rejected": -2.0821259021759033, "step": 620 }, { "epoch": 0.6766917293233082, "grad_norm": 45.06595230102539, "learning_rate": 1.4328355999661586e-07, "logits/chosen": -1.540052890777588, "logits/rejected": -1.3541651964187622, "logps/chosen": -389.3322448730469, "logps/rejected": -468.869384765625, "loss": 0.5054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0844593048095703, "rewards/margins": 1.0829648971557617, "rewards/rejected": -2.167423963546753, "step": 630 }, { "epoch": 0.6874328678839957, "grad_norm": 60.282310485839844, "learning_rate": 1.3487508310388534e-07, "logits/chosen": -1.434007167816162, "logits/rejected": -1.4812556505203247, "logps/chosen": -397.49090576171875, "logps/rejected": -508.686767578125, "loss": 0.5134, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.238869309425354, "rewards/margins": 0.7315988540649414, "rewards/rejected": -1.9704681634902954, "step": 640 }, { "epoch": 0.6981740064446831, "grad_norm": 45.42427444458008, "learning_rate": 1.266287749623594e-07, "logits/chosen": -1.7065637111663818, "logits/rejected": -1.4134385585784912, "logps/chosen": -430.59197998046875, "logps/rejected": -482.938720703125, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": -1.0648348331451416, "rewards/margins": 1.0008587837219238, "rewards/rejected": -2.0656933784484863, "step": 650 }, { "epoch": 0.7089151450053706, "grad_norm": 55.1621208190918, "learning_rate": 1.1855625159339452e-07, "logits/chosen": -1.6233896017074585, "logits/rejected": -1.4236741065979004, "logps/chosen": -445.6341247558594, "logps/rejected": -489.8377380371094, "loss": 0.4783, "rewards/accuracies": 0.75, "rewards/chosen": -1.1494815349578857, "rewards/margins": 0.9501803517341614, "rewards/rejected": -2.0996615886688232, "step": 660 }, { "epoch": 0.719656283566058, "grad_norm": 64.62177276611328, "learning_rate": 1.1066888421939092e-07, "logits/chosen": -1.6147754192352295, "logits/rejected": -1.3309791088104248, "logps/chosen": -438.37042236328125, "logps/rejected": -420.2880859375, "loss": 0.4984, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.012224555015564, "rewards/margins": 0.7375470399856567, "rewards/rejected": -1.7497714757919312, "step": 670 }, { "epoch": 0.7303974221267454, "grad_norm": 59.346073150634766, "learning_rate": 1.0297778324591339e-07, "logits/chosen": -1.5413581132888794, "logits/rejected": -1.2392480373382568, "logps/chosen": -423.99853515625, "logps/rejected": -454.7129821777344, "loss": 0.5099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9406285285949707, "rewards/margins": 1.0149680376052856, "rewards/rejected": -1.9555965662002563, "step": 680 }, { "epoch": 0.7411385606874329, "grad_norm": 49.851619720458984, "learning_rate": 9.549378261120816e-08, "logits/chosen": -1.583778977394104, "logits/rejected": -1.4962458610534668, "logps/chosen": -394.4914245605469, "logps/rejected": -450.92218017578125, "loss": 0.4907, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9432915449142456, "rewards/margins": 0.7003445029258728, "rewards/rejected": -1.6436359882354736, "step": 690 }, { "epoch": 0.7518796992481203, "grad_norm": 87.44384002685547, "learning_rate": 8.822742452516063e-08, "logits/chosen": -1.6286656856536865, "logits/rejected": -1.4821763038635254, "logps/chosen": -421.04083251953125, "logps/rejected": -503.68365478515625, "loss": 0.5017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.912327766418457, "rewards/margins": 0.9774397015571594, "rewards/rejected": -1.8897672891616821, "step": 700 }, { "epoch": 0.7626208378088077, "grad_norm": 50.42548751831055, "learning_rate": 8.118894461919074e-08, "logits/chosen": -1.4116138219833374, "logits/rejected": -1.2908035516738892, "logps/chosen": -368.16656494140625, "logps/rejected": -451.15087890625, "loss": 0.4812, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0490543842315674, "rewards/margins": 0.8819605708122253, "rewards/rejected": -1.9310150146484375, "step": 710 }, { "epoch": 0.7733619763694952, "grad_norm": 53.07373809814453, "learning_rate": 7.438825752800579e-08, "logits/chosen": -1.6596364974975586, "logits/rejected": -1.556316614151001, "logps/chosen": -442.6805114746094, "logps/rejected": -490.9193420410156, "loss": 0.4715, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8844410181045532, "rewards/margins": 0.9804731607437134, "rewards/rejected": -1.8649142980575562, "step": 720 }, { "epoch": 0.7841031149301826, "grad_norm": 53.835243225097656, "learning_rate": 6.783494292351907e-08, "logits/chosen": -1.4840538501739502, "logits/rejected": -1.2394304275512695, "logps/chosen": -376.4036560058594, "logps/rejected": -482.4645080566406, "loss": 0.4999, "rewards/accuracies": 0.875, "rewards/chosen": -0.9420751333236694, "rewards/margins": 1.132046103477478, "rewards/rejected": -2.0741212368011475, "step": 730 }, { "epoch": 0.7948442534908701, "grad_norm": 62.356510162353516, "learning_rate": 6.153823202060846e-08, "logits/chosen": -1.4445642232894897, "logits/rejected": -1.271066665649414, "logps/chosen": -390.6706848144531, "logps/rejected": -478.68463134765625, "loss": 0.4794, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.236981749534607, "rewards/margins": 0.9016791582107544, "rewards/rejected": -2.1386606693267822, "step": 740 }, { "epoch": 0.8055853920515574, "grad_norm": 79.2643051147461, "learning_rate": 5.550699457372282e-08, "logits/chosen": -1.568433403968811, "logits/rejected": -1.4340345859527588, "logps/chosen": -396.7769470214844, "logps/rejected": -476.18585205078125, "loss": 0.4811, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8553333282470703, "rewards/margins": 1.078909158706665, "rewards/rejected": -1.9342422485351562, "step": 750 }, { "epoch": 0.8163265306122449, "grad_norm": 75.26664733886719, "learning_rate": 4.97497263826539e-08, "logits/chosen": -1.6515172719955444, "logits/rejected": -1.4298070669174194, "logps/chosen": -439.5928649902344, "logps/rejected": -471.7366638183594, "loss": 0.4988, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0839134454727173, "rewards/margins": 0.8585917353630066, "rewards/rejected": -1.942505121231079, "step": 760 }, { "epoch": 0.8270676691729323, "grad_norm": 55.086814880371094, "learning_rate": 4.4274537325072384e-08, "logits/chosen": -1.7422411441802979, "logits/rejected": -1.465787649154663, "logps/chosen": -413.664306640625, "logps/rejected": -463.29180908203125, "loss": 0.4741, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8051220178604126, "rewards/margins": 1.0510156154632568, "rewards/rejected": -1.8561376333236694, "step": 770 }, { "epoch": 0.8378088077336198, "grad_norm": 52.02236557006836, "learning_rate": 3.9089139932687534e-08, "logits/chosen": -1.5885370969772339, "logits/rejected": -1.232320785522461, "logps/chosen": -414.0470275878906, "logps/rejected": -457.7579650878906, "loss": 0.4909, "rewards/accuracies": 0.75, "rewards/chosen": -1.0070761442184448, "rewards/margins": 1.148498773574829, "rewards/rejected": -2.1555750370025635, "step": 780 }, { "epoch": 0.8485499462943072, "grad_norm": 57.56011199951172, "learning_rate": 3.4200838527120525e-08, "logits/chosen": -1.5626875162124634, "logits/rejected": -1.446763277053833, "logps/chosen": -366.77081298828125, "logps/rejected": -446.97760009765625, "loss": 0.4699, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8740879893302917, "rewards/margins": 0.9469636082649231, "rewards/rejected": -1.8210515975952148, "step": 790 }, { "epoch": 0.8592910848549946, "grad_norm": 66.50774383544922, "learning_rate": 2.9616518930796447e-08, "logits/chosen": -1.6846020221710205, "logits/rejected": -1.5415384769439697, "logps/chosen": -471.2176818847656, "logps/rejected": -504.05035400390625, "loss": 0.4866, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.8976691961288452, "rewards/margins": 1.1193997859954834, "rewards/rejected": -2.017069101333618, "step": 800 }, { "epoch": 0.8700322234156821, "grad_norm": 49.84431457519531, "learning_rate": 2.534263876734802e-08, "logits/chosen": -1.6844720840454102, "logits/rejected": -1.4568878412246704, "logps/chosen": -412.49371337890625, "logps/rejected": -470.7568359375, "loss": 0.4711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9692155718803406, "rewards/margins": 1.001009464263916, "rewards/rejected": -1.9702249765396118, "step": 810 }, { "epoch": 0.8807733619763695, "grad_norm": 54.95064926147461, "learning_rate": 2.138521836519408e-08, "logits/chosen": -1.6106141805648804, "logits/rejected": -1.4224177598953247, "logps/chosen": -422.16796875, "logps/rejected": -472.8387145996094, "loss": 0.4862, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0354992151260376, "rewards/margins": 0.9904156923294067, "rewards/rejected": -2.0259149074554443, "step": 820 }, { "epoch": 0.8915145005370569, "grad_norm": 59.206207275390625, "learning_rate": 1.7749832277106524e-08, "logits/chosen": -1.5170490741729736, "logits/rejected": -1.3669121265411377, "logps/chosen": -387.67791748046875, "logps/rejected": -476.3672790527344, "loss": 0.4937, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0796597003936768, "rewards/margins": 1.1205719709396362, "rewards/rejected": -2.2002315521240234, "step": 830 }, { "epoch": 0.9022556390977443, "grad_norm": 76.18675231933594, "learning_rate": 1.4441601427711836e-08, "logits/chosen": -1.6329158544540405, "logits/rejected": -1.3677482604980469, "logps/chosen": -407.67523193359375, "logps/rejected": -457.119384765625, "loss": 0.4827, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0345627069473267, "rewards/margins": 0.9469528198242188, "rewards/rejected": -1.9815152883529663, "step": 840 }, { "epoch": 0.9129967776584318, "grad_norm": 52.468204498291016, "learning_rate": 1.1465185899987794e-08, "logits/chosen": -1.5776712894439697, "logits/rejected": -1.3938223123550415, "logps/chosen": -396.31195068359375, "logps/rejected": -443.401123046875, "loss": 0.5082, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0158368349075317, "rewards/margins": 0.9373232126235962, "rewards/rejected": -1.953160047531128, "step": 850 }, { "epoch": 0.9237379162191193, "grad_norm": 69.90252685546875, "learning_rate": 8.824778370916996e-09, "logits/chosen": -1.5731096267700195, "logits/rejected": -1.47088623046875, "logps/chosen": -432.2276916503906, "logps/rejected": -516.0203857421875, "loss": 0.4968, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9636839628219604, "rewards/margins": 0.9173566102981567, "rewards/rejected": -1.881040334701538, "step": 860 }, { "epoch": 0.9344790547798066, "grad_norm": 53.79716873168945, "learning_rate": 6.5240982055440044e-09, "logits/chosen": -1.592310905456543, "logits/rejected": -1.4055845737457275, "logps/chosen": -392.08526611328125, "logps/rejected": -461.09051513671875, "loss": 0.5285, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1570608615875244, "rewards/margins": 0.8815321922302246, "rewards/rejected": -2.038593292236328, "step": 870 }, { "epoch": 0.9452201933404941, "grad_norm": 55.78624725341797, "learning_rate": 4.56638621775518e-09, "logits/chosen": -1.7353003025054932, "logits/rejected": -1.592315435409546, "logps/chosen": -423.08868408203125, "logps/rejected": -525.6699829101562, "loss": 0.4699, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9968613386154175, "rewards/margins": 0.8165915608406067, "rewards/rejected": -1.8134530782699585, "step": 880 }, { "epoch": 0.9559613319011815, "grad_norm": 64.26811981201172, "learning_rate": 2.9544001051613478e-09, "logits/chosen": -1.5674078464508057, "logits/rejected": -1.477450966835022, "logps/chosen": -431.13037109375, "logps/rejected": -499.76849365234375, "loss": 0.4941, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.012650489807129, "rewards/margins": 0.9235459566116333, "rewards/rejected": -1.9361963272094727, "step": 890 }, { "epoch": 0.966702470461869, "grad_norm": 63.50950622558594, "learning_rate": 1.690410564514244e-09, "logits/chosen": -1.5680744647979736, "logits/rejected": -1.3493303060531616, "logps/chosen": -389.19879150390625, "logps/rejected": -441.45098876953125, "loss": 0.4976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0372302532196045, "rewards/margins": 0.7519098520278931, "rewards/rejected": -1.7891401052474976, "step": 900 }, { "epoch": 0.9774436090225563, "grad_norm": 62.333988189697266, "learning_rate": 7.761980931282053e-10, "logits/chosen": -1.7527889013290405, "logits/rejected": -1.5912810564041138, "logps/chosen": -469.765625, "logps/rejected": -504.1661071777344, "loss": 0.4791, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1952266693115234, "rewards/margins": 0.7948960065841675, "rewards/rejected": -1.9901227951049805, "step": 910 }, { "epoch": 0.9881847475832438, "grad_norm": 56.70859909057617, "learning_rate": 2.1305048081288191e-10, "logits/chosen": -1.6409803628921509, "logits/rejected": -1.3718929290771484, "logps/chosen": -389.1742858886719, "logps/rejected": -463.9933166503906, "loss": 0.4863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9753316044807434, "rewards/margins": 1.0109294652938843, "rewards/rejected": -1.986261010169983, "step": 920 }, { "epoch": 0.9989258861439313, "grad_norm": 53.95326614379883, "learning_rate": 1.7609958501973998e-12, "logits/chosen": -1.5267393589019775, "logits/rejected": -1.3435981273651123, "logps/chosen": -425.5747985839844, "logps/rejected": -472.15435791015625, "loss": 0.4831, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9775764346122742, "rewards/margins": 0.9130144119262695, "rewards/rejected": -1.8905909061431885, "step": 930 }, { "epoch": 1.0, "step": 931, "total_flos": 0.0, "train_loss": 0.524798985838762, "train_runtime": 12257.7363, "train_samples_per_second": 4.859, "train_steps_per_second": 0.076 } ], "logging_steps": 10, "max_steps": 931, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }