{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0075282308657467, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1e-07, "logits/chosen": -2.255735158920288, "logits/rejected": -1.715524435043335, "logps/chosen": -406.91876220703125, "logps/rejected": -222.7034454345703, "loss": 0.6912, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.01643981970846653, "rewards/margins": 0.02151082456111908, "rewards/rejected": -0.00507100485265255, "step": 1 }, { "epoch": 0.01, "learning_rate": 2e-07, "logits/chosen": -2.270268440246582, "logits/rejected": -1.7129734754562378, "logps/chosen": -403.281982421875, "logps/rejected": -216.48435974121094, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": 0.01064732950180769, "rewards/margins": 0.018948109820485115, "rewards/rejected": -0.00830078125, "step": 2 }, { "epoch": 0.02, "learning_rate": 3e-07, "logits/chosen": -2.3482441902160645, "logits/rejected": -1.7339589595794678, "logps/chosen": -406.253662109375, "logps/rejected": -225.86363220214844, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.00515124062076211, "rewards/margins": 0.003809865564107895, "rewards/rejected": 0.0013413750566542149, "step": 3 }, { "epoch": 0.02, "learning_rate": 4e-07, "logits/chosen": -2.3932580947875977, "logits/rejected": -1.717940092086792, "logps/chosen": -383.88018798828125, "logps/rejected": -232.95260620117188, "loss": 0.6886, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.0040194196626544, "rewards/margins": 0.008515550754964352, "rewards/rejected": -0.012534968554973602, "step": 4 }, { "epoch": 0.03, "learning_rate": 5e-07, "logits/chosen": -2.374025344848633, "logits/rejected": -1.790544033050537, "logps/chosen": -381.5960998535156, "logps/rejected": -224.89280700683594, "loss": 0.6841, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.011044693179428577, "rewards/margins": 0.011520260013639927, "rewards/rejected": -0.0004755654954351485, "step": 5 }, { "epoch": 0.03, "learning_rate": 6e-07, "logits/chosen": -2.3211307525634766, "logits/rejected": -1.8033502101898193, "logps/chosen": -395.9403381347656, "logps/rejected": -227.61026000976562, "loss": 0.6924, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.002454122295603156, "rewards/margins": 0.01087900809943676, "rewards/rejected": -0.013333131559193134, "step": 6 }, { "epoch": 0.04, "learning_rate": 7e-07, "logits/chosen": -2.312662124633789, "logits/rejected": -1.7145819664001465, "logps/chosen": -418.522216796875, "logps/rejected": -235.27044677734375, "loss": 0.6824, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.0033233645372092724, "rewards/margins": 0.002655028598383069, "rewards/rejected": -0.005978393834084272, "step": 7 }, { "epoch": 0.04, "learning_rate": 8e-07, "logits/chosen": -2.3392677307128906, "logits/rejected": -1.7684417963027954, "logps/chosen": -415.1978454589844, "logps/rejected": -233.99502563476562, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008266442455351353, "rewards/margins": 0.006310400553047657, "rewards/rejected": -0.00548375491052866, "step": 8 }, { "epoch": 0.05, "learning_rate": 9e-07, "logits/chosen": -2.4016027450561523, "logits/rejected": -1.8090660572052002, "logps/chosen": -437.22686767578125, "logps/rejected": -242.0886688232422, "loss": 0.6842, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": 0.00249633751809597, "rewards/margins": 0.019575374200940132, "rewards/rejected": -0.017079036682844162, "step": 9 }, { "epoch": 0.05, "learning_rate": 1e-06, "logits/chosen": -2.286271572113037, "logits/rejected": -1.765137791633606, "logps/chosen": -372.4206848144531, "logps/rejected": -237.12986755371094, "loss": 0.6791, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.021054396405816078, "rewards/margins": 0.046052236109972, "rewards/rejected": -0.02499784156680107, "step": 10 }, { "epoch": 0.06, "learning_rate": 9.999928635197407e-07, "logits/chosen": -2.3678841590881348, "logits/rejected": -1.6936702728271484, "logps/chosen": -446.262939453125, "logps/rejected": -226.2487030029297, "loss": 0.6801, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.007172901649028063, "rewards/margins": 0.02706197462975979, "rewards/rejected": -0.019889069721102715, "step": 11 }, { "epoch": 0.06, "learning_rate": 9.999714542826805e-07, "logits/chosen": -2.32342267036438, "logits/rejected": -1.660724401473999, "logps/chosen": -392.6808776855469, "logps/rejected": -213.53292846679688, "loss": 0.6775, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": 0.011206183582544327, "rewards/margins": 0.034867607057094574, "rewards/rejected": -0.023661423474550247, "step": 12 }, { "epoch": 0.07, "learning_rate": 9.999357728999656e-07, "logits/chosen": -2.372107982635498, "logits/rejected": -1.7802695035934448, "logps/chosen": -413.37530517578125, "logps/rejected": -243.32130432128906, "loss": 0.6785, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.0066390992142260075, "rewards/margins": 0.029553350061178207, "rewards/rejected": -0.022914253175258636, "step": 13 }, { "epoch": 0.07, "learning_rate": 9.99885820390154e-07, "logits/chosen": -2.3564772605895996, "logits/rejected": -1.7367222309112549, "logps/chosen": -417.22198486328125, "logps/rejected": -229.80526733398438, "loss": 0.6745, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.016272228211164474, "rewards/margins": 0.04719391465187073, "rewards/rejected": -0.030921682715415955, "step": 14 }, { "epoch": 0.08, "learning_rate": 9.99821598179186e-07, "logits/chosen": -2.258823871612549, "logits/rejected": -1.7435674667358398, "logps/chosen": -354.0164489746094, "logps/rejected": -241.83929443359375, "loss": 0.6692, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.00755993602797389, "rewards/margins": 0.029054105281829834, "rewards/rejected": -0.021494168788194656, "step": 15 }, { "epoch": 0.08, "learning_rate": 9.99743108100344e-07, "logits/chosen": -2.388901472091675, "logits/rejected": -1.7188632488250732, "logps/chosen": -449.16424560546875, "logps/rejected": -221.1320343017578, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.013864136300981045, "rewards/margins": 0.03957417979836464, "rewards/rejected": -0.02571004442870617, "step": 16 }, { "epoch": 0.09, "learning_rate": 9.996503523941992e-07, "logits/chosen": -2.3594629764556885, "logits/rejected": -1.7600576877593994, "logps/chosen": -422.7988586425781, "logps/rejected": -242.28831481933594, "loss": 0.6722, "rewards/accuracies": 0.75, "rewards/chosen": -0.009711965918540955, "rewards/margins": 0.028896205127239227, "rewards/rejected": -0.03860817104578018, "step": 17 }, { "epoch": 0.09, "learning_rate": 9.99543333708549e-07, "logits/chosen": -2.3321311473846436, "logits/rejected": -1.7426540851593018, "logps/chosen": -426.822509765625, "logps/rejected": -234.19598388671875, "loss": 0.6658, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.01716003753244877, "rewards/margins": 0.05482915788888931, "rewards/rejected": -0.03766912221908569, "step": 18 }, { "epoch": 0.1, "learning_rate": 9.994220550983403e-07, "logits/chosen": -2.372408390045166, "logits/rejected": -1.7021995782852173, "logps/chosen": -464.4961242675781, "logps/rejected": -227.12640380859375, "loss": 0.6683, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.00833536870777607, "rewards/margins": 0.06423734128475189, "rewards/rejected": -0.05590197443962097, "step": 19 }, { "epoch": 0.1, "learning_rate": 9.992865200255829e-07, "logits/chosen": -2.359961748123169, "logits/rejected": -1.7921862602233887, "logps/chosen": -424.4770202636719, "logps/rejected": -250.455322265625, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 0.02021026611328125, "rewards/margins": 0.07474848628044128, "rewards/rejected": -0.05453822389245033, "step": 20 }, { "epoch": 0.11, "learning_rate": 9.991367323592497e-07, "logits/chosen": -2.375795602798462, "logits/rejected": -1.7240290641784668, "logps/chosen": -430.834228515625, "logps/rejected": -241.43362426757812, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 0.009901427663862705, "rewards/margins": 0.0611012801527977, "rewards/rejected": -0.05119985342025757, "step": 21 }, { "epoch": 0.11, "learning_rate": 9.989726963751682e-07, "logits/chosen": -2.376769542694092, "logits/rejected": -1.7410842180252075, "logps/chosen": -437.9547119140625, "logps/rejected": -247.85177612304688, "loss": 0.6565, "rewards/accuracies": 1.0, "rewards/chosen": 0.02139994315803051, "rewards/margins": 0.08655980974435806, "rewards/rejected": -0.0651598572731018, "step": 22 }, { "epoch": 0.12, "learning_rate": 9.987944167558962e-07, "logits/chosen": -2.352534294128418, "logits/rejected": -1.7078654766082764, "logps/chosen": -448.8846435546875, "logps/rejected": -245.959716796875, "loss": 0.6598, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -0.00972900353372097, "rewards/margins": 0.0559973381459713, "rewards/rejected": -0.06572634726762772, "step": 23 }, { "epoch": 0.12, "learning_rate": 9.986018985905899e-07, "logits/chosen": -2.299417018890381, "logits/rejected": -1.6663830280303955, "logps/chosen": -460.1045227050781, "logps/rejected": -221.133544921875, "loss": 0.66, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.00504913367331028, "rewards/margins": 0.06341387331485748, "rewards/rejected": -0.05836474150419235, "step": 24 }, { "epoch": 0.13, "learning_rate": 9.983951473748577e-07, "logits/chosen": -2.3385000228881836, "logits/rejected": -1.75907301902771, "logps/chosen": -402.1591491699219, "logps/rejected": -224.9604949951172, "loss": 0.6561, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.011491522192955017, "rewards/margins": 0.07259712368249893, "rewards/rejected": -0.06110560521483421, "step": 25 }, { "epoch": 0.13, "learning_rate": 9.981741690106034e-07, "logits/chosen": -2.386759042739868, "logits/rejected": -1.7684199810028076, "logps/chosen": -474.9812927246094, "logps/rejected": -249.0093994140625, "loss": 0.649, "rewards/accuracies": 1.0, "rewards/chosen": 0.025104016065597534, "rewards/margins": 0.10096079856157303, "rewards/rejected": -0.07585678994655609, "step": 26 }, { "epoch": 0.14, "learning_rate": 9.979389698058578e-07, "logits/chosen": -2.369291067123413, "logits/rejected": -1.7810336351394653, "logps/chosen": -383.74267578125, "logps/rejected": -231.25775146484375, "loss": 0.6512, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.00433400459587574, "rewards/margins": 0.0785469114780426, "rewards/rejected": -0.07421290874481201, "step": 27 }, { "epoch": 0.14, "learning_rate": 9.976895564745991e-07, "logits/chosen": -2.361295700073242, "logits/rejected": -1.7187690734863281, "logps/chosen": -427.01763916015625, "logps/rejected": -249.55853271484375, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 0.01534500252455473, "rewards/margins": 0.1016998291015625, "rewards/rejected": -0.08635483682155609, "step": 28 }, { "epoch": 0.15, "learning_rate": 9.974259361365602e-07, "logits/chosen": -2.2896504402160645, "logits/rejected": -1.7174427509307861, "logps/chosen": -408.708984375, "logps/rejected": -246.1773681640625, "loss": 0.6428, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.029611460864543915, "rewards/margins": 0.10940934717655182, "rewards/rejected": -0.07979787886142731, "step": 29 }, { "epoch": 0.15, "learning_rate": 9.971481163170269e-07, "logits/chosen": -2.3385965824127197, "logits/rejected": -1.74043869972229, "logps/chosen": -425.06695556640625, "logps/rejected": -235.05984497070312, "loss": 0.6439, "rewards/accuracies": 1.0, "rewards/chosen": 0.0180028285831213, "rewards/margins": 0.10506986081600189, "rewards/rejected": -0.08706703782081604, "step": 30 }, { "epoch": 0.16, "learning_rate": 9.968561049466213e-07, "logits/chosen": -2.3590800762176514, "logits/rejected": -1.7566580772399902, "logps/chosen": -400.0718994140625, "logps/rejected": -215.52410888671875, "loss": 0.6432, "rewards/accuracies": 1.0, "rewards/chosen": 0.007580948993563652, "rewards/margins": 0.08420728147029877, "rewards/rejected": -0.07662633806467056, "step": 31 }, { "epoch": 0.16, "learning_rate": 9.965499103610773e-07, "logits/chosen": -2.3469502925872803, "logits/rejected": -1.7395415306091309, "logps/chosen": -401.7705383300781, "logps/rejected": -227.20973205566406, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 0.02456207200884819, "rewards/margins": 0.11759631335735321, "rewards/rejected": -0.09303423762321472, "step": 32 }, { "epoch": 0.17, "learning_rate": 9.962295413010012e-07, "logits/chosen": -2.343597173690796, "logits/rejected": -1.7927014827728271, "logps/chosen": -411.3066711425781, "logps/rejected": -243.44483947753906, "loss": 0.6328, "rewards/accuracies": 1.0, "rewards/chosen": 0.02006937749683857, "rewards/margins": 0.12393748015165329, "rewards/rejected": -0.10386811196804047, "step": 33 }, { "epoch": 0.17, "learning_rate": 9.95895006911623e-07, "logits/chosen": -2.378286123275757, "logits/rejected": -1.742477297782898, "logps/chosen": -431.7130432128906, "logps/rejected": -230.1571044921875, "loss": 0.6366, "rewards/accuracies": 1.0, "rewards/chosen": 0.019404985010623932, "rewards/margins": 0.11559194326400757, "rewards/rejected": -0.09618696570396423, "step": 34 }, { "epoch": 0.18, "learning_rate": 9.955463167425348e-07, "logits/chosen": -2.3461527824401855, "logits/rejected": -1.7773241996765137, "logps/chosen": -383.312744140625, "logps/rejected": -232.4522247314453, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": -0.00024070727522484958, "rewards/margins": 0.12035013735294342, "rewards/rejected": -0.12059084326028824, "step": 35 }, { "epoch": 0.18, "learning_rate": 9.95183480747419e-07, "logits/chosen": -2.2968497276306152, "logits/rejected": -1.7697654962539673, "logps/chosen": -399.4654541015625, "logps/rejected": -242.6841278076172, "loss": 0.632, "rewards/accuracies": 1.0, "rewards/chosen": 0.01641947403550148, "rewards/margins": 0.14037209749221802, "rewards/rejected": -0.12395261973142624, "step": 36 }, { "epoch": 0.19, "learning_rate": 9.94806509283763e-07, "logits/chosen": -2.310312271118164, "logits/rejected": -1.694748044013977, "logps/chosen": -440.7718811035156, "logps/rejected": -232.01487731933594, "loss": 0.6297, "rewards/accuracies": 1.0, "rewards/chosen": 0.017212169244885445, "rewards/margins": 0.13497377932071686, "rewards/rejected": -0.11776161938905716, "step": 37 }, { "epoch": 0.19, "learning_rate": 9.944154131125642e-07, "logits/chosen": -2.3658957481384277, "logits/rejected": -1.6975268125534058, "logps/chosen": -444.21173095703125, "logps/rejected": -240.94131469726562, "loss": 0.6287, "rewards/accuracies": 1.0, "rewards/chosen": 0.005380503833293915, "rewards/margins": 0.13953450322151184, "rewards/rejected": -0.13415402173995972, "step": 38 }, { "epoch": 0.2, "learning_rate": 9.940102033980235e-07, "logits/chosen": -2.3857851028442383, "logits/rejected": -1.7804780006408691, "logps/chosen": -389.0456848144531, "logps/rejected": -243.52114868164062, "loss": 0.6235, "rewards/accuracies": 1.0, "rewards/chosen": 0.00982513464987278, "rewards/margins": 0.15222449600696564, "rewards/rejected": -0.1423993557691574, "step": 39 }, { "epoch": 0.2, "learning_rate": 9.93590891707225e-07, "logits/chosen": -2.3742823600769043, "logits/rejected": -1.777465581893921, "logps/chosen": -415.0690002441406, "logps/rejected": -237.30067443847656, "loss": 0.6159, "rewards/accuracies": 1.0, "rewards/chosen": 0.03390197828412056, "rewards/margins": 0.1639704406261444, "rewards/rejected": -0.13006846606731415, "step": 40 }, { "epoch": 0.21, "learning_rate": 9.931574900098074e-07, "logits/chosen": -2.3900654315948486, "logits/rejected": -1.7384684085845947, "logps/chosen": -436.23553466796875, "logps/rejected": -240.9998779296875, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 0.01531041506677866, "rewards/margins": 0.17367619276046753, "rewards/rejected": -0.15836575627326965, "step": 41 }, { "epoch": 0.21, "learning_rate": 9.927100106776212e-07, "logits/chosen": -2.3776233196258545, "logits/rejected": -1.7312960624694824, "logps/chosen": -409.340087890625, "logps/rejected": -224.3486328125, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 0.01719258725643158, "rewards/margins": 0.15459582209587097, "rewards/rejected": -0.13740324974060059, "step": 42 }, { "epoch": 0.22, "learning_rate": 9.922484664843763e-07, "logits/chosen": -2.4081950187683105, "logits/rejected": -1.7982332706451416, "logps/chosen": -393.2861328125, "logps/rejected": -228.45692443847656, "loss": 0.6115, "rewards/accuracies": 1.0, "rewards/chosen": 0.015950776636600494, "rewards/margins": 0.15181490778923035, "rewards/rejected": -0.13586413860321045, "step": 43 }, { "epoch": 0.22, "learning_rate": 9.917728706052764e-07, "logits/chosen": -2.3471217155456543, "logits/rejected": -1.6950238943099976, "logps/chosen": -442.8841552734375, "logps/rejected": -229.4197998046875, "loss": 0.6135, "rewards/accuracies": 1.0, "rewards/chosen": 0.00894673727452755, "rewards/margins": 0.15806923806667328, "rewards/rejected": -0.14912250638008118, "step": 44 }, { "epoch": 0.23, "learning_rate": 9.912832366166441e-07, "logits/chosen": -2.3768410682678223, "logits/rejected": -1.759801983833313, "logps/chosen": -425.1090087890625, "logps/rejected": -253.46405029296875, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 0.024756114929914474, "rewards/margins": 0.19806596636772156, "rewards/rejected": -0.1733098328113556, "step": 45 }, { "epoch": 0.23, "learning_rate": 9.907795784955326e-07, "logits/chosen": -2.3527581691741943, "logits/rejected": -1.7585816383361816, "logps/chosen": -433.7560119628906, "logps/rejected": -236.2068328857422, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": 0.03209178149700165, "rewards/margins": 0.20157599449157715, "rewards/rejected": -0.1694842129945755, "step": 46 }, { "epoch": 0.24, "learning_rate": 9.90261910619327e-07, "logits/chosen": -2.316863775253296, "logits/rejected": -1.6900603771209717, "logps/chosen": -410.6593933105469, "logps/rejected": -211.7102813720703, "loss": 0.6087, "rewards/accuracies": 1.0, "rewards/chosen": 0.015608088113367558, "rewards/margins": 0.17497165501117706, "rewards/rejected": -0.15936356782913208, "step": 47 }, { "epoch": 0.24, "learning_rate": 9.897302477653334e-07, "logits/chosen": -2.344198226928711, "logits/rejected": -1.7520121335983276, "logps/chosen": -430.35162353515625, "logps/rejected": -216.02198791503906, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": 0.0145454416051507, "rewards/margins": 0.18488237261772156, "rewards/rejected": -0.17033693194389343, "step": 48 }, { "epoch": 0.25, "learning_rate": 9.891846051103575e-07, "logits/chosen": -2.3107919692993164, "logits/rejected": -1.6821303367614746, "logps/chosen": -409.79693603515625, "logps/rejected": -251.96954345703125, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.03449045866727829, "rewards/margins": 0.25322458148002625, "rewards/rejected": -0.21873411536216736, "step": 49 }, { "epoch": 0.25, "learning_rate": 9.886249982302718e-07, "logits/chosen": -2.344055652618408, "logits/rejected": -1.6889526844024658, "logps/chosen": -472.56524658203125, "logps/rejected": -243.5905303955078, "loss": 0.5881, "rewards/accuracies": 1.0, "rewards/chosen": 0.05994110554456711, "rewards/margins": 0.25679296255111694, "rewards/rejected": -0.19685186445713043, "step": 50 }, { "epoch": 0.26, "learning_rate": 9.8805144309957e-07, "logits/chosen": -2.334566593170166, "logits/rejected": -1.711836576461792, "logps/chosen": -434.8371887207031, "logps/rejected": -228.9910888671875, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.03057200461626053, "rewards/margins": 0.23844006657600403, "rewards/rejected": -0.2078680843114853, "step": 51 }, { "epoch": 0.26, "learning_rate": 9.874639560909118e-07, "logits/chosen": -2.322694778442383, "logits/rejected": -1.7913904190063477, "logps/chosen": -407.7518005371094, "logps/rejected": -254.45538330078125, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 0.027757517993450165, "rewards/margins": 0.2553138732910156, "rewards/rejected": -0.22755637764930725, "step": 52 }, { "epoch": 0.27, "learning_rate": 9.868625539746544e-07, "logits/chosen": -2.351858139038086, "logits/rejected": -1.7765597105026245, "logps/chosen": -393.1485595703125, "logps/rejected": -246.63424682617188, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": 0.02648811601102352, "rewards/margins": 0.24637845158576965, "rewards/rejected": -0.21989034116268158, "step": 53 }, { "epoch": 0.27, "learning_rate": 9.862472539183755e-07, "logits/chosen": -2.2986531257629395, "logits/rejected": -1.6773505210876465, "logps/chosen": -404.07537841796875, "logps/rejected": -215.46900939941406, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.027726110070943832, "rewards/margins": 0.21862855553627014, "rewards/rejected": -0.1909024715423584, "step": 54 }, { "epoch": 0.28, "learning_rate": 9.85618073486382e-07, "logits/chosen": -2.348318099975586, "logits/rejected": -1.7531386613845825, "logps/chosen": -425.8829345703125, "logps/rejected": -233.40704345703125, "loss": 0.577, "rewards/accuracies": 1.0, "rewards/chosen": 0.015077210031449795, "rewards/margins": 0.2264123260974884, "rewards/rejected": -0.21133512258529663, "step": 55 }, { "epoch": 0.28, "learning_rate": 9.849750306392085e-07, "logits/chosen": -2.3148324489593506, "logits/rejected": -1.6857810020446777, "logps/chosen": -441.5430603027344, "logps/rejected": -220.44387817382812, "loss": 0.5682, "rewards/accuracies": 1.0, "rewards/chosen": 0.03912658989429474, "rewards/margins": 0.25977325439453125, "rewards/rejected": -0.2206466794013977, "step": 56 }, { "epoch": 0.29, "learning_rate": 9.843181437331054e-07, "logits/chosen": -2.3673195838928223, "logits/rejected": -1.73558509349823, "logps/chosen": -449.279541015625, "logps/rejected": -250.8281707763672, "loss": 0.5717, "rewards/accuracies": 1.0, "rewards/chosen": 0.05381368100643158, "rewards/margins": 0.3091966509819031, "rewards/rejected": -0.2553829252719879, "step": 57 }, { "epoch": 0.29, "learning_rate": 9.836474315195147e-07, "logits/chosen": -2.3354876041412354, "logits/rejected": -1.7699546813964844, "logps/chosen": -370.8974609375, "logps/rejected": -239.3045196533203, "loss": 0.571, "rewards/accuracies": 1.0, "rewards/chosen": 0.006744511425495148, "rewards/margins": 0.2502562403678894, "rewards/rejected": -0.24351172149181366, "step": 58 }, { "epoch": 0.3, "learning_rate": 9.82962913144534e-07, "logits/chosen": -2.304596424102783, "logits/rejected": -1.750684380531311, "logps/chosen": -413.2340087890625, "logps/rejected": -236.6236114501953, "loss": 0.5688, "rewards/accuracies": 1.0, "rewards/chosen": 0.023909762501716614, "rewards/margins": 0.25233080983161926, "rewards/rejected": -0.22842103242874146, "step": 59 }, { "epoch": 0.3, "learning_rate": 9.822646081483712e-07, "logits/chosen": -2.3595619201660156, "logits/rejected": -1.7285479307174683, "logps/chosen": -429.37774658203125, "logps/rejected": -225.66567993164062, "loss": 0.5602, "rewards/accuracies": 1.0, "rewards/chosen": 0.03526865690946579, "rewards/margins": 0.2945444881916046, "rewards/rejected": -0.2592758238315582, "step": 60 }, { "epoch": 0.31, "learning_rate": 9.815525364647852e-07, "logits/chosen": -2.3304920196533203, "logits/rejected": -1.716930866241455, "logps/chosen": -415.82763671875, "logps/rejected": -239.05308532714844, "loss": 0.5599, "rewards/accuracies": 1.0, "rewards/chosen": 0.03172862157225609, "rewards/margins": 0.30161911249160767, "rewards/rejected": -0.2698904871940613, "step": 61 }, { "epoch": 0.31, "learning_rate": 9.808267184205181e-07, "logits/chosen": -2.297617197036743, "logits/rejected": -1.761113166809082, "logps/chosen": -402.2159423828125, "logps/rejected": -230.9447021484375, "loss": 0.557, "rewards/accuracies": 1.0, "rewards/chosen": 0.030259961262345314, "rewards/margins": 0.29170405864715576, "rewards/rejected": -0.261444091796875, "step": 62 }, { "epoch": 0.32, "learning_rate": 9.800871747347147e-07, "logits/chosen": -2.3050127029418945, "logits/rejected": -1.685072660446167, "logps/chosen": -395.39764404296875, "logps/rejected": -217.38046264648438, "loss": 0.5503, "rewards/accuracies": 1.0, "rewards/chosen": 0.035826876759529114, "rewards/margins": 0.2983129024505615, "rewards/rejected": -0.2624860405921936, "step": 63 }, { "epoch": 0.32, "learning_rate": 9.793339265183303e-07, "logits/chosen": -2.3273816108703613, "logits/rejected": -1.7320973873138428, "logps/chosen": -434.2486572265625, "logps/rejected": -231.22503662109375, "loss": 0.5484, "rewards/accuracies": 1.0, "rewards/chosen": 0.028608448803424835, "rewards/margins": 0.3181728422641754, "rewards/rejected": -0.2895644009113312, "step": 64 }, { "epoch": 0.33, "learning_rate": 9.785669952735292e-07, "logits/chosen": -2.3719441890716553, "logits/rejected": -1.7355579137802124, "logps/chosen": -399.7314147949219, "logps/rejected": -240.45098876953125, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 0.039104461669921875, "rewards/margins": 0.34003591537475586, "rewards/rejected": -0.300931453704834, "step": 65 }, { "epoch": 0.33, "learning_rate": 9.777864028930705e-07, "logits/chosen": -2.34771728515625, "logits/rejected": -1.7566125392913818, "logps/chosen": -413.45416259765625, "logps/rejected": -238.31631469726562, "loss": 0.5458, "rewards/accuracies": 1.0, "rewards/chosen": 0.030201975256204605, "rewards/margins": 0.3159496486186981, "rewards/rejected": -0.2857476770877838, "step": 66 }, { "epoch": 0.34, "learning_rate": 9.769921716596818e-07, "logits/chosen": -2.3373610973358154, "logits/rejected": -1.7271639108657837, "logps/chosen": -402.1241149902344, "logps/rejected": -222.05313110351562, "loss": 0.5324, "rewards/accuracies": 1.0, "rewards/chosen": 0.043776195496320724, "rewards/margins": 0.3254074454307556, "rewards/rejected": -0.2816312313079834, "step": 67 }, { "epoch": 0.34, "learning_rate": 9.76184324245426e-07, "logits/chosen": -2.3652660846710205, "logits/rejected": -1.6916803121566772, "logps/chosen": -399.132568359375, "logps/rejected": -221.5078887939453, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.01293487660586834, "rewards/margins": 0.2995944023132324, "rewards/rejected": -0.28665950894355774, "step": 68 }, { "epoch": 0.35, "learning_rate": 9.753628837110513e-07, "logits/chosen": -2.36065936088562, "logits/rejected": -1.7677361965179443, "logps/chosen": -423.0574951171875, "logps/rejected": -249.51437377929688, "loss": 0.5299, "rewards/accuracies": 1.0, "rewards/chosen": 0.031139884144067764, "rewards/margins": 0.36295193433761597, "rewards/rejected": -0.3318120241165161, "step": 69 }, { "epoch": 0.35, "learning_rate": 9.745278735053343e-07, "logits/chosen": -2.3809893131256104, "logits/rejected": -1.7460397481918335, "logps/chosen": -430.45562744140625, "logps/rejected": -230.37530517578125, "loss": 0.5206, "rewards/accuracies": 1.0, "rewards/chosen": 0.03769226372241974, "rewards/margins": 0.3662698268890381, "rewards/rejected": -0.32857757806777954, "step": 70 }, { "epoch": 0.36, "learning_rate": 9.736793174644105e-07, "logits/chosen": -2.3731751441955566, "logits/rejected": -1.754599928855896, "logps/chosen": -440.43768310546875, "logps/rejected": -255.9036407470703, "loss": 0.5181, "rewards/accuracies": 1.0, "rewards/chosen": 0.06334737688302994, "rewards/margins": 0.42244473099708557, "rewards/rejected": -0.35909730195999146, "step": 71 }, { "epoch": 0.36, "learning_rate": 9.728172398110933e-07, "logits/chosen": -2.3817567825317383, "logits/rejected": -1.756287693977356, "logps/chosen": -443.7930908203125, "logps/rejected": -243.22853088378906, "loss": 0.5131, "rewards/accuracies": 1.0, "rewards/chosen": 0.048569999635219574, "rewards/margins": 0.4198525846004486, "rewards/rejected": -0.37128257751464844, "step": 72 }, { "epoch": 0.37, "learning_rate": 9.719416651541837e-07, "logits/chosen": -2.346971273422241, "logits/rejected": -1.7343378067016602, "logps/chosen": -414.0525207519531, "logps/rejected": -243.70193481445312, "loss": 0.5077, "rewards/accuracies": 1.0, "rewards/chosen": 0.05425339192152023, "rewards/margins": 0.4347574710845947, "rewards/rejected": -0.38050413131713867, "step": 73 }, { "epoch": 0.37, "learning_rate": 9.710526184877666e-07, "logits/chosen": -2.3919267654418945, "logits/rejected": -1.7560001611709595, "logps/chosen": -413.6714172363281, "logps/rejected": -238.31436157226562, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 0.03681691735982895, "rewards/margins": 0.4134042263031006, "rewards/rejected": -0.3765873312950134, "step": 74 }, { "epoch": 0.38, "learning_rate": 9.70150125190498e-07, "logits/chosen": -2.358880043029785, "logits/rejected": -1.745048999786377, "logps/chosen": -396.5554504394531, "logps/rejected": -236.57676696777344, "loss": 0.5077, "rewards/accuracies": 1.0, "rewards/chosen": 0.04479828104376793, "rewards/margins": 0.43102166056632996, "rewards/rejected": -0.38622337579727173, "step": 75 }, { "epoch": 0.38, "learning_rate": 9.692342110248802e-07, "logits/chosen": -2.3461804389953613, "logits/rejected": -1.7231651544570923, "logps/chosen": -445.49072265625, "logps/rejected": -233.7114715576172, "loss": 0.5003, "rewards/accuracies": 1.0, "rewards/chosen": 0.044211581349372864, "rewards/margins": 0.4490654170513153, "rewards/rejected": -0.40485385060310364, "step": 76 }, { "epoch": 0.39, "learning_rate": 9.683049021365266e-07, "logits/chosen": -2.3552205562591553, "logits/rejected": -1.7052409648895264, "logps/chosen": -413.4826354980469, "logps/rejected": -227.46371459960938, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 0.031481046229600906, "rewards/margins": 0.394098699092865, "rewards/rejected": -0.3626176416873932, "step": 77 }, { "epoch": 0.39, "learning_rate": 9.673622250534155e-07, "logits/chosen": -2.4223287105560303, "logits/rejected": -1.7525030374526978, "logps/chosen": -462.57537841796875, "logps/rejected": -229.95297241210938, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 0.04138845205307007, "rewards/margins": 0.4713382124900818, "rewards/rejected": -0.42994970083236694, "step": 78 }, { "epoch": 0.4, "learning_rate": 9.664062066851324e-07, "logits/chosen": -2.375359296798706, "logits/rejected": -1.7496776580810547, "logps/chosen": -433.4971923828125, "logps/rejected": -241.90176391601562, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": 0.07500915974378586, "rewards/margins": 0.5250797271728516, "rewards/rejected": -0.4500705897808075, "step": 79 }, { "epoch": 0.4, "learning_rate": 9.65436874322102e-07, "logits/chosen": -2.404475688934326, "logits/rejected": -1.7835043668746948, "logps/chosen": -414.5566101074219, "logps/rejected": -214.783203125, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.07704111933708191, "rewards/margins": 0.4780234098434448, "rewards/rejected": -0.4009822905063629, "step": 80 }, { "epoch": 0.41, "learning_rate": 9.644542556348097e-07, "logits/chosen": -2.2628133296966553, "logits/rejected": -1.7775275707244873, "logps/chosen": -376.2358093261719, "logps/rejected": -253.47933959960938, "loss": 0.4849, "rewards/accuracies": 1.0, "rewards/chosen": 0.04127184674143791, "rewards/margins": 0.4878663718700409, "rewards/rejected": -0.4465945065021515, "step": 81 }, { "epoch": 0.41, "learning_rate": 9.634583786730108e-07, "logits/chosen": -2.3669261932373047, "logits/rejected": -1.7970472574234009, "logps/chosen": -404.4597473144531, "logps/rejected": -254.3710479736328, "loss": 0.4749, "rewards/accuracies": 1.0, "rewards/chosen": 0.050060778856277466, "rewards/margins": 0.5404148101806641, "rewards/rejected": -0.490354061126709, "step": 82 }, { "epoch": 0.42, "learning_rate": 9.624492718649303e-07, "logits/chosen": -2.2318215370178223, "logits/rejected": -1.7477295398712158, "logps/chosen": -436.651123046875, "logps/rejected": -257.62579345703125, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 0.02764180488884449, "rewards/margins": 0.5197595357894897, "rewards/rejected": -0.4921177327632904, "step": 83 }, { "epoch": 0.42, "learning_rate": 9.61426964016452e-07, "logits/chosen": -2.382734775543213, "logits/rejected": -1.7316808700561523, "logps/chosen": -414.62017822265625, "logps/rejected": -222.22511291503906, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 0.029449207708239555, "rewards/margins": 0.5030165910720825, "rewards/rejected": -0.4735673666000366, "step": 84 }, { "epoch": 0.43, "learning_rate": 9.60391484310294e-07, "logits/chosen": -2.266514301300049, "logits/rejected": -1.6887891292572021, "logps/chosen": -423.61627197265625, "logps/rejected": -218.39918518066406, "loss": 0.4719, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0345609076321125, "rewards/margins": 0.4929486811161041, "rewards/rejected": -0.4583877921104431, "step": 85 }, { "epoch": 0.43, "learning_rate": 9.593428623051791e-07, "logits/chosen": -2.3854944705963135, "logits/rejected": -1.7098981142044067, "logps/chosen": -440.6033020019531, "logps/rejected": -229.67633056640625, "loss": 0.4575, "rewards/accuracies": 1.0, "rewards/chosen": 0.06371816247701645, "rewards/margins": 0.5454168915748596, "rewards/rejected": -0.48169875144958496, "step": 86 }, { "epoch": 0.44, "learning_rate": 9.58281127934988e-07, "logits/chosen": -2.3662829399108887, "logits/rejected": -1.8201098442077637, "logps/chosen": -431.1653137207031, "logps/rejected": -238.65867614746094, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 0.05593719333410263, "rewards/margins": 0.5440417528152466, "rewards/rejected": -0.48810455203056335, "step": 87 }, { "epoch": 0.44, "learning_rate": 9.572063115079062e-07, "logits/chosen": -2.3622806072235107, "logits/rejected": -1.7328242063522339, "logps/chosen": -448.46966552734375, "logps/rejected": -241.63418579101562, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 0.10153733193874359, "rewards/margins": 0.6085439324378967, "rewards/rejected": -0.5070066452026367, "step": 88 }, { "epoch": 0.45, "learning_rate": 9.561184437055585e-07, "logits/chosen": -2.3403146266937256, "logits/rejected": -1.8651738166809082, "logps/chosen": -405.59527587890625, "logps/rejected": -250.83309936523438, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": 0.09715297073125839, "rewards/margins": 0.6546545624732971, "rewards/rejected": -0.5575016736984253, "step": 89 }, { "epoch": 0.45, "learning_rate": 9.550175555821334e-07, "logits/chosen": -2.2955446243286133, "logits/rejected": -1.7062526941299438, "logps/chosen": -378.6717529296875, "logps/rejected": -200.58377075195312, "loss": 0.4649, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.060583628714084625, "rewards/margins": 0.4816117584705353, "rewards/rejected": -0.42102816700935364, "step": 90 }, { "epoch": 0.46, "learning_rate": 9.53903678563496e-07, "logits/chosen": -2.351724147796631, "logits/rejected": -1.7544307708740234, "logps/chosen": -420.3045349121094, "logps/rejected": -235.31301879882812, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 0.051120124757289886, "rewards/margins": 0.6100142002105713, "rewards/rejected": -0.558894157409668, "step": 91 }, { "epoch": 0.46, "learning_rate": 9.527768444462921e-07, "logits/chosen": -2.3463001251220703, "logits/rejected": -1.7564659118652344, "logps/chosen": -401.5494384765625, "logps/rejected": -236.9974365234375, "loss": 0.4357, "rewards/accuracies": 1.0, "rewards/chosen": 0.06855825334787369, "rewards/margins": 0.6093655824661255, "rewards/rejected": -0.5408073663711548, "step": 92 }, { "epoch": 0.47, "learning_rate": 9.516370853970394e-07, "logits/chosen": -2.328899621963501, "logits/rejected": -1.7436096668243408, "logps/chosen": -407.427978515625, "logps/rejected": -229.18496704101562, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": 0.03632228076457977, "rewards/margins": 0.5754761099815369, "rewards/rejected": -0.5391538143157959, "step": 93 }, { "epoch": 0.47, "learning_rate": 9.504844339512094e-07, "logits/chosen": -2.3602890968322754, "logits/rejected": -1.835445761680603, "logps/chosen": -427.4205322265625, "logps/rejected": -259.57666015625, "loss": 0.4211, "rewards/accuracies": 1.0, "rewards/chosen": 0.050824232399463654, "rewards/margins": 0.6678048968315125, "rewards/rejected": -0.6169806718826294, "step": 94 }, { "epoch": 0.48, "learning_rate": 9.493189230122998e-07, "logits/chosen": -2.410205602645874, "logits/rejected": -1.7674560546875, "logps/chosen": -431.581298828125, "logps/rejected": -227.7965545654297, "loss": 0.4311, "rewards/accuracies": 1.0, "rewards/chosen": 0.0653231292963028, "rewards/margins": 0.6254738569259644, "rewards/rejected": -0.5601507425308228, "step": 95 }, { "epoch": 0.48, "learning_rate": 9.481405858508933e-07, "logits/chosen": -2.374988317489624, "logits/rejected": -1.7726953029632568, "logps/chosen": -397.0668029785156, "logps/rejected": -231.8905487060547, "loss": 0.416, "rewards/accuracies": 1.0, "rewards/chosen": 0.08870163559913635, "rewards/margins": 0.6524338126182556, "rewards/rejected": -0.5637321472167969, "step": 96 }, { "epoch": 0.49, "learning_rate": 9.469494561037097e-07, "logits/chosen": -2.276761770248413, "logits/rejected": -1.7354525327682495, "logps/chosen": -370.6647033691406, "logps/rejected": -221.45701599121094, "loss": 0.4119, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.06141357496380806, "rewards/margins": 0.6324462890625, "rewards/rejected": -0.5710327625274658, "step": 97 }, { "epoch": 0.49, "learning_rate": 9.457455677726447e-07, "logits/chosen": -2.3553466796875, "logits/rejected": -1.73296320438385, "logps/chosen": -424.4140625, "logps/rejected": -248.7335662841797, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 0.06948699802160263, "rewards/margins": 0.7330302000045776, "rewards/rejected": -0.6635432243347168, "step": 98 }, { "epoch": 0.5, "learning_rate": 9.445289552237996e-07, "logits/chosen": -2.296659469604492, "logits/rejected": -1.7516462802886963, "logps/chosen": -434.2308044433594, "logps/rejected": -251.72531127929688, "loss": 0.396, "rewards/accuracies": 1.0, "rewards/chosen": 0.07637355476617813, "rewards/margins": 0.7368894815444946, "rewards/rejected": -0.6605159044265747, "step": 99 }, { "epoch": 0.5, "learning_rate": 9.432996531865001e-07, "logits/chosen": -2.302313804626465, "logits/rejected": -1.6839463710784912, "logps/chosen": -424.2760009765625, "logps/rejected": -221.21902465820312, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 0.0443037673830986, "rewards/margins": 0.6646233797073364, "rewards/rejected": -0.620319664478302, "step": 100 }, { "epoch": 0.51, "learning_rate": 9.420576967523048e-07, "logits/chosen": -2.399622678756714, "logits/rejected": -1.7549206018447876, "logps/chosen": -436.25360107421875, "logps/rejected": -241.64930725097656, "loss": 0.3934, "rewards/accuracies": 1.0, "rewards/chosen": 0.09657847881317139, "rewards/margins": 0.757413387298584, "rewards/rejected": -0.6608349084854126, "step": 101 }, { "epoch": 0.51, "learning_rate": 9.408031213740044e-07, "logits/chosen": -2.2774851322174072, "logits/rejected": -1.686330795288086, "logps/chosen": -382.4078369140625, "logps/rejected": -208.27102661132812, "loss": 0.4028, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.06867777556180954, "rewards/margins": 0.6393014192581177, "rewards/rejected": -0.5706236958503723, "step": 102 }, { "epoch": 0.52, "learning_rate": 9.395359628646085e-07, "logits/chosen": -2.309946060180664, "logits/rejected": -1.6876697540283203, "logps/chosen": -442.4737548828125, "logps/rejected": -218.44361877441406, "loss": 0.4119, "rewards/accuracies": 1.0, "rewards/chosen": 0.07979177683591843, "rewards/margins": 0.6863774061203003, "rewards/rejected": -0.6065855622291565, "step": 103 }, { "epoch": 0.52, "learning_rate": 9.382562573963238e-07, "logits/chosen": -2.378871440887451, "logits/rejected": -1.7466343641281128, "logps/chosen": -433.78594970703125, "logps/rejected": -226.9006805419922, "loss": 0.3951, "rewards/accuracies": 1.0, "rewards/chosen": 0.07470144331455231, "rewards/margins": 0.7442824840545654, "rewards/rejected": -0.6695809960365295, "step": 104 }, { "epoch": 0.53, "learning_rate": 9.369640414995215e-07, "logits/chosen": -2.343970775604248, "logits/rejected": -1.7456142902374268, "logps/chosen": -386.2289123535156, "logps/rejected": -245.0520477294922, "loss": 0.3949, "rewards/accuracies": 1.0, "rewards/chosen": 0.08588536083698273, "rewards/margins": 0.7823786735534668, "rewards/rejected": -0.6964933276176453, "step": 105 }, { "epoch": 0.53, "learning_rate": 9.356593520616946e-07, "logits/chosen": -2.25773286819458, "logits/rejected": -1.7212233543395996, "logps/chosen": -398.23486328125, "logps/rejected": -216.6339569091797, "loss": 0.3969, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.07008464634418488, "rewards/margins": 0.7252227067947388, "rewards/rejected": -0.6551380753517151, "step": 106 }, { "epoch": 0.54, "learning_rate": 9.34342226326405e-07, "logits/chosen": -2.3374149799346924, "logits/rejected": -1.7678660154342651, "logps/chosen": -420.0682373046875, "logps/rejected": -260.9276123046875, "loss": 0.3589, "rewards/accuracies": 1.0, "rewards/chosen": 0.06332232803106308, "rewards/margins": 0.8194485902786255, "rewards/rejected": -0.756126344203949, "step": 107 }, { "epoch": 0.54, "learning_rate": 9.330127018922193e-07, "logits/chosen": -2.326606035232544, "logits/rejected": -1.6986491680145264, "logps/chosen": -437.9176025390625, "logps/rejected": -235.58706665039062, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 0.12426784634590149, "rewards/margins": 0.8690575361251831, "rewards/rejected": -0.7447896003723145, "step": 108 }, { "epoch": 0.55, "learning_rate": 9.316708167116376e-07, "logits/chosen": -2.350536823272705, "logits/rejected": -1.7768884897232056, "logps/chosen": -405.07684326171875, "logps/rejected": -223.91233825683594, "loss": 0.3618, "rewards/accuracies": 1.0, "rewards/chosen": 0.06380031257867813, "rewards/margins": 0.8270809650421143, "rewards/rejected": -0.7632806301116943, "step": 109 }, { "epoch": 0.55, "learning_rate": 9.303166090900081e-07, "logits/chosen": -2.3844566345214844, "logits/rejected": -1.753096580505371, "logps/chosen": -425.1029052734375, "logps/rejected": -226.478515625, "loss": 0.3758, "rewards/accuracies": 1.0, "rewards/chosen": 0.08837815374135971, "rewards/margins": 0.7483861446380615, "rewards/rejected": -0.6600080132484436, "step": 110 }, { "epoch": 0.56, "learning_rate": 9.289501176844346e-07, "logits/chosen": -2.3582537174224854, "logits/rejected": -1.7496472597122192, "logps/chosen": -414.363525390625, "logps/rejected": -231.7440185546875, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 0.11145731806755066, "rewards/margins": 0.8387743234634399, "rewards/rejected": -0.7273169755935669, "step": 111 }, { "epoch": 0.56, "learning_rate": 9.275713815026732e-07, "logits/chosen": -2.313246011734009, "logits/rejected": -1.7322365045547485, "logps/chosen": -367.5860595703125, "logps/rejected": -237.1516571044922, "loss": 0.3504, "rewards/accuracies": 1.0, "rewards/chosen": 0.0921834260225296, "rewards/margins": 0.8668049573898315, "rewards/rejected": -0.7746214866638184, "step": 112 }, { "epoch": 0.57, "learning_rate": 9.261804399020175e-07, "logits/chosen": -2.305097818374634, "logits/rejected": -1.8011976480484009, "logps/chosen": -437.6880798339844, "logps/rejected": -274.9678955078125, "loss": 0.3504, "rewards/accuracies": 1.0, "rewards/chosen": 0.0472615547478199, "rewards/margins": 0.9594218730926514, "rewards/rejected": -0.912160336971283, "step": 113 }, { "epoch": 0.57, "learning_rate": 9.247773325881769e-07, "logits/chosen": -2.3485984802246094, "logits/rejected": -1.7659518718719482, "logps/chosen": -419.9745178222656, "logps/rejected": -250.2913818359375, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 0.04809850454330444, "rewards/margins": 0.9206442832946777, "rewards/rejected": -0.8725457787513733, "step": 114 }, { "epoch": 0.58, "learning_rate": 9.233620996141421e-07, "logits/chosen": -2.3236169815063477, "logits/rejected": -1.7452483177185059, "logps/chosen": -388.727294921875, "logps/rejected": -238.9022674560547, "loss": 0.3703, "rewards/accuracies": 1.0, "rewards/chosen": -0.01670125499367714, "rewards/margins": 0.6932958364486694, "rewards/rejected": -0.7099971175193787, "step": 115 }, { "epoch": 0.58, "learning_rate": 9.219347813790416e-07, "logits/chosen": -2.2801899909973145, "logits/rejected": -1.7169462442398071, "logps/chosen": -412.2328796386719, "logps/rejected": -222.73336791992188, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 0.09207204729318619, "rewards/margins": 0.914324939250946, "rewards/rejected": -0.8222529292106628, "step": 116 }, { "epoch": 0.59, "learning_rate": 9.204954186269892e-07, "logits/chosen": -2.4072670936584473, "logits/rejected": -1.7558844089508057, "logps/chosen": -392.07080078125, "logps/rejected": -223.88111877441406, "loss": 0.3188, "rewards/accuracies": 1.0, "rewards/chosen": 0.11216507107019424, "rewards/margins": 0.9786129593849182, "rewards/rejected": -0.8664478659629822, "step": 117 }, { "epoch": 0.59, "learning_rate": 9.190440524459202e-07, "logits/chosen": -2.313469886779785, "logits/rejected": -1.7978636026382446, "logps/chosen": -413.97613525390625, "logps/rejected": -259.41949462890625, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 0.09133607149124146, "rewards/margins": 0.9540255665779114, "rewards/rejected": -0.8626894950866699, "step": 118 }, { "epoch": 0.6, "learning_rate": 9.175807242664193e-07, "logits/chosen": -2.29794979095459, "logits/rejected": -1.7357946634292603, "logps/chosen": -409.6379089355469, "logps/rejected": -251.05032348632812, "loss": 0.3392, "rewards/accuracies": 1.0, "rewards/chosen": 0.04507523030042648, "rewards/margins": 0.9120150804519653, "rewards/rejected": -0.866939902305603, "step": 119 }, { "epoch": 0.6, "learning_rate": 9.161054758605368e-07, "logits/chosen": -2.406315803527832, "logits/rejected": -1.7403538227081299, "logps/chosen": -421.10064697265625, "logps/rejected": -231.68992614746094, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 0.05808563530445099, "rewards/margins": 0.930579662322998, "rewards/rejected": -0.872494101524353, "step": 120 }, { "epoch": 0.61, "learning_rate": 9.146183493405974e-07, "logits/chosen": -2.3562979698181152, "logits/rejected": -1.6579082012176514, "logps/chosen": -459.4985046386719, "logps/rejected": -224.77520751953125, "loss": 0.3288, "rewards/accuracies": 1.0, "rewards/chosen": 0.0789286345243454, "rewards/margins": 0.99262535572052, "rewards/rejected": -0.9136967062950134, "step": 121 }, { "epoch": 0.61, "learning_rate": 9.131193871579974e-07, "logits/chosen": -2.3773159980773926, "logits/rejected": -1.7242114543914795, "logps/chosen": -424.0648498535156, "logps/rejected": -220.70220947265625, "loss": 0.2966, "rewards/accuracies": 1.0, "rewards/chosen": 0.0946706235408783, "rewards/margins": 1.0034306049346924, "rewards/rejected": -0.9087600708007812, "step": 122 }, { "epoch": 0.62, "learning_rate": 9.116086321019927e-07, "logits/chosen": -2.3325231075286865, "logits/rejected": -1.7702919244766235, "logps/chosen": -430.6216125488281, "logps/rejected": -251.85089111328125, "loss": 0.302, "rewards/accuracies": 1.0, "rewards/chosen": 0.1074625700712204, "rewards/margins": 1.093393325805664, "rewards/rejected": -0.9859306812286377, "step": 123 }, { "epoch": 0.62, "learning_rate": 9.100861272984778e-07, "logits/chosen": -2.3275578022003174, "logits/rejected": -1.7917448282241821, "logps/chosen": -405.86328125, "logps/rejected": -247.26361083984375, "loss": 0.2937, "rewards/accuracies": 1.0, "rewards/chosen": 0.09578196704387665, "rewards/margins": 1.0862236022949219, "rewards/rejected": -0.9904416799545288, "step": 124 }, { "epoch": 0.63, "learning_rate": 9.085519162087549e-07, "logits/chosen": -2.315498113632202, "logits/rejected": -1.7212566137313843, "logps/chosen": -427.92681884765625, "logps/rejected": -263.6812744140625, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 0.12770704925060272, "rewards/margins": 1.1388293504714966, "rewards/rejected": -1.0111223459243774, "step": 125 }, { "epoch": 0.63, "learning_rate": 9.070060426282924e-07, "logits/chosen": -2.3237569332122803, "logits/rejected": -1.7812209129333496, "logps/chosen": -453.78863525390625, "logps/rejected": -262.3250427246094, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 0.10156962275505066, "rewards/margins": 1.0849241018295288, "rewards/rejected": -0.9833545684814453, "step": 126 }, { "epoch": 0.64, "learning_rate": 9.054485506854755e-07, "logits/chosen": -2.3785390853881836, "logits/rejected": -1.845327615737915, "logps/chosen": -404.6642150878906, "logps/rejected": -255.60223388671875, "loss": 0.3232, "rewards/accuracies": 1.0, "rewards/chosen": 0.09451548755168915, "rewards/margins": 1.0977139472961426, "rewards/rejected": -1.0031983852386475, "step": 127 }, { "epoch": 0.64, "learning_rate": 9.038794848403462e-07, "logits/chosen": -2.317946672439575, "logits/rejected": -1.7601368427276611, "logps/chosen": -432.4928283691406, "logps/rejected": -234.49168395996094, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 0.1047566756606102, "rewards/margins": 1.1364277601242065, "rewards/rejected": -1.0316710472106934, "step": 128 }, { "epoch": 0.65, "learning_rate": 9.022988898833342e-07, "logits/chosen": -2.385056972503662, "logits/rejected": -1.7618261575698853, "logps/chosen": -434.1672668457031, "logps/rejected": -263.1429748535156, "loss": 0.2761, "rewards/accuracies": 1.0, "rewards/chosen": 0.12021943181753159, "rewards/margins": 1.2107895612716675, "rewards/rejected": -1.090570092201233, "step": 129 }, { "epoch": 0.65, "learning_rate": 9.007068109339783e-07, "logits/chosen": -2.295499324798584, "logits/rejected": -1.728455662727356, "logps/chosen": -390.18328857421875, "logps/rejected": -249.26953125, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 0.08724518120288849, "rewards/margins": 1.17958402633667, "rewards/rejected": -1.0923389196395874, "step": 130 }, { "epoch": 0.66, "learning_rate": 8.991032934396386e-07, "logits/chosen": -2.3488993644714355, "logits/rejected": -1.720859408378601, "logps/chosen": -430.67755126953125, "logps/rejected": -247.77947998046875, "loss": 0.2759, "rewards/accuracies": 1.0, "rewards/chosen": 0.08383102715015411, "rewards/margins": 1.1948963403701782, "rewards/rejected": -1.11106538772583, "step": 131 }, { "epoch": 0.66, "learning_rate": 8.974883831741989e-07, "logits/chosen": -2.3680620193481445, "logits/rejected": -1.716318130493164, "logps/chosen": -430.8277587890625, "logps/rejected": -223.50978088378906, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": 0.10719554871320724, "rewards/margins": 1.1470305919647217, "rewards/rejected": -1.039834976196289, "step": 132 }, { "epoch": 0.67, "learning_rate": 8.958621262367598e-07, "logits/chosen": -2.230820655822754, "logits/rejected": -1.7033137083053589, "logps/chosen": -368.60015869140625, "logps/rejected": -229.6012725830078, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": 0.06556817144155502, "rewards/margins": 0.9582837820053101, "rewards/rejected": -0.8927156329154968, "step": 133 }, { "epoch": 0.67, "learning_rate": 8.942245690503238e-07, "logits/chosen": -2.3599350452423096, "logits/rejected": -1.7824196815490723, "logps/chosen": -421.876953125, "logps/rejected": -267.8891296386719, "loss": 0.2618, "rewards/accuracies": 1.0, "rewards/chosen": 0.1000775694847107, "rewards/margins": 1.4201455116271973, "rewards/rejected": -1.3200678825378418, "step": 134 }, { "epoch": 0.68, "learning_rate": 8.925757583604689e-07, "logits/chosen": -2.419376850128174, "logits/rejected": -1.784630298614502, "logps/chosen": -422.77655029296875, "logps/rejected": -230.90945434570312, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": 0.11120504140853882, "rewards/margins": 1.1198475360870361, "rewards/rejected": -1.0086424350738525, "step": 135 }, { "epoch": 0.68, "learning_rate": 8.909157412340149e-07, "logits/chosen": -2.3980765342712402, "logits/rejected": -1.7874189615249634, "logps/chosen": -418.0338134765625, "logps/rejected": -234.7857666015625, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": 0.06728719174861908, "rewards/margins": 1.1861863136291504, "rewards/rejected": -1.1188991069793701, "step": 136 }, { "epoch": 0.69, "learning_rate": 8.892445650576794e-07, "logits/chosen": -2.3215763568878174, "logits/rejected": -1.7103643417358398, "logps/chosen": -428.76812744140625, "logps/rejected": -249.06536865234375, "loss": 0.2627, "rewards/accuracies": 1.0, "rewards/chosen": 0.11178424209356308, "rewards/margins": 1.306850790977478, "rewards/rejected": -1.1950665712356567, "step": 137 }, { "epoch": 0.69, "learning_rate": 8.875622775367259e-07, "logits/chosen": -2.4116361141204834, "logits/rejected": -1.746964693069458, "logps/chosen": -417.92388916015625, "logps/rejected": -223.80828857421875, "loss": 0.2592, "rewards/accuracies": 1.0, "rewards/chosen": 0.15721920132637024, "rewards/margins": 1.274531602859497, "rewards/rejected": -1.1173124313354492, "step": 138 }, { "epoch": 0.7, "learning_rate": 8.858689266936008e-07, "logits/chosen": -2.3939359188079834, "logits/rejected": -1.7181127071380615, "logps/chosen": -423.7195129394531, "logps/rejected": -242.014404296875, "loss": 0.2581, "rewards/accuracies": 1.0, "rewards/chosen": 0.13689906895160675, "rewards/margins": 1.224957823753357, "rewards/rejected": -1.0880587100982666, "step": 139 }, { "epoch": 0.7, "learning_rate": 8.841645608665639e-07, "logits/chosen": -2.3554303646087646, "logits/rejected": -1.7345848083496094, "logps/chosen": -388.30328369140625, "logps/rejected": -240.800537109375, "loss": 0.2575, "rewards/accuracies": 1.0, "rewards/chosen": 0.1403249204158783, "rewards/margins": 1.3793652057647705, "rewards/rejected": -1.2390403747558594, "step": 140 }, { "epoch": 0.71, "learning_rate": 8.824492287083071e-07, "logits/chosen": -2.3161697387695312, "logits/rejected": -1.6870753765106201, "logps/chosen": -437.71258544921875, "logps/rejected": -224.8647918701172, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 0.11015091091394424, "rewards/margins": 1.287543773651123, "rewards/rejected": -1.1773927211761475, "step": 141 }, { "epoch": 0.71, "learning_rate": 8.807229791845671e-07, "logits/chosen": -2.4036004543304443, "logits/rejected": -1.8893616199493408, "logps/chosen": -397.8560791015625, "logps/rejected": -288.0263671875, "loss": 0.2447, "rewards/accuracies": 1.0, "rewards/chosen": 0.08967640995979309, "rewards/margins": 1.361191749572754, "rewards/rejected": -1.271515130996704, "step": 142 }, { "epoch": 0.72, "learning_rate": 8.789858615727264e-07, "logits/chosen": -2.43672513961792, "logits/rejected": -1.7865599393844604, "logps/chosen": -380.53253173828125, "logps/rejected": -255.96661376953125, "loss": 0.2501, "rewards/accuracies": 1.0, "rewards/chosen": 0.09457423537969589, "rewards/margins": 1.360377311706543, "rewards/rejected": -1.2658030986785889, "step": 143 }, { "epoch": 0.72, "learning_rate": 8.772379254604072e-07, "logits/chosen": -2.292776584625244, "logits/rejected": -1.7155327796936035, "logps/chosen": -432.17620849609375, "logps/rejected": -239.72390747070312, "loss": 0.2526, "rewards/accuracies": 1.0, "rewards/chosen": 0.12057928740978241, "rewards/margins": 1.3411376476287842, "rewards/rejected": -1.2205584049224854, "step": 144 }, { "epoch": 0.73, "learning_rate": 8.754792207440556e-07, "logits/chosen": -2.357753276824951, "logits/rejected": -1.6902601718902588, "logps/chosen": -449.3105163574219, "logps/rejected": -220.51034545898438, "loss": 0.2455, "rewards/accuracies": 1.0, "rewards/chosen": 0.13404032588005066, "rewards/margins": 1.2930368185043335, "rewards/rejected": -1.1589964628219604, "step": 145 }, { "epoch": 0.73, "learning_rate": 8.737097976275176e-07, "logits/chosen": -2.4060750007629395, "logits/rejected": -1.7780792713165283, "logps/chosen": -421.6581115722656, "logps/rejected": -236.85427856445312, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 0.11715394258499146, "rewards/margins": 1.4071135520935059, "rewards/rejected": -1.2899596691131592, "step": 146 }, { "epoch": 0.74, "learning_rate": 8.719297066206059e-07, "logits/chosen": -2.3706471920013428, "logits/rejected": -1.7989836931228638, "logps/chosen": -424.7387390136719, "logps/rejected": -268.2914733886719, "loss": 0.2365, "rewards/accuracies": 1.0, "rewards/chosen": 0.10021540522575378, "rewards/margins": 1.3642604351043701, "rewards/rejected": -1.264045000076294, "step": 147 }, { "epoch": 0.74, "learning_rate": 8.701389985376577e-07, "logits/chosen": -2.3071541786193848, "logits/rejected": -1.6954561471939087, "logps/chosen": -418.8785095214844, "logps/rejected": -234.91978454589844, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": 0.08363698422908783, "rewards/margins": 1.3431799411773682, "rewards/rejected": -1.2595430612564087, "step": 148 }, { "epoch": 0.75, "learning_rate": 8.683377244960846e-07, "logits/chosen": -2.3204758167266846, "logits/rejected": -1.6942288875579834, "logps/chosen": -389.1016845703125, "logps/rejected": -223.63568115234375, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 0.09061349928379059, "rewards/margins": 1.304807424545288, "rewards/rejected": -1.2141939401626587, "step": 149 }, { "epoch": 0.75, "learning_rate": 8.66525935914913e-07, "logits/chosen": -2.2673897743225098, "logits/rejected": -1.7737685441970825, "logps/chosen": -390.1370849609375, "logps/rejected": -254.79981994628906, "loss": 0.246, "rewards/accuracies": 1.0, "rewards/chosen": 0.10101611167192459, "rewards/margins": 1.3608810901641846, "rewards/rejected": -1.2598650455474854, "step": 150 }, { "epoch": 0.76, "learning_rate": 8.647036845133171e-07, "logits/chosen": -2.359968662261963, "logits/rejected": -1.748475193977356, "logps/chosen": -448.51556396484375, "logps/rejected": -254.92457580566406, "loss": 0.2069, "rewards/accuracies": 1.0, "rewards/chosen": 0.2057947814464569, "rewards/margins": 1.563599705696106, "rewards/rejected": -1.3578048944473267, "step": 151 }, { "epoch": 0.76, "learning_rate": 8.62871022309141e-07, "logits/chosen": -2.3874001502990723, "logits/rejected": -1.822591781616211, "logps/chosen": -435.06134033203125, "logps/rejected": -250.98220825195312, "loss": 0.2309, "rewards/accuracies": 1.0, "rewards/chosen": 0.13095475733280182, "rewards/margins": 1.4472978115081787, "rewards/rejected": -1.316343069076538, "step": 152 }, { "epoch": 0.77, "learning_rate": 8.610280016174155e-07, "logits/chosen": -2.384272813796997, "logits/rejected": -1.7946633100509644, "logps/chosen": -443.0965881347656, "logps/rejected": -274.38037109375, "loss": 0.2023, "rewards/accuracies": 1.0, "rewards/chosen": 0.0783994048833847, "rewards/margins": 1.5976977348327637, "rewards/rejected": -1.5192983150482178, "step": 153 }, { "epoch": 0.77, "learning_rate": 8.591746750488637e-07, "logits/chosen": -2.3081254959106445, "logits/rejected": -1.6912481784820557, "logps/chosen": -451.3265686035156, "logps/rejected": -236.5503387451172, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 0.141992449760437, "rewards/margins": 1.462505578994751, "rewards/rejected": -1.3205132484436035, "step": 154 }, { "epoch": 0.78, "learning_rate": 8.573110955083996e-07, "logits/chosen": -2.404266834259033, "logits/rejected": -1.7257637977600098, "logps/chosen": -446.27020263671875, "logps/rejected": -237.20352172851562, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 0.14163005352020264, "rewards/margins": 1.5048242807388306, "rewards/rejected": -1.363194227218628, "step": 155 }, { "epoch": 0.78, "learning_rate": 8.554373161936175e-07, "logits/chosen": -2.3184006214141846, "logits/rejected": -1.7596100568771362, "logps/chosen": -427.9510192871094, "logps/rejected": -243.73013305664062, "loss": 0.2111, "rewards/accuracies": 1.0, "rewards/chosen": 0.13798728585243225, "rewards/margins": 1.4847362041473389, "rewards/rejected": -1.3467490673065186, "step": 156 }, { "epoch": 0.79, "learning_rate": 8.535533905932737e-07, "logits/chosen": -2.3209409713745117, "logits/rejected": -1.7519683837890625, "logps/chosen": -414.7841796875, "logps/rejected": -238.59425354003906, "loss": 0.1882, "rewards/accuracies": 1.0, "rewards/chosen": 0.14703166484832764, "rewards/margins": 1.5360753536224365, "rewards/rejected": -1.3890438079833984, "step": 157 }, { "epoch": 0.79, "learning_rate": 8.516593724857597e-07, "logits/chosen": -2.325869083404541, "logits/rejected": -1.7032279968261719, "logps/chosen": -433.857666015625, "logps/rejected": -242.71237182617188, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 0.11811625957489014, "rewards/margins": 1.5760319232940674, "rewards/rejected": -1.4579156637191772, "step": 158 }, { "epoch": 0.8, "learning_rate": 8.49755315937567e-07, "logits/chosen": -2.4094128608703613, "logits/rejected": -1.7675931453704834, "logps/chosen": -427.9541320800781, "logps/rejected": -246.40489196777344, "loss": 0.1939, "rewards/accuracies": 1.0, "rewards/chosen": 0.1982579529285431, "rewards/margins": 1.6175042390823364, "rewards/rejected": -1.4192461967468262, "step": 159 }, { "epoch": 0.8, "learning_rate": 8.478412753017432e-07, "logits/chosen": -2.388759136199951, "logits/rejected": -1.7635258436203003, "logps/chosen": -456.8898620605469, "logps/rejected": -263.1271057128906, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 0.09290695935487747, "rewards/margins": 1.6084740161895752, "rewards/rejected": -1.5155669450759888, "step": 160 }, { "epoch": 0.81, "learning_rate": 8.459173052163413e-07, "logits/chosen": -2.310825824737549, "logits/rejected": -1.7240328788757324, "logps/chosen": -396.31964111328125, "logps/rejected": -232.20787048339844, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": 0.009786097332835197, "rewards/margins": 1.4758687019348145, "rewards/rejected": -1.4660826921463013, "step": 161 }, { "epoch": 0.81, "learning_rate": 8.439834606028593e-07, "logits/chosen": -2.3431029319763184, "logits/rejected": -1.7749687433242798, "logps/chosen": -395.314453125, "logps/rejected": -243.02005004882812, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": 0.11201668530702591, "rewards/margins": 1.5237908363342285, "rewards/rejected": -1.4117743968963623, "step": 162 }, { "epoch": 0.82, "learning_rate": 8.420397966646731e-07, "logits/chosen": -2.385366439819336, "logits/rejected": -1.77970290184021, "logps/chosen": -391.3575744628906, "logps/rejected": -231.453369140625, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 0.2382354736328125, "rewards/margins": 1.6484943628311157, "rewards/rejected": -1.4102587699890137, "step": 163 }, { "epoch": 0.82, "learning_rate": 8.400863688854596e-07, "logits/chosen": -2.32381010055542, "logits/rejected": -1.6914910078048706, "logps/chosen": -438.3463134765625, "logps/rejected": -249.5445556640625, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": 0.09939169138669968, "rewards/margins": 1.6547143459320068, "rewards/rejected": -1.5553228855133057, "step": 164 }, { "epoch": 0.83, "learning_rate": 8.381232330276143e-07, "logits/chosen": -2.336649179458618, "logits/rejected": -1.6781638860702515, "logps/chosen": -409.78582763671875, "logps/rejected": -247.93991088867188, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": 0.02594146691262722, "rewards/margins": 1.537541389465332, "rewards/rejected": -1.5115998983383179, "step": 165 }, { "epoch": 0.83, "learning_rate": 8.361504451306584e-07, "logits/chosen": -2.33518648147583, "logits/rejected": -1.7061748504638672, "logps/chosen": -416.81689453125, "logps/rejected": -243.02435302734375, "loss": 0.1723, "rewards/accuracies": 1.0, "rewards/chosen": 0.21223171055316925, "rewards/margins": 1.6770662069320679, "rewards/rejected": -1.4648343324661255, "step": 166 }, { "epoch": 0.84, "learning_rate": 8.341680615096391e-07, "logits/chosen": -2.3758351802825928, "logits/rejected": -1.7754679918289185, "logps/chosen": -420.1386413574219, "logps/rejected": -248.68569946289062, "loss": 0.1858, "rewards/accuracies": 1.0, "rewards/chosen": 0.11341756582260132, "rewards/margins": 1.6961886882781982, "rewards/rejected": -1.5827710628509521, "step": 167 }, { "epoch": 0.84, "learning_rate": 8.321761387535229e-07, "logits/chosen": -2.3806052207946777, "logits/rejected": -1.7757240533828735, "logps/chosen": -430.906982421875, "logps/rejected": -251.41799926757812, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 0.0926055982708931, "rewards/margins": 1.7989130020141602, "rewards/rejected": -1.7063074111938477, "step": 168 }, { "epoch": 0.85, "learning_rate": 8.301747337235796e-07, "logits/chosen": -2.3566908836364746, "logits/rejected": -1.7478523254394531, "logps/chosen": -432.0615539550781, "logps/rejected": -249.97256469726562, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": 0.14809469878673553, "rewards/margins": 1.7186052799224854, "rewards/rejected": -1.5705105066299438, "step": 169 }, { "epoch": 0.85, "learning_rate": 8.281639035517591e-07, "logits/chosen": -2.3831849098205566, "logits/rejected": -1.7154203653335571, "logps/chosen": -425.0347595214844, "logps/rejected": -250.52938842773438, "loss": 0.1712, "rewards/accuracies": 1.0, "rewards/chosen": 0.10739822685718536, "rewards/margins": 1.6952239274978638, "rewards/rejected": -1.5878257751464844, "step": 170 }, { "epoch": 0.86, "learning_rate": 8.261437056390606e-07, "logits/chosen": -2.3428475856781006, "logits/rejected": -1.7119221687316895, "logps/chosen": -427.6769714355469, "logps/rejected": -224.28375244140625, "loss": 0.1755, "rewards/accuracies": 1.0, "rewards/chosen": 0.17022070288658142, "rewards/margins": 1.7057090997695923, "rewards/rejected": -1.535488486289978, "step": 171 }, { "epoch": 0.86, "learning_rate": 8.241141976538942e-07, "logits/chosen": -2.274014472961426, "logits/rejected": -1.7084298133850098, "logps/chosen": -455.44439697265625, "logps/rejected": -257.0169982910156, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 0.11147614568471909, "rewards/margins": 1.7180328369140625, "rewards/rejected": -1.6065566539764404, "step": 172 }, { "epoch": 0.87, "learning_rate": 8.220754375304347e-07, "logits/chosen": -2.399148464202881, "logits/rejected": -1.7076518535614014, "logps/chosen": -448.07220458984375, "logps/rejected": -242.0417022705078, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 0.11448797583580017, "rewards/margins": 1.7063040733337402, "rewards/rejected": -1.5918160676956177, "step": 173 }, { "epoch": 0.87, "learning_rate": 8.200274834669675e-07, "logits/chosen": -2.371319055557251, "logits/rejected": -1.7134429216384888, "logps/chosen": -448.1710205078125, "logps/rejected": -235.3464813232422, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 0.15597115457057953, "rewards/margins": 1.691119909286499, "rewards/rejected": -1.5351487398147583, "step": 174 }, { "epoch": 0.88, "learning_rate": 8.179703939242275e-07, "logits/chosen": -2.4326086044311523, "logits/rejected": -1.7696926593780518, "logps/chosen": -413.26129150390625, "logps/rejected": -239.28012084960938, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 0.18165868520736694, "rewards/margins": 1.9033472537994385, "rewards/rejected": -1.7216886281967163, "step": 175 }, { "epoch": 0.88, "learning_rate": 8.159042276237307e-07, "logits/chosen": -2.3397960662841797, "logits/rejected": -1.7284985780715942, "logps/chosen": -383.0244445800781, "logps/rejected": -244.87164306640625, "loss": 0.1636, "rewards/accuracies": 1.0, "rewards/chosen": 0.10478147119283676, "rewards/margins": 1.6815428733825684, "rewards/rejected": -1.5767613649368286, "step": 176 }, { "epoch": 0.89, "learning_rate": 8.138290435460968e-07, "logits/chosen": -2.3284847736358643, "logits/rejected": -1.7142457962036133, "logps/chosen": -420.0850830078125, "logps/rejected": -246.7850341796875, "loss": 0.1615, "rewards/accuracies": 1.0, "rewards/chosen": 0.04485499486327171, "rewards/margins": 1.7577266693115234, "rewards/rejected": -1.7128715515136719, "step": 177 }, { "epoch": 0.89, "learning_rate": 8.117449009293668e-07, "logits/chosen": -2.320773124694824, "logits/rejected": -1.737979531288147, "logps/chosen": -433.32684326171875, "logps/rejected": -260.42266845703125, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 0.2081751525402069, "rewards/margins": 1.9847412109375, "rewards/rejected": -1.7765660285949707, "step": 178 }, { "epoch": 0.9, "learning_rate": 8.096518592673111e-07, "logits/chosen": -2.2880947589874268, "logits/rejected": -1.7228150367736816, "logps/chosen": -411.1377868652344, "logps/rejected": -237.13084411621094, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": 0.18292337656021118, "rewards/margins": 1.8514668941497803, "rewards/rejected": -1.6685435771942139, "step": 179 }, { "epoch": 0.9, "learning_rate": 8.07549978307732e-07, "logits/chosen": -2.406970977783203, "logits/rejected": -1.7610397338867188, "logps/chosen": -423.91461181640625, "logps/rejected": -248.35716247558594, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 0.15978673100471497, "rewards/margins": 2.0041348934173584, "rewards/rejected": -1.8443483114242554, "step": 180 }, { "epoch": 0.91, "learning_rate": 8.054393180507571e-07, "logits/chosen": -2.3204801082611084, "logits/rejected": -1.6617151498794556, "logps/chosen": -436.10968017578125, "logps/rejected": -247.89251708984375, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 0.21290132403373718, "rewards/margins": 1.9405009746551514, "rewards/rejected": -1.7275996208190918, "step": 181 }, { "epoch": 0.91, "learning_rate": 8.033199387471276e-07, "logits/chosen": -2.409226894378662, "logits/rejected": -1.7613660097122192, "logps/chosen": -441.06793212890625, "logps/rejected": -256.4024658203125, "loss": 0.1485, "rewards/accuracies": 1.0, "rewards/chosen": 0.09123332053422928, "rewards/margins": 1.8427760601043701, "rewards/rejected": -1.7515426874160767, "step": 182 }, { "epoch": 0.92, "learning_rate": 8.011919008964779e-07, "logits/chosen": -2.2990188598632812, "logits/rejected": -1.7037487030029297, "logps/chosen": -457.4071044921875, "logps/rejected": -270.1855163574219, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 0.20898641645908356, "rewards/margins": 1.9484333992004395, "rewards/rejected": -1.7394468784332275, "step": 183 }, { "epoch": 0.92, "learning_rate": 7.990552652456079e-07, "logits/chosen": -2.3206450939178467, "logits/rejected": -1.7227449417114258, "logps/chosen": -431.5004577636719, "logps/rejected": -259.06439208984375, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 0.10346387326717377, "rewards/margins": 1.804671049118042, "rewards/rejected": -1.701207160949707, "step": 184 }, { "epoch": 0.93, "learning_rate": 7.969100927867507e-07, "logits/chosen": -2.3881309032440186, "logits/rejected": -1.7506178617477417, "logps/chosen": -439.07781982421875, "logps/rejected": -244.4168701171875, "loss": 0.1375, "rewards/accuracies": 1.0, "rewards/chosen": 0.1506747007369995, "rewards/margins": 1.8724853992462158, "rewards/rejected": -1.7218106985092163, "step": 185 }, { "epoch": 0.93, "learning_rate": 7.947564447558299e-07, "logits/chosen": -2.337578296661377, "logits/rejected": -1.741979956626892, "logps/chosen": -422.3631896972656, "logps/rejected": -245.05859375, "loss": 0.1479, "rewards/accuracies": 1.0, "rewards/chosen": 0.1314590573310852, "rewards/margins": 1.8891122341156006, "rewards/rejected": -1.7576531171798706, "step": 186 }, { "epoch": 0.94, "learning_rate": 7.925943826307117e-07, "logits/chosen": -2.3357491493225098, "logits/rejected": -1.706378698348999, "logps/chosen": -463.224609375, "logps/rejected": -252.25180053710938, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 0.15241114795207977, "rewards/margins": 2.0661392211914062, "rewards/rejected": -1.913727879524231, "step": 187 }, { "epoch": 0.94, "learning_rate": 7.904239681294513e-07, "logits/chosen": -2.328556537628174, "logits/rejected": -1.7144217491149902, "logps/chosen": -436.79632568359375, "logps/rejected": -261.33087158203125, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": 0.19306030869483948, "rewards/margins": 2.0765292644500732, "rewards/rejected": -1.8834688663482666, "step": 188 }, { "epoch": 0.95, "learning_rate": 7.882452632085295e-07, "logits/chosen": -2.3383805751800537, "logits/rejected": -1.773716688156128, "logps/chosen": -456.63494873046875, "logps/rejected": -242.8838653564453, "loss": 0.1424, "rewards/accuracies": 1.0, "rewards/chosen": 0.16036759316921234, "rewards/margins": 1.975510835647583, "rewards/rejected": -1.815143346786499, "step": 189 }, { "epoch": 0.95, "learning_rate": 7.860583300610847e-07, "logits/chosen": -2.3597428798675537, "logits/rejected": -1.7476630210876465, "logps/chosen": -430.1130065917969, "logps/rejected": -258.6775817871094, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 0.10283038020133972, "rewards/margins": 1.9616904258728027, "rewards/rejected": -1.8588601350784302, "step": 190 }, { "epoch": 0.96, "learning_rate": 7.838632311151383e-07, "logits/chosen": -2.326508045196533, "logits/rejected": -1.7232400178909302, "logps/chosen": -424.65625, "logps/rejected": -238.4003448486328, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 0.1308087706565857, "rewards/margins": 1.8875113725662231, "rewards/rejected": -1.7567026615142822, "step": 191 }, { "epoch": 0.96, "learning_rate": 7.81660029031811e-07, "logits/chosen": -2.367722749710083, "logits/rejected": -1.7441189289093018, "logps/chosen": -443.6425476074219, "logps/rejected": -265.9601135253906, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 0.08243382722139359, "rewards/margins": 2.0577261447906494, "rewards/rejected": -1.975292444229126, "step": 192 }, { "epoch": 0.97, "learning_rate": 7.794487867035358e-07, "logits/chosen": -2.3231446743011475, "logits/rejected": -1.778619647026062, "logps/chosen": -398.2384338378906, "logps/rejected": -227.8143310546875, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": 0.08439954370260239, "rewards/margins": 1.785447597503662, "rewards/rejected": -1.7010481357574463, "step": 193 }, { "epoch": 0.97, "learning_rate": 7.772295672522614e-07, "logits/chosen": -2.3635432720184326, "logits/rejected": -1.739173173904419, "logps/chosen": -403.1354064941406, "logps/rejected": -225.5125732421875, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 0.22871577739715576, "rewards/margins": 2.12044095993042, "rewards/rejected": -1.8917250633239746, "step": 194 }, { "epoch": 0.98, "learning_rate": 7.750024340276512e-07, "logits/chosen": -2.388641119003296, "logits/rejected": -1.7989921569824219, "logps/chosen": -419.755615234375, "logps/rejected": -246.90708923339844, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 0.09495342522859573, "rewards/margins": 2.041985273361206, "rewards/rejected": -1.9470319747924805, "step": 195 }, { "epoch": 0.98, "learning_rate": 7.727674506052743e-07, "logits/chosen": -2.309828758239746, "logits/rejected": -1.7137950658798218, "logps/chosen": -397.6074523925781, "logps/rejected": -230.72744750976562, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 0.12126573920249939, "rewards/margins": 1.8476148843765259, "rewards/rejected": -1.726349115371704, "step": 196 }, { "epoch": 0.99, "learning_rate": 7.70524680784791e-07, "logits/chosen": -2.315549850463867, "logits/rejected": -1.7068395614624023, "logps/chosen": -451.6809387207031, "logps/rejected": -262.5193176269531, "loss": 0.1281, "rewards/accuracies": 1.0, "rewards/chosen": 0.0650506317615509, "rewards/margins": 2.1650729179382324, "rewards/rejected": -2.100022315979004, "step": 197 }, { "epoch": 0.99, "learning_rate": 7.682741885881314e-07, "logits/chosen": -2.3721389770507812, "logits/rejected": -1.7505693435668945, "logps/chosen": -404.636474609375, "logps/rejected": -256.31707763671875, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 0.11515708267688751, "rewards/margins": 2.078113555908203, "rewards/rejected": -1.962956428527832, "step": 198 }, { "epoch": 1.0, "learning_rate": 7.660160382576683e-07, "logits/chosen": -2.3731746673583984, "logits/rejected": -1.8136448860168457, "logps/chosen": -397.923828125, "logps/rejected": -263.9638977050781, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 0.08503112941980362, "rewards/margins": 2.1683878898620605, "rewards/rejected": -2.0833568572998047, "step": 199 }, { "epoch": 1.0, "learning_rate": 7.637502942543823e-07, "logits/chosen": -2.3539133071899414, "logits/rejected": -1.7467124462127686, "logps/chosen": -424.90045166015625, "logps/rejected": -280.39691162109375, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 0.07011514902114868, "rewards/margins": 2.352435827255249, "rewards/rejected": -2.282320499420166, "step": 200 }, { "epoch": 1.01, "learning_rate": 7.614770212560233e-07, "logits/chosen": -2.3378798961639404, "logits/rejected": -1.748886227607727, "logps/chosen": -423.13775634765625, "logps/rejected": -246.6858673095703, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 0.1511233150959015, "rewards/margins": 2.1637344360351562, "rewards/rejected": -2.012610912322998, "step": 201 }, { "epoch": 1.01, "learning_rate": 7.591962841552626e-07, "logits/chosen": -2.380680799484253, "logits/rejected": -1.7155252695083618, "logps/chosen": -389.64910888671875, "logps/rejected": -255.35784912109375, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 0.03893038630485535, "rewards/margins": 1.9744110107421875, "rewards/rejected": -1.9354807138442993, "step": 202 }, { "epoch": 1.02, "learning_rate": 7.569081480578412e-07, "logits/chosen": -2.37414288520813, "logits/rejected": -1.7875672578811646, "logps/chosen": -383.9082946777344, "logps/rejected": -249.59518432617188, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 0.15397796034812927, "rewards/margins": 2.061845064163208, "rewards/rejected": -1.9078669548034668, "step": 203 }, { "epoch": 1.02, "learning_rate": 7.546126782807117e-07, "logits/chosen": -2.337282419204712, "logits/rejected": -1.7536512613296509, "logps/chosen": -431.340576171875, "logps/rejected": -254.70860290527344, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 0.17993292212486267, "rewards/margins": 2.2500832080841064, "rewards/rejected": -2.070150375366211, "step": 204 }, { "epoch": 1.03, "learning_rate": 7.523099403501729e-07, "logits/chosen": -2.2855844497680664, "logits/rejected": -1.683856725692749, "logps/chosen": -444.0831604003906, "logps/rejected": -251.0932159423828, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 0.0007780734449625015, "rewards/margins": 2.09063720703125, "rewards/rejected": -2.0898592472076416, "step": 205 }, { "epoch": 1.03, "learning_rate": 7.5e-07, "logits/chosen": -2.351289749145508, "logits/rejected": -1.6911373138427734, "logps/chosen": -435.5600280761719, "logps/rejected": -255.68995666503906, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 0.17635804414749146, "rewards/margins": 2.300365924835205, "rewards/rejected": -2.1240081787109375, "step": 206 }, { "epoch": 1.04, "learning_rate": 7.476829231695679e-07, "logits/chosen": -2.376387119293213, "logits/rejected": -1.7667851448059082, "logps/chosen": -404.8138427734375, "logps/rejected": -259.134765625, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 0.1162160262465477, "rewards/margins": 2.206512451171875, "rewards/rejected": -2.0902962684631348, "step": 207 }, { "epoch": 1.04, "learning_rate": 7.45358776001969e-07, "logits/chosen": -2.3352673053741455, "logits/rejected": -1.7181344032287598, "logps/chosen": -412.27734375, "logps/rejected": -264.72216796875, "loss": 0.1197, "rewards/accuracies": 1.0, "rewards/chosen": 0.13626988232135773, "rewards/margins": 2.24605655670166, "rewards/rejected": -2.1097865104675293, "step": 208 }, { "epoch": 1.05, "learning_rate": 7.430276248421246e-07, "logits/chosen": -2.3441288471221924, "logits/rejected": -1.750102162361145, "logps/chosen": -442.572265625, "logps/rejected": -261.34246826171875, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 0.2355346828699112, "rewards/margins": 2.4083404541015625, "rewards/rejected": -2.1728057861328125, "step": 209 }, { "epoch": 1.05, "learning_rate": 7.406895362348915e-07, "logits/chosen": -2.308690309524536, "logits/rejected": -1.7486011981964111, "logps/chosen": -419.981201171875, "logps/rejected": -259.8365478515625, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 0.0854237973690033, "rewards/margins": 2.2288451194763184, "rewards/rejected": -2.1434214115142822, "step": 210 }, { "epoch": 1.06, "learning_rate": 7.383445769231627e-07, "logits/chosen": -2.3371410369873047, "logits/rejected": -1.7576262950897217, "logps/chosen": -429.950439453125, "logps/rejected": -269.0849609375, "loss": 0.1125, "rewards/accuracies": 1.0, "rewards/chosen": 0.04518279805779457, "rewards/margins": 2.189702033996582, "rewards/rejected": -2.144519329071045, "step": 211 }, { "epoch": 1.06, "learning_rate": 7.359928138459615e-07, "logits/chosen": -2.3197529315948486, "logits/rejected": -1.723442792892456, "logps/chosen": -431.7857360839844, "logps/rejected": -252.51528930664062, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 0.06412442773580551, "rewards/margins": 2.1517128944396973, "rewards/rejected": -2.087588310241699, "step": 212 }, { "epoch": 1.07, "learning_rate": 7.33634314136531e-07, "logits/chosen": -2.434603214263916, "logits/rejected": -1.7381677627563477, "logps/chosen": -429.2737731933594, "logps/rejected": -274.3027038574219, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 0.1759287565946579, "rewards/margins": 2.5088939666748047, "rewards/rejected": -2.332965135574341, "step": 213 }, { "epoch": 1.07, "learning_rate": 7.312691451204177e-07, "logits/chosen": -2.302788734436035, "logits/rejected": -1.7119388580322266, "logps/chosen": -403.673583984375, "logps/rejected": -258.3208312988281, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 0.04119987413287163, "rewards/margins": 2.42299747467041, "rewards/rejected": -2.3817975521087646, "step": 214 }, { "epoch": 1.08, "learning_rate": 7.288973743135494e-07, "logits/chosen": -2.3675503730773926, "logits/rejected": -1.795715570449829, "logps/chosen": -372.0311279296875, "logps/rejected": -247.83828735351562, "loss": 0.1058, "rewards/accuracies": 1.0, "rewards/chosen": 0.07224629819393158, "rewards/margins": 2.159656524658203, "rewards/rejected": -2.0874102115631104, "step": 215 }, { "epoch": 1.08, "learning_rate": 7.265190694203085e-07, "logits/chosen": -2.344999074935913, "logits/rejected": -1.7765824794769287, "logps/chosen": -392.13128662109375, "logps/rejected": -266.5599060058594, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 0.051146697252988815, "rewards/margins": 2.1383237838745117, "rewards/rejected": -2.087177276611328, "step": 216 }, { "epoch": 1.09, "learning_rate": 7.241342983315985e-07, "logits/chosen": -2.3834431171417236, "logits/rejected": -1.8301745653152466, "logps/chosen": -434.12109375, "logps/rejected": -291.0428466796875, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 0.24449846148490906, "rewards/margins": 2.718428373336792, "rewards/rejected": -2.4739298820495605, "step": 217 }, { "epoch": 1.09, "learning_rate": 7.217431291229067e-07, "logits/chosen": -2.3413658142089844, "logits/rejected": -1.7566564083099365, "logps/chosen": -410.8603515625, "logps/rejected": -249.85227966308594, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 0.10216370224952698, "rewards/margins": 2.267808198928833, "rewards/rejected": -2.165644645690918, "step": 218 }, { "epoch": 1.1, "learning_rate": 7.193456300523606e-07, "logits/chosen": -2.341492176055908, "logits/rejected": -1.678481101989746, "logps/chosen": -436.3293151855469, "logps/rejected": -241.99493408203125, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 0.19662807881832123, "rewards/margins": 2.3196163177490234, "rewards/rejected": -2.122988224029541, "step": 219 }, { "epoch": 1.1, "learning_rate": 7.16941869558779e-07, "logits/chosen": -2.2765002250671387, "logits/rejected": -1.722936987876892, "logps/chosen": -390.4914855957031, "logps/rejected": -242.495361328125, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 0.12669438123703003, "rewards/margins": 2.1280059814453125, "rewards/rejected": -2.001311779022217, "step": 220 }, { "epoch": 1.11, "learning_rate": 7.145319162597195e-07, "logits/chosen": -2.351325273513794, "logits/rejected": -1.7813061475753784, "logps/chosen": -395.7627868652344, "logps/rejected": -241.35682678222656, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 0.12300366163253784, "rewards/margins": 2.2347261905670166, "rewards/rejected": -2.111722469329834, "step": 221 }, { "epoch": 1.11, "learning_rate": 7.121158389495185e-07, "logits/chosen": -2.4053966999053955, "logits/rejected": -1.7174131870269775, "logps/chosen": -427.7624206542969, "logps/rejected": -234.64218139648438, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 0.20366746187210083, "rewards/margins": 2.489047050476074, "rewards/rejected": -2.285379409790039, "step": 222 }, { "epoch": 1.12, "learning_rate": 7.096937065973284e-07, "logits/chosen": -2.3479108810424805, "logits/rejected": -1.737382173538208, "logps/chosen": -435.11273193359375, "logps/rejected": -270.39312744140625, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": 0.22516277432441711, "rewards/margins": 2.5252723693847656, "rewards/rejected": -2.300109624862671, "step": 223 }, { "epoch": 1.12, "learning_rate": 7.072655883451477e-07, "logits/chosen": -2.342780351638794, "logits/rejected": -1.7374378442764282, "logps/chosen": -431.38177490234375, "logps/rejected": -254.7841796875, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 0.13850834965705872, "rewards/margins": 2.4736056327819824, "rewards/rejected": -2.335097312927246, "step": 224 }, { "epoch": 1.13, "learning_rate": 7.048315535058484e-07, "logits/chosen": -2.267658233642578, "logits/rejected": -1.7265418767929077, "logps/chosen": -397.67974853515625, "logps/rejected": -232.21173095703125, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 0.14593544602394104, "rewards/margins": 2.084913730621338, "rewards/rejected": -1.9389784336090088, "step": 225 }, { "epoch": 1.13, "learning_rate": 7.023916715611968e-07, "logits/chosen": -2.4059083461761475, "logits/rejected": -1.7588121891021729, "logps/chosen": -440.70269775390625, "logps/rejected": -246.7205352783203, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": 0.18735402822494507, "rewards/margins": 2.4085888862609863, "rewards/rejected": -2.2212347984313965, "step": 226 }, { "epoch": 1.14, "learning_rate": 6.999460121598704e-07, "logits/chosen": -2.3134045600891113, "logits/rejected": -1.7372225522994995, "logps/chosen": -427.0145568847656, "logps/rejected": -256.63580322265625, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": 0.09429550170898438, "rewards/margins": 2.32369065284729, "rewards/rejected": -2.2293949127197266, "step": 227 }, { "epoch": 1.14, "learning_rate": 6.974946451154693e-07, "logits/chosen": -2.3103394508361816, "logits/rejected": -1.678167462348938, "logps/chosen": -436.6080627441406, "logps/rejected": -242.18801879882812, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": 0.08466848731040955, "rewards/margins": 2.1948671340942383, "rewards/rejected": -2.110198497772217, "step": 228 }, { "epoch": 1.15, "learning_rate": 6.950376404045234e-07, "logits/chosen": -2.3920536041259766, "logits/rejected": -1.7544630765914917, "logps/chosen": -415.2604064941406, "logps/rejected": -235.70162963867188, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 0.16189856827259064, "rewards/margins": 2.3796370029449463, "rewards/rejected": -2.217738389968872, "step": 229 }, { "epoch": 1.15, "learning_rate": 6.925750681644953e-07, "logits/chosen": -2.367316484451294, "logits/rejected": -1.766503095626831, "logps/chosen": -422.7502136230469, "logps/rejected": -244.46307373046875, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 0.12147115916013718, "rewards/margins": 2.4595017433166504, "rewards/rejected": -2.3380305767059326, "step": 230 }, { "epoch": 1.16, "learning_rate": 6.901069986917779e-07, "logits/chosen": -2.3849711418151855, "logits/rejected": -1.7086421251296997, "logps/chosen": -438.94384765625, "logps/rejected": -246.41441345214844, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 0.13257014751434326, "rewards/margins": 2.4610097408294678, "rewards/rejected": -2.328439712524414, "step": 231 }, { "epoch": 1.16, "learning_rate": 6.876335024396871e-07, "logits/chosen": -2.310183048248291, "logits/rejected": -1.7083977460861206, "logps/chosen": -426.9100646972656, "logps/rejected": -263.59747314453125, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 0.1016690656542778, "rewards/margins": 2.4162721633911133, "rewards/rejected": -2.314603090286255, "step": 232 }, { "epoch": 1.17, "learning_rate": 6.851546500164518e-07, "logits/chosen": -2.304593324661255, "logits/rejected": -1.7517566680908203, "logps/chosen": -419.87615966796875, "logps/rejected": -254.977783203125, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": 0.17918117344379425, "rewards/margins": 2.6142454147338867, "rewards/rejected": -2.4350640773773193, "step": 233 }, { "epoch": 1.17, "learning_rate": 6.826705121831976e-07, "logits/chosen": -2.370234251022339, "logits/rejected": -1.7599321603775024, "logps/chosen": -438.5599365234375, "logps/rejected": -281.7724914550781, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 0.15683645009994507, "rewards/margins": 2.8124709129333496, "rewards/rejected": -2.655634641647339, "step": 234 }, { "epoch": 1.18, "learning_rate": 6.801811598519267e-07, "logits/chosen": -2.338111162185669, "logits/rejected": -1.7682147026062012, "logps/chosen": -424.9007568359375, "logps/rejected": -255.73095703125, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 0.21471533179283142, "rewards/margins": 2.6777727603912354, "rewards/rejected": -2.463057518005371, "step": 235 }, { "epoch": 1.18, "learning_rate": 6.776866640834944e-07, "logits/chosen": -2.3958899974823, "logits/rejected": -1.7253276109695435, "logps/chosen": -419.009033203125, "logps/rejected": -257.5193786621094, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 0.13883796334266663, "rewards/margins": 2.5850179195404053, "rewards/rejected": -2.4461798667907715, "step": 236 }, { "epoch": 1.19, "learning_rate": 6.751870960855799e-07, "logits/chosen": -2.423508644104004, "logits/rejected": -1.7694196701049805, "logps/chosen": -414.6655578613281, "logps/rejected": -256.6501770019531, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 0.16208650171756744, "rewards/margins": 2.4537010192871094, "rewards/rejected": -2.291614532470703, "step": 237 }, { "epoch": 1.19, "learning_rate": 6.726825272106538e-07, "logits/chosen": -2.3706841468811035, "logits/rejected": -1.7054877281188965, "logps/chosen": -412.6179504394531, "logps/rejected": -234.31256103515625, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 0.10194601118564606, "rewards/margins": 2.4212799072265625, "rewards/rejected": -2.319334030151367, "step": 238 }, { "epoch": 1.2, "learning_rate": 6.701730289539416e-07, "logits/chosen": -2.3876986503601074, "logits/rejected": -1.7459017038345337, "logps/chosen": -436.50738525390625, "logps/rejected": -266.0874328613281, "loss": 0.0962, "rewards/accuracies": 1.0, "rewards/chosen": 0.07048556208610535, "rewards/margins": 2.5263779163360596, "rewards/rejected": -2.455892562866211, "step": 239 }, { "epoch": 1.2, "learning_rate": 6.676586729513822e-07, "logits/chosen": -2.401823043823242, "logits/rejected": -1.716629147529602, "logps/chosen": -404.302734375, "logps/rejected": -259.7847595214844, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 0.19542033970355988, "rewards/margins": 2.5385708808898926, "rewards/rejected": -2.3431506156921387, "step": 240 }, { "epoch": 1.21, "learning_rate": 6.651395309775836e-07, "logits/chosen": -2.264378547668457, "logits/rejected": -1.6765244007110596, "logps/chosen": -369.4326477050781, "logps/rejected": -213.77305603027344, "loss": 0.0932, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.13931477069854736, "rewards/margins": 2.2793521881103516, "rewards/rejected": -2.1400372982025146, "step": 241 }, { "epoch": 1.21, "learning_rate": 6.626156749437736e-07, "logits/chosen": -2.295827627182007, "logits/rejected": -1.7228349447250366, "logps/chosen": -426.27410888671875, "logps/rejected": -231.58831787109375, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": 0.130548357963562, "rewards/margins": 2.3856260776519775, "rewards/rejected": -2.255077838897705, "step": 242 }, { "epoch": 1.22, "learning_rate": 6.600871768957473e-07, "logits/chosen": -2.3645496368408203, "logits/rejected": -1.80003821849823, "logps/chosen": -410.95458984375, "logps/rejected": -263.6954650878906, "loss": 0.0915, "rewards/accuracies": 1.0, "rewards/chosen": 0.09567337483167648, "rewards/margins": 2.387070894241333, "rewards/rejected": -2.2913975715637207, "step": 243 }, { "epoch": 1.22, "learning_rate": 6.575541090118104e-07, "logits/chosen": -2.3592305183410645, "logits/rejected": -1.7382762432098389, "logps/chosen": -419.4551086425781, "logps/rejected": -260.99615478515625, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 0.18826624751091003, "rewards/margins": 2.8487932682037354, "rewards/rejected": -2.660527229309082, "step": 244 }, { "epoch": 1.23, "learning_rate": 6.550165436007185e-07, "logits/chosen": -2.277360677719116, "logits/rejected": -1.7717269659042358, "logps/chosen": -336.88677978515625, "logps/rejected": -246.82167053222656, "loss": 0.0934, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.06853103637695312, "rewards/margins": 2.4099345207214355, "rewards/rejected": -2.3414034843444824, "step": 245 }, { "epoch": 1.23, "learning_rate": 6.524745530996136e-07, "logits/chosen": -2.377992868423462, "logits/rejected": -1.7860827445983887, "logps/chosen": -445.5641174316406, "logps/rejected": -264.3681945800781, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 0.05355861037969589, "rewards/margins": 2.47422456741333, "rewards/rejected": -2.420665979385376, "step": 246 }, { "epoch": 1.24, "learning_rate": 6.499282100719557e-07, "logits/chosen": -2.3801891803741455, "logits/rejected": -1.7512145042419434, "logps/chosen": -406.6275939941406, "logps/rejected": -252.7587127685547, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 0.19577689468860626, "rewards/margins": 2.6691811084747314, "rewards/rejected": -2.4734041690826416, "step": 247 }, { "epoch": 1.24, "learning_rate": 6.473775872054521e-07, "logits/chosen": -2.38934326171875, "logits/rejected": -1.7393754720687866, "logps/chosen": -437.0791015625, "logps/rejected": -252.34339904785156, "loss": 0.0724, "rewards/accuracies": 1.0, "rewards/chosen": 0.12121327221393585, "rewards/margins": 2.6064813137054443, "rewards/rejected": -2.4852681159973145, "step": 248 }, { "epoch": 1.25, "learning_rate": 6.448227573099814e-07, "logits/chosen": -2.3685765266418457, "logits/rejected": -1.691235065460205, "logps/chosen": -428.91778564453125, "logps/rejected": -257.01812744140625, "loss": 0.0823, "rewards/accuracies": 1.0, "rewards/chosen": 0.07672246545553207, "rewards/margins": 2.68721342086792, "rewards/rejected": -2.6104910373687744, "step": 249 }, { "epoch": 1.25, "learning_rate": 6.422637933155162e-07, "logits/chosen": -2.3204076290130615, "logits/rejected": -1.7479755878448486, "logps/chosen": -449.3860778808594, "logps/rejected": -251.58949279785156, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 0.17678198218345642, "rewards/margins": 2.7703094482421875, "rewards/rejected": -2.5935275554656982, "step": 250 }, { "epoch": 1.26, "learning_rate": 6.397007682700406e-07, "logits/chosen": -2.3798999786376953, "logits/rejected": -1.766818642616272, "logps/chosen": -426.2226867675781, "logps/rejected": -259.5941162109375, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 0.15167772769927979, "rewards/margins": 2.7991347312927246, "rewards/rejected": -2.6474571228027344, "step": 251 }, { "epoch": 1.26, "learning_rate": 6.371337553374652e-07, "logits/chosen": -2.3331456184387207, "logits/rejected": -1.7717633247375488, "logps/chosen": -421.0743408203125, "logps/rejected": -264.52520751953125, "loss": 0.0724, "rewards/accuracies": 1.0, "rewards/chosen": 0.12990011274814606, "rewards/margins": 2.7025034427642822, "rewards/rejected": -2.572603225708008, "step": 252 }, { "epoch": 1.27, "learning_rate": 6.345628277955384e-07, "logits/chosen": -2.325971841812134, "logits/rejected": -1.7423770427703857, "logps/chosen": -395.7026062011719, "logps/rejected": -242.02496337890625, "loss": 0.0724, "rewards/accuracies": 1.0, "rewards/chosen": 0.13671569526195526, "rewards/margins": 2.6941137313842773, "rewards/rejected": -2.5573978424072266, "step": 253 }, { "epoch": 1.27, "learning_rate": 6.319880590337548e-07, "logits/chosen": -2.3660659790039062, "logits/rejected": -1.7303261756896973, "logps/chosen": -430.8882141113281, "logps/rejected": -261.0942077636719, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 0.13551229238510132, "rewards/margins": 2.7276532649993896, "rewards/rejected": -2.5921409130096436, "step": 254 }, { "epoch": 1.28, "learning_rate": 6.294095225512604e-07, "logits/chosen": -2.335515022277832, "logits/rejected": -1.7695233821868896, "logps/chosen": -396.23291015625, "logps/rejected": -271.73443603515625, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 0.16727346181869507, "rewards/margins": 2.791339874267578, "rewards/rejected": -2.6240663528442383, "step": 255 }, { "epoch": 1.28, "learning_rate": 6.268272919547536e-07, "logits/chosen": -2.3912339210510254, "logits/rejected": -1.7574775218963623, "logps/chosen": -438.34417724609375, "logps/rejected": -261.10003662109375, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 0.14332225918769836, "rewards/margins": 2.791689157485962, "rewards/rejected": -2.648366928100586, "step": 256 }, { "epoch": 1.29, "learning_rate": 6.242414409563854e-07, "logits/chosen": -2.4201464653015137, "logits/rejected": -1.7969746589660645, "logps/chosen": -403.97015380859375, "logps/rejected": -256.91180419921875, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 0.18637695908546448, "rewards/margins": 2.87021541595459, "rewards/rejected": -2.683838367462158, "step": 257 }, { "epoch": 1.29, "learning_rate": 6.216520433716544e-07, "logits/chosen": -2.274958610534668, "logits/rejected": -1.668092966079712, "logps/chosen": -414.37054443359375, "logps/rejected": -237.93179321289062, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 0.19002407789230347, "rewards/margins": 2.686426877975464, "rewards/rejected": -2.4964027404785156, "step": 258 }, { "epoch": 1.3, "learning_rate": 6.190591731172991e-07, "logits/chosen": -2.298405885696411, "logits/rejected": -1.7300269603729248, "logps/chosen": -409.93768310546875, "logps/rejected": -259.57586669921875, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 0.14650052785873413, "rewards/margins": 2.8130640983581543, "rewards/rejected": -2.6665637493133545, "step": 259 }, { "epoch": 1.3, "learning_rate": 6.164629042091893e-07, "logits/chosen": -2.316250801086426, "logits/rejected": -1.729601263999939, "logps/chosen": -442.1346435546875, "logps/rejected": -263.43121337890625, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": 0.21427308022975922, "rewards/margins": 2.916144609451294, "rewards/rejected": -2.701871395111084, "step": 260 }, { "epoch": 1.31, "learning_rate": 6.138633107602122e-07, "logits/chosen": -2.2845702171325684, "logits/rejected": -1.6965854167938232, "logps/chosen": -399.87322998046875, "logps/rejected": -250.21141052246094, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 0.11017303168773651, "rewards/margins": 2.6052889823913574, "rewards/rejected": -2.4951157569885254, "step": 261 }, { "epoch": 1.31, "learning_rate": 6.112604669781572e-07, "logits/chosen": -2.377619504928589, "logits/rejected": -1.783381462097168, "logps/chosen": -416.8450927734375, "logps/rejected": -289.55267333984375, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 0.0772758424282074, "rewards/margins": 2.984128713607788, "rewards/rejected": -2.9068527221679688, "step": 262 }, { "epoch": 1.32, "learning_rate": 6.086544471635973e-07, "logits/chosen": -2.3264455795288086, "logits/rejected": -1.7060034275054932, "logps/chosen": -410.70159912109375, "logps/rejected": -228.84542846679688, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 0.12947770953178406, "rewards/margins": 2.7377753257751465, "rewards/rejected": -2.608297824859619, "step": 263 }, { "epoch": 1.32, "learning_rate": 6.060453257077684e-07, "logits/chosen": -2.379850149154663, "logits/rejected": -1.7486438751220703, "logps/chosen": -428.7838439941406, "logps/rejected": -250.73114013671875, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 0.024870682507753372, "rewards/margins": 2.4543213844299316, "rewards/rejected": -2.4294509887695312, "step": 264 }, { "epoch": 1.33, "learning_rate": 6.034331770904454e-07, "logits/chosen": -2.3978912830352783, "logits/rejected": -1.703794240951538, "logps/chosen": -431.33636474609375, "logps/rejected": -230.14425659179688, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 0.1727997064590454, "rewards/margins": 2.486541748046875, "rewards/rejected": -2.313742160797119, "step": 265 }, { "epoch": 1.34, "learning_rate": 6.008180758778166e-07, "logits/chosen": -2.392954111099243, "logits/rejected": -1.776149868965149, "logps/chosen": -425.32794189453125, "logps/rejected": -263.6713562011719, "loss": 0.0818, "rewards/accuracies": 1.0, "rewards/chosen": 0.10202230513095856, "rewards/margins": 2.727464199066162, "rewards/rejected": -2.6254422664642334, "step": 266 }, { "epoch": 1.34, "learning_rate": 5.982000967203548e-07, "logits/chosen": -2.352900981903076, "logits/rejected": -1.6844431161880493, "logps/chosen": -456.0523681640625, "logps/rejected": -249.87429809570312, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 0.11272863298654556, "rewards/margins": 2.769131660461426, "rewards/rejected": -2.656402826309204, "step": 267 }, { "epoch": 1.35, "learning_rate": 5.955793143506862e-07, "logits/chosen": -2.3868536949157715, "logits/rejected": -1.7029414176940918, "logps/chosen": -466.1007080078125, "logps/rejected": -259.82073974609375, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 0.07507756352424622, "rewards/margins": 2.7306840419769287, "rewards/rejected": -2.655606269836426, "step": 268 }, { "epoch": 1.35, "learning_rate": 5.929558035814574e-07, "logits/chosen": -2.382384777069092, "logits/rejected": -1.7629936933517456, "logps/chosen": -445.67724609375, "logps/rejected": -277.3305969238281, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 0.1937253177165985, "rewards/margins": 3.0656538009643555, "rewards/rejected": -2.8719286918640137, "step": 269 }, { "epoch": 1.36, "learning_rate": 5.903296393031995e-07, "logits/chosen": -2.244957208633423, "logits/rejected": -1.7190240621566772, "logps/chosen": -374.845947265625, "logps/rejected": -244.2581329345703, "loss": 0.0915, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.07157592475414276, "rewards/margins": 2.604867696762085, "rewards/rejected": -2.533291816711426, "step": 270 }, { "epoch": 1.36, "learning_rate": 5.877008964821908e-07, "logits/chosen": -2.407914638519287, "logits/rejected": -1.8057191371917725, "logps/chosen": -404.29547119140625, "logps/rejected": -257.3990478515625, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 0.10843569040298462, "rewards/margins": 2.6466469764709473, "rewards/rejected": -2.5382111072540283, "step": 271 }, { "epoch": 1.37, "learning_rate": 5.850696501583163e-07, "logits/chosen": -2.285498857498169, "logits/rejected": -1.7230048179626465, "logps/chosen": -464.6734619140625, "logps/rejected": -296.6295166015625, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 0.12881064414978027, "rewards/margins": 3.1766433715820312, "rewards/rejected": -3.047832727432251, "step": 272 }, { "epoch": 1.37, "learning_rate": 5.824359754429258e-07, "logits/chosen": -2.297363758087158, "logits/rejected": -1.7457754611968994, "logps/chosen": -397.4841003417969, "logps/rejected": -236.37115478515625, "loss": 0.0752, "rewards/accuracies": 1.0, "rewards/chosen": 0.03496767580509186, "rewards/margins": 2.550320625305176, "rewards/rejected": -2.515352964401245, "step": 273 }, { "epoch": 1.38, "learning_rate": 5.797999475166896e-07, "logits/chosen": -2.378110408782959, "logits/rejected": -1.8216311931610107, "logps/chosen": -397.5712890625, "logps/rejected": -278.0706787109375, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.22290945053100586, "rewards/margins": 2.9911346435546875, "rewards/rejected": -2.7682254314422607, "step": 274 }, { "epoch": 1.38, "learning_rate": 5.771616416274529e-07, "logits/chosen": -2.2767889499664307, "logits/rejected": -1.770648717880249, "logps/chosen": -401.74395751953125, "logps/rejected": -270.3096923828125, "loss": 0.0615, "rewards/accuracies": 1.0, "rewards/chosen": 0.034437812864780426, "rewards/margins": 2.772711992263794, "rewards/rejected": -2.738274335861206, "step": 275 }, { "epoch": 1.39, "learning_rate": 5.745211330880872e-07, "logits/chosen": -2.349541664123535, "logits/rejected": -1.719818115234375, "logps/chosen": -458.4740905761719, "logps/rejected": -278.17041015625, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 0.21464590728282928, "rewards/margins": 3.1903936862945557, "rewards/rejected": -2.97574782371521, "step": 276 }, { "epoch": 1.39, "learning_rate": 5.718784972743409e-07, "logits/chosen": -2.3479745388031006, "logits/rejected": -1.7935783863067627, "logps/chosen": -427.6744689941406, "logps/rejected": -272.4552001953125, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": 0.1144818663597107, "rewards/margins": 2.920607089996338, "rewards/rejected": -2.8061249256134033, "step": 277 }, { "epoch": 1.4, "learning_rate": 5.69233809622687e-07, "logits/chosen": -2.3434205055236816, "logits/rejected": -1.7131800651550293, "logps/chosen": -396.79656982421875, "logps/rejected": -229.24725341796875, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 0.022773995995521545, "rewards/margins": 2.510826826095581, "rewards/rejected": -2.4880528450012207, "step": 278 }, { "epoch": 1.4, "learning_rate": 5.665871456281706e-07, "logits/chosen": -2.364231586456299, "logits/rejected": -1.7683095932006836, "logps/chosen": -445.9037780761719, "logps/rejected": -272.45947265625, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.25256627798080444, "rewards/margins": 3.2530884742736816, "rewards/rejected": -3.0005221366882324, "step": 279 }, { "epoch": 1.41, "learning_rate": 5.63938580842253e-07, "logits/chosen": -2.355039119720459, "logits/rejected": -1.7931667566299438, "logps/chosen": -417.5256042480469, "logps/rejected": -249.78610229492188, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 0.11699780076742172, "rewards/margins": 2.7330501079559326, "rewards/rejected": -2.6160521507263184, "step": 280 }, { "epoch": 1.41, "learning_rate": 5.612881908706555e-07, "logits/chosen": -2.3476576805114746, "logits/rejected": -1.7782140970230103, "logps/chosen": -412.81756591796875, "logps/rejected": -250.68936157226562, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 0.1952303647994995, "rewards/margins": 2.8541109561920166, "rewards/rejected": -2.6588802337646484, "step": 281 }, { "epoch": 1.42, "learning_rate": 5.586360513712009e-07, "logits/chosen": -2.306473731994629, "logits/rejected": -1.7217028141021729, "logps/chosen": -420.11700439453125, "logps/rejected": -272.59210205078125, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.07007751613855362, "rewards/margins": 2.9052658081054688, "rewards/rejected": -2.835188388824463, "step": 282 }, { "epoch": 1.42, "learning_rate": 5.559822380516539e-07, "logits/chosen": -2.3847508430480957, "logits/rejected": -1.7739917039871216, "logps/chosen": -440.3993225097656, "logps/rejected": -250.36312866210938, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 0.11770935356616974, "rewards/margins": 2.798539638519287, "rewards/rejected": -2.680830240249634, "step": 283 }, { "epoch": 1.43, "learning_rate": 5.533268266675601e-07, "logits/chosen": -2.3561079502105713, "logits/rejected": -1.7553430795669556, "logps/chosen": -400.4366455078125, "logps/rejected": -256.1666564941406, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 0.16530686616897583, "rewards/margins": 2.8094210624694824, "rewards/rejected": -2.6441140174865723, "step": 284 }, { "epoch": 1.43, "learning_rate": 5.50669893020083e-07, "logits/chosen": -2.3829970359802246, "logits/rejected": -1.7772233486175537, "logps/chosen": -449.0572509765625, "logps/rejected": -258.5106506347656, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 0.12428157031536102, "rewards/margins": 3.189300537109375, "rewards/rejected": -3.065019130706787, "step": 285 }, { "epoch": 1.44, "learning_rate": 5.480115129538409e-07, "logits/chosen": -2.39876651763916, "logits/rejected": -1.728998064994812, "logps/chosen": -424.659912109375, "logps/rejected": -250.39572143554688, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": 0.1672505885362625, "rewards/margins": 2.98520827293396, "rewards/rejected": -2.8179574012756348, "step": 286 }, { "epoch": 1.44, "learning_rate": 5.453517623547411e-07, "logits/chosen": -2.408125638961792, "logits/rejected": -1.7963461875915527, "logps/chosen": -420.00238037109375, "logps/rejected": -273.04461669921875, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.20011571049690247, "rewards/margins": 3.1372787952423096, "rewards/rejected": -2.9371631145477295, "step": 287 }, { "epoch": 1.45, "learning_rate": 5.426907171478142e-07, "logits/chosen": -2.382516384124756, "logits/rejected": -1.723081111907959, "logps/chosen": -436.1118469238281, "logps/rejected": -264.4705505371094, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.23992463946342468, "rewards/margins": 3.091334819793701, "rewards/rejected": -2.851410150527954, "step": 288 }, { "epoch": 1.45, "learning_rate": 5.400284532950467e-07, "logits/chosen": -2.309596061706543, "logits/rejected": -1.6997047662734985, "logps/chosen": -462.1573181152344, "logps/rejected": -250.8166961669922, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 0.2242269068956375, "rewards/margins": 3.0380609035491943, "rewards/rejected": -2.8138341903686523, "step": 289 }, { "epoch": 1.46, "learning_rate": 5.373650467932121e-07, "logits/chosen": -2.291428565979004, "logits/rejected": -1.7298626899719238, "logps/chosen": -407.12945556640625, "logps/rejected": -242.7195281982422, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": 0.0792006254196167, "rewards/margins": 2.804159641265869, "rewards/rejected": -2.724959373474121, "step": 290 }, { "epoch": 1.46, "learning_rate": 5.347005736717023e-07, "logits/chosen": -2.308332681655884, "logits/rejected": -1.7228946685791016, "logps/chosen": -433.12841796875, "logps/rejected": -277.14178466796875, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 0.1475720852613449, "rewards/margins": 3.250002861022949, "rewards/rejected": -3.1024303436279297, "step": 291 }, { "epoch": 1.47, "learning_rate": 5.320351099903565e-07, "logits/chosen": -2.372591972351074, "logits/rejected": -1.742047905921936, "logps/chosen": -401.8935546875, "logps/rejected": -261.46240234375, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.09059321135282516, "rewards/margins": 3.019700050354004, "rewards/rejected": -2.9291067123413086, "step": 292 }, { "epoch": 1.47, "learning_rate": 5.293687318372906e-07, "logits/chosen": -2.292476177215576, "logits/rejected": -1.7819393873214722, "logps/chosen": -402.5439147949219, "logps/rejected": -263.81170654296875, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.08270744979381561, "rewards/margins": 2.931674003601074, "rewards/rejected": -2.848966360092163, "step": 293 }, { "epoch": 1.48, "learning_rate": 5.267015153267245e-07, "logits/chosen": -2.3920092582702637, "logits/rejected": -1.8131598234176636, "logps/chosen": -405.3302307128906, "logps/rejected": -277.8349609375, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 0.04517810046672821, "rewards/margins": 2.9974071979522705, "rewards/rejected": -2.9522290229797363, "step": 294 }, { "epoch": 1.48, "learning_rate": 5.240335365968104e-07, "logits/chosen": -2.3056249618530273, "logits/rejected": -1.6620099544525146, "logps/chosen": -441.333251953125, "logps/rejected": -255.95648193359375, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 0.1791486144065857, "rewards/margins": 3.12168550491333, "rewards/rejected": -2.9425368309020996, "step": 295 }, { "epoch": 1.49, "learning_rate": 5.213648718074583e-07, "logits/chosen": -2.2956366539001465, "logits/rejected": -1.7464789152145386, "logps/chosen": -432.6912841796875, "logps/rejected": -263.73828125, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": 0.12890371680259705, "rewards/margins": 3.010897159576416, "rewards/rejected": -2.881993293762207, "step": 296 }, { "epoch": 1.49, "learning_rate": 5.18695597138163e-07, "logits/chosen": -2.2530694007873535, "logits/rejected": -1.6632601022720337, "logps/chosen": -439.92645263671875, "logps/rejected": -246.80950927734375, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 0.1034320816397667, "rewards/margins": 2.9011268615722656, "rewards/rejected": -2.797694444656372, "step": 297 }, { "epoch": 1.5, "learning_rate": 5.160257887858277e-07, "logits/chosen": -2.302536725997925, "logits/rejected": -1.7609385251998901, "logps/chosen": -420.3564453125, "logps/rejected": -258.6732177734375, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 0.1590297967195511, "rewards/margins": 3.0309081077575684, "rewards/rejected": -2.871878147125244, "step": 298 }, { "epoch": 1.5, "learning_rate": 5.13355522962591e-07, "logits/chosen": -2.2845797538757324, "logits/rejected": -1.785184383392334, "logps/chosen": -408.77899169921875, "logps/rejected": -253.6124267578125, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 0.12409694492816925, "rewards/margins": 2.9676055908203125, "rewards/rejected": -2.8435089588165283, "step": 299 }, { "epoch": 1.51, "learning_rate": 5.106848758936507e-07, "logits/chosen": -2.3593192100524902, "logits/rejected": -1.6951512098312378, "logps/chosen": -444.5079345703125, "logps/rejected": -251.72247314453125, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 0.20827993750572205, "rewards/margins": 3.170576810836792, "rewards/rejected": -2.962296724319458, "step": 300 }, { "epoch": 1.51, "learning_rate": 5.080139238150869e-07, "logits/chosen": -2.3523402214050293, "logits/rejected": -1.7351994514465332, "logps/chosen": -416.8442077636719, "logps/rejected": -248.22515869140625, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 0.20718206465244293, "rewards/margins": 3.155034065246582, "rewards/rejected": -2.9478518962860107, "step": 301 }, { "epoch": 1.52, "learning_rate": 5.053427429716866e-07, "logits/chosen": -2.330598831176758, "logits/rejected": -1.785291314125061, "logps/chosen": -463.30560302734375, "logps/rejected": -283.8675537109375, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 0.19924774765968323, "rewards/margins": 3.081430435180664, "rewards/rejected": -2.8821825981140137, "step": 302 }, { "epoch": 1.52, "learning_rate": 5.026714096147673e-07, "logits/chosen": -2.363349437713623, "logits/rejected": -1.7406888008117676, "logps/chosen": -405.87457275390625, "logps/rejected": -268.89129638671875, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": 0.2788078486919403, "rewards/margins": 3.3024039268493652, "rewards/rejected": -3.0235958099365234, "step": 303 }, { "epoch": 1.53, "learning_rate": 5e-07, "logits/chosen": -2.354295492172241, "logits/rejected": -1.7695475816726685, "logps/chosen": -438.3371276855469, "logps/rejected": -293.65362548828125, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.24793320894241333, "rewards/margins": 3.261539936065674, "rewards/rejected": -3.0136067867279053, "step": 304 }, { "epoch": 1.53, "learning_rate": 4.973285903852328e-07, "logits/chosen": -2.3633718490600586, "logits/rejected": -1.758758544921875, "logps/chosen": -430.4385681152344, "logps/rejected": -260.6502685546875, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 0.2826113700866699, "rewards/margins": 3.223409652709961, "rewards/rejected": -2.940798282623291, "step": 305 }, { "epoch": 1.54, "learning_rate": 4.946572570283134e-07, "logits/chosen": -2.273345470428467, "logits/rejected": -1.7033841609954834, "logps/chosen": -461.306640625, "logps/rejected": -279.2411193847656, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 0.10287807136774063, "rewards/margins": 3.2181012630462646, "rewards/rejected": -3.1152231693267822, "step": 306 }, { "epoch": 1.54, "learning_rate": 4.919860761849131e-07, "logits/chosen": -2.36112642288208, "logits/rejected": -1.7409255504608154, "logps/chosen": -388.01678466796875, "logps/rejected": -258.4703063964844, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.17219112813472748, "rewards/margins": 3.092069625854492, "rewards/rejected": -2.9198784828186035, "step": 307 }, { "epoch": 1.55, "learning_rate": 4.893151241063493e-07, "logits/chosen": -2.327737331390381, "logits/rejected": -1.7158126831054688, "logps/chosen": -470.7218017578125, "logps/rejected": -247.77761840820312, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 0.21137367188930511, "rewards/margins": 3.2304582595825195, "rewards/rejected": -3.0190844535827637, "step": 308 }, { "epoch": 1.55, "learning_rate": 4.86644477037409e-07, "logits/chosen": -2.353895664215088, "logits/rejected": -1.7294992208480835, "logps/chosen": -405.198486328125, "logps/rejected": -243.98597717285156, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": 0.22426682710647583, "rewards/margins": 2.9473352432250977, "rewards/rejected": -2.7230684757232666, "step": 309 }, { "epoch": 1.56, "learning_rate": 4.839742112141724e-07, "logits/chosen": -2.351862668991089, "logits/rejected": -1.7356750965118408, "logps/chosen": -416.07342529296875, "logps/rejected": -279.2021789550781, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.08277104049921036, "rewards/margins": 3.141765594482422, "rewards/rejected": -3.058994770050049, "step": 310 }, { "epoch": 1.56, "learning_rate": 4.813044028618372e-07, "logits/chosen": -2.301844596862793, "logits/rejected": -1.700169563293457, "logps/chosen": -431.1022033691406, "logps/rejected": -253.8983154296875, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 0.23116430640220642, "rewards/margins": 3.1452701091766357, "rewards/rejected": -2.9141054153442383, "step": 311 }, { "epoch": 1.57, "learning_rate": 4.786351281925416e-07, "logits/chosen": -2.309739112854004, "logits/rejected": -1.768857717514038, "logps/chosen": -407.50958251953125, "logps/rejected": -249.8413543701172, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.14258499443531036, "rewards/margins": 2.8647007942199707, "rewards/rejected": -2.722115993499756, "step": 312 }, { "epoch": 1.57, "learning_rate": 4.7596646340318967e-07, "logits/chosen": -2.3405656814575195, "logits/rejected": -1.7431800365447998, "logps/chosen": -406.21142578125, "logps/rejected": -274.1960144042969, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 0.11213354766368866, "rewards/margins": 3.446531295776367, "rewards/rejected": -3.334397792816162, "step": 313 }, { "epoch": 1.58, "learning_rate": 4.732984846732755e-07, "logits/chosen": -2.3828063011169434, "logits/rejected": -1.7673869132995605, "logps/chosen": -428.1217956542969, "logps/rejected": -265.07373046875, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 0.09230398386716843, "rewards/margins": 3.3259596824645996, "rewards/rejected": -3.2336556911468506, "step": 314 }, { "epoch": 1.58, "learning_rate": 4.706312681627096e-07, "logits/chosen": -2.336902141571045, "logits/rejected": -1.7377604246139526, "logps/chosen": -417.0897216796875, "logps/rejected": -246.1240234375, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 0.2234901487827301, "rewards/margins": 3.1882638931274414, "rewards/rejected": -2.9647738933563232, "step": 315 }, { "epoch": 1.59, "learning_rate": 4.6796489000964354e-07, "logits/chosen": -2.3566503524780273, "logits/rejected": -1.767578363418579, "logps/chosen": -393.03955078125, "logps/rejected": -258.19232177734375, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 0.0734606459736824, "rewards/margins": 3.2001118659973145, "rewards/rejected": -3.1266512870788574, "step": 316 }, { "epoch": 1.59, "learning_rate": 4.6529942632829785e-07, "logits/chosen": -2.3761370182037354, "logits/rejected": -1.81003737449646, "logps/chosen": -410.22076416015625, "logps/rejected": -287.153076171875, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 0.10380210727453232, "rewards/margins": 3.2886857986450195, "rewards/rejected": -3.1848838329315186, "step": 317 }, { "epoch": 1.6, "learning_rate": 4.626349532067879e-07, "logits/chosen": -2.2825796604156494, "logits/rejected": -1.7708665132522583, "logps/chosen": -396.02227783203125, "logps/rejected": -247.07977294921875, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 0.042575329542160034, "rewards/margins": 2.7145297527313232, "rewards/rejected": -2.67195463180542, "step": 318 }, { "epoch": 1.6, "learning_rate": 4.5997154670495334e-07, "logits/chosen": -2.3446130752563477, "logits/rejected": -1.767841100692749, "logps/chosen": -402.7386169433594, "logps/rejected": -250.5200958251953, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 0.24670158326625824, "rewards/margins": 3.309128761291504, "rewards/rejected": -3.062427043914795, "step": 319 }, { "epoch": 1.61, "learning_rate": 4.5730928285218566e-07, "logits/chosen": -2.3448667526245117, "logits/rejected": -1.7214677333831787, "logps/chosen": -381.89801025390625, "logps/rejected": -257.2233581542969, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -0.016006598249077797, "rewards/margins": 2.952582836151123, "rewards/rejected": -2.9685893058776855, "step": 320 }, { "epoch": 1.61, "learning_rate": 4.5464823764525887e-07, "logits/chosen": -2.402895450592041, "logits/rejected": -1.777315378189087, "logps/chosen": -427.9627685546875, "logps/rejected": -279.32366943359375, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 0.21710562705993652, "rewards/margins": 3.4619503021240234, "rewards/rejected": -3.244844436645508, "step": 321 }, { "epoch": 1.62, "learning_rate": 4.519884870461591e-07, "logits/chosen": -2.37373685836792, "logits/rejected": -1.7873560190200806, "logps/chosen": -406.30108642578125, "logps/rejected": -263.6998596191406, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 0.11645203083753586, "rewards/margins": 3.1198477745056152, "rewards/rejected": -3.0033957958221436, "step": 322 }, { "epoch": 1.62, "learning_rate": 4.4933010697991704e-07, "logits/chosen": -2.340186595916748, "logits/rejected": -1.7729861736297607, "logps/chosen": -426.29840087890625, "logps/rejected": -253.99264526367188, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 0.2557070553302765, "rewards/margins": 3.3570449352264404, "rewards/rejected": -3.101337432861328, "step": 323 }, { "epoch": 1.63, "learning_rate": 4.466731733324399e-07, "logits/chosen": -2.374258041381836, "logits/rejected": -1.7368574142456055, "logps/chosen": -437.7850341796875, "logps/rejected": -259.38494873046875, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 0.0602007657289505, "rewards/margins": 3.178605556488037, "rewards/rejected": -3.1184048652648926, "step": 324 }, { "epoch": 1.63, "learning_rate": 4.4401776194834603e-07, "logits/chosen": -2.386868953704834, "logits/rejected": -1.7495367527008057, "logps/chosen": -418.1893615722656, "logps/rejected": -261.65740966796875, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 0.2164706289768219, "rewards/margins": 3.2443623542785645, "rewards/rejected": -3.0278921127319336, "step": 325 }, { "epoch": 1.64, "learning_rate": 4.413639486287991e-07, "logits/chosen": -2.352489471435547, "logits/rejected": -1.7001850605010986, "logps/chosen": -365.7145080566406, "logps/rejected": -249.81492614746094, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": -0.006723526865243912, "rewards/margins": 2.9177966117858887, "rewards/rejected": -2.9245200157165527, "step": 326 }, { "epoch": 1.64, "learning_rate": 4.3871180912934456e-07, "logits/chosen": -2.3094587326049805, "logits/rejected": -1.6824119091033936, "logps/chosen": -405.4281921386719, "logps/rejected": -256.5493469238281, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 0.10220667719841003, "rewards/margins": 3.1907010078430176, "rewards/rejected": -3.088494300842285, "step": 327 }, { "epoch": 1.65, "learning_rate": 4.3606141915774693e-07, "logits/chosen": -2.394684314727783, "logits/rejected": -1.7109336853027344, "logps/chosen": -441.2231750488281, "logps/rejected": -262.58721923828125, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 0.25229036808013916, "rewards/margins": 3.5330824851989746, "rewards/rejected": -3.280792236328125, "step": 328 }, { "epoch": 1.65, "learning_rate": 4.3341285437182946e-07, "logits/chosen": -2.423304319381714, "logits/rejected": -1.7291061878204346, "logps/chosen": -422.4158020019531, "logps/rejected": -235.93687438964844, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 0.12351532280445099, "rewards/margins": 2.9314589500427246, "rewards/rejected": -2.807943820953369, "step": 329 }, { "epoch": 1.66, "learning_rate": 4.3076619037731287e-07, "logits/chosen": -2.3090338706970215, "logits/rejected": -1.718928337097168, "logps/chosen": -402.37005615234375, "logps/rejected": -263.482666015625, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 0.1976829469203949, "rewards/margins": 3.408734083175659, "rewards/rejected": -3.2110514640808105, "step": 330 }, { "epoch": 1.66, "learning_rate": 4.2812150272565915e-07, "logits/chosen": -2.3599648475646973, "logits/rejected": -1.7350865602493286, "logps/chosen": -416.153076171875, "logps/rejected": -258.5077209472656, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 0.1920926570892334, "rewards/margins": 3.396846055984497, "rewards/rejected": -3.2047533988952637, "step": 331 }, { "epoch": 1.67, "learning_rate": 4.254788669119127e-07, "logits/chosen": -2.305109977722168, "logits/rejected": -1.7331202030181885, "logps/chosen": -409.9639892578125, "logps/rejected": -261.8056640625, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 0.07724406570196152, "rewards/margins": 3.2732481956481934, "rewards/rejected": -3.1960041522979736, "step": 332 }, { "epoch": 1.67, "learning_rate": 4.2283835837254713e-07, "logits/chosen": -2.2761332988739014, "logits/rejected": -1.745134711265564, "logps/chosen": -376.0856018066406, "logps/rejected": -271.1671142578125, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 0.051339730620384216, "rewards/margins": 3.29599666595459, "rewards/rejected": -3.244657039642334, "step": 333 }, { "epoch": 1.68, "learning_rate": 4.202000524833105e-07, "logits/chosen": -2.3424808979034424, "logits/rejected": -1.7437753677368164, "logps/chosen": -386.6900939941406, "logps/rejected": -263.4174499511719, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.06066500395536423, "rewards/margins": 3.2420220375061035, "rewards/rejected": -3.1813571453094482, "step": 334 }, { "epoch": 1.68, "learning_rate": 4.1756402455707417e-07, "logits/chosen": -2.2433695793151855, "logits/rejected": -1.6789488792419434, "logps/chosen": -401.126708984375, "logps/rejected": -207.55435180664062, "loss": 0.0648, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.12278825044631958, "rewards/margins": 2.7207870483398438, "rewards/rejected": -2.59799861907959, "step": 335 }, { "epoch": 1.69, "learning_rate": 4.1493034984168374e-07, "logits/chosen": -2.4209156036376953, "logits/rejected": -1.809239149093628, "logps/chosen": -393.9102783203125, "logps/rejected": -271.1549987792969, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 0.15670573711395264, "rewards/margins": 3.5317249298095703, "rewards/rejected": -3.3750195503234863, "step": 336 }, { "epoch": 1.69, "learning_rate": 4.122991035178092e-07, "logits/chosen": -2.33018159866333, "logits/rejected": -1.7070420980453491, "logps/chosen": -419.5726318359375, "logps/rejected": -255.0614471435547, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 0.13375766575336456, "rewards/margins": 3.049534797668457, "rewards/rejected": -2.9157769680023193, "step": 337 }, { "epoch": 1.7, "learning_rate": 4.096703606968006e-07, "logits/chosen": -2.3205504417419434, "logits/rejected": -1.77838134765625, "logps/chosen": -427.19952392578125, "logps/rejected": -264.96087646484375, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 0.06373012065887451, "rewards/margins": 3.2497427463531494, "rewards/rejected": -3.1860125064849854, "step": 338 }, { "epoch": 1.7, "learning_rate": 4.070441964185427e-07, "logits/chosen": -2.3165786266326904, "logits/rejected": -1.7180248498916626, "logps/chosen": -420.74285888671875, "logps/rejected": -261.3934020996094, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": 0.03465627133846283, "rewards/margins": 3.2158708572387695, "rewards/rejected": -3.1812145709991455, "step": 339 }, { "epoch": 1.71, "learning_rate": 4.044206856493139e-07, "logits/chosen": -2.3411905765533447, "logits/rejected": -1.7688506841659546, "logps/chosen": -418.6695251464844, "logps/rejected": -278.01611328125, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 0.13454487919807434, "rewards/margins": 3.3236734867095947, "rewards/rejected": -3.1891286373138428, "step": 340 }, { "epoch": 1.71, "learning_rate": 4.0179990327964526e-07, "logits/chosen": -2.370434522628784, "logits/rejected": -1.740971326828003, "logps/chosen": -449.83453369140625, "logps/rejected": -278.0010986328125, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 0.17094650864601135, "rewards/margins": 3.5012459754943848, "rewards/rejected": -3.3302993774414062, "step": 341 }, { "epoch": 1.72, "learning_rate": 3.991819241221835e-07, "logits/chosen": -2.2941460609436035, "logits/rejected": -1.7165008783340454, "logps/chosen": -408.765380859375, "logps/rejected": -267.74462890625, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 0.12336846441030502, "rewards/margins": 3.3327622413635254, "rewards/rejected": -3.2093939781188965, "step": 342 }, { "epoch": 1.72, "learning_rate": 3.965668229095546e-07, "logits/chosen": -2.3572726249694824, "logits/rejected": -1.7265605926513672, "logps/chosen": -424.3651428222656, "logps/rejected": -257.20880126953125, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 0.15230002999305725, "rewards/margins": 3.3892860412597656, "rewards/rejected": -3.2369861602783203, "step": 343 }, { "epoch": 1.73, "learning_rate": 3.9395467429223173e-07, "logits/chosen": -2.425060749053955, "logits/rejected": -1.7491129636764526, "logps/chosen": -428.16815185546875, "logps/rejected": -267.23199462890625, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 0.2022600769996643, "rewards/margins": 3.428938865661621, "rewards/rejected": -3.2266788482666016, "step": 344 }, { "epoch": 1.73, "learning_rate": 3.9134555283640266e-07, "logits/chosen": -2.317636489868164, "logits/rejected": -1.7163525819778442, "logps/chosen": -400.28192138671875, "logps/rejected": -258.9980773925781, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": 0.019412994384765625, "rewards/margins": 3.149421453475952, "rewards/rejected": -3.1300086975097656, "step": 345 }, { "epoch": 1.74, "learning_rate": 3.8873953302184283e-07, "logits/chosen": -2.366058588027954, "logits/rejected": -1.7056124210357666, "logps/chosen": -408.33740234375, "logps/rejected": -256.6706237792969, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 0.28778648376464844, "rewards/margins": 3.3904972076416016, "rewards/rejected": -3.102710723876953, "step": 346 }, { "epoch": 1.74, "learning_rate": 3.8613668923978777e-07, "logits/chosen": -2.3611273765563965, "logits/rejected": -1.836683988571167, "logps/chosen": -421.1358337402344, "logps/rejected": -276.04974365234375, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 0.18613383173942566, "rewards/margins": 3.210736036300659, "rewards/rejected": -3.024601936340332, "step": 347 }, { "epoch": 1.75, "learning_rate": 3.8353709579081077e-07, "logits/chosen": -2.3924667835235596, "logits/rejected": -1.7394758462905884, "logps/chosen": -423.8448181152344, "logps/rejected": -259.02093505859375, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023503582924604416, "rewards/margins": 3.1649463176727295, "rewards/rejected": -3.167296886444092, "step": 348 }, { "epoch": 1.75, "learning_rate": 3.8094082688270087e-07, "logits/chosen": -2.336930990219116, "logits/rejected": -1.7387880086898804, "logps/chosen": -439.75323486328125, "logps/rejected": -263.6993408203125, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 0.14981944859027863, "rewards/margins": 3.3548636436462402, "rewards/rejected": -3.2050440311431885, "step": 349 }, { "epoch": 1.76, "learning_rate": 3.7834795662834566e-07, "logits/chosen": -2.3640592098236084, "logits/rejected": -1.7369496822357178, "logps/chosen": -413.7544860839844, "logps/rejected": -280.06072998046875, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 0.13715311884880066, "rewards/margins": 3.5521230697631836, "rewards/rejected": -3.4149699211120605, "step": 350 }, { "epoch": 1.76, "learning_rate": 3.757585590436144e-07, "logits/chosen": -2.2869207859039307, "logits/rejected": -1.7643429040908813, "logps/chosen": -390.6207275390625, "logps/rejected": -247.9221954345703, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 0.11548131704330444, "rewards/margins": 3.004934787750244, "rewards/rejected": -2.889453411102295, "step": 351 }, { "epoch": 1.77, "learning_rate": 3.7317270804524636e-07, "logits/chosen": -2.3762760162353516, "logits/rejected": -1.7363593578338623, "logps/chosen": -407.7137451171875, "logps/rejected": -247.53067016601562, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 0.09606197476387024, "rewards/margins": 3.135126829147339, "rewards/rejected": -3.039064884185791, "step": 352 }, { "epoch": 1.77, "learning_rate": 3.7059047744873955e-07, "logits/chosen": -2.3638484477996826, "logits/rejected": -1.769869089126587, "logps/chosen": -390.258056640625, "logps/rejected": -263.0291442871094, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 0.11095275729894638, "rewards/margins": 3.400367021560669, "rewards/rejected": -3.289414167404175, "step": 353 }, { "epoch": 1.78, "learning_rate": 3.680119409662451e-07, "logits/chosen": -2.330995559692383, "logits/rejected": -1.7148663997650146, "logps/chosen": -392.270751953125, "logps/rejected": -245.05865478515625, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 0.03439120948314667, "rewards/margins": 3.1898043155670166, "rewards/rejected": -3.1554131507873535, "step": 354 }, { "epoch": 1.78, "learning_rate": 3.6543717220446156e-07, "logits/chosen": -2.3132553100585938, "logits/rejected": -1.7082587480545044, "logps/chosen": -378.57147216796875, "logps/rejected": -241.52328491210938, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.016113154590129852, "rewards/margins": 3.0402169227600098, "rewards/rejected": -3.0241036415100098, "step": 355 }, { "epoch": 1.79, "learning_rate": 3.628662446625349e-07, "logits/chosen": -2.3655269145965576, "logits/rejected": -1.8089683055877686, "logps/chosen": -391.21917724609375, "logps/rejected": -279.35479736328125, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 0.20484544336795807, "rewards/margins": 3.471787691116333, "rewards/rejected": -3.2669425010681152, "step": 356 }, { "epoch": 1.79, "learning_rate": 3.6029923172995937e-07, "logits/chosen": -2.300511121749878, "logits/rejected": -1.7091872692108154, "logps/chosen": -424.1812438964844, "logps/rejected": -258.38018798828125, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 0.1543935239315033, "rewards/margins": 3.3684282302856445, "rewards/rejected": -3.2140345573425293, "step": 357 }, { "epoch": 1.8, "learning_rate": 3.577362066844838e-07, "logits/chosen": -2.381406307220459, "logits/rejected": -1.7363593578338623, "logps/chosen": -378.59637451171875, "logps/rejected": -252.83575439453125, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": -0.03738619014620781, "rewards/margins": 3.0389394760131836, "rewards/rejected": -3.0763258934020996, "step": 358 }, { "epoch": 1.8, "learning_rate": 3.551772426900185e-07, "logits/chosen": -2.3532145023345947, "logits/rejected": -1.7703391313552856, "logps/chosen": -411.0793762207031, "logps/rejected": -255.93658447265625, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 0.2275845855474472, "rewards/margins": 3.5406172275543213, "rewards/rejected": -3.313032627105713, "step": 359 }, { "epoch": 1.81, "learning_rate": 3.526224127945478e-07, "logits/chosen": -2.331430196762085, "logits/rejected": -1.7987477779388428, "logps/chosen": -381.6449279785156, "logps/rejected": -268.12896728515625, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 0.23906148970127106, "rewards/margins": 3.395420551300049, "rewards/rejected": -3.1563591957092285, "step": 360 }, { "epoch": 1.81, "learning_rate": 3.5007178992804417e-07, "logits/chosen": -2.36647891998291, "logits/rejected": -1.6790127754211426, "logps/chosen": -413.47174072265625, "logps/rejected": -249.9410400390625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 0.13434958457946777, "rewards/margins": 3.4697561264038086, "rewards/rejected": -3.33540678024292, "step": 361 }, { "epoch": 1.82, "learning_rate": 3.4752544690038643e-07, "logits/chosen": -2.373753547668457, "logits/rejected": -1.7782487869262695, "logps/chosen": -431.5639343261719, "logps/rejected": -254.77743530273438, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 0.16176021099090576, "rewards/margins": 3.1825475692749023, "rewards/rejected": -3.0207877159118652, "step": 362 }, { "epoch": 1.82, "learning_rate": 3.449834563992816e-07, "logits/chosen": -2.3641390800476074, "logits/rejected": -1.7513372898101807, "logps/chosen": -427.12017822265625, "logps/rejected": -290.6725158691406, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 0.07637812942266464, "rewards/margins": 3.536339282989502, "rewards/rejected": -3.459961414337158, "step": 363 }, { "epoch": 1.83, "learning_rate": 3.4244589098818963e-07, "logits/chosen": -2.3725316524505615, "logits/rejected": -1.7579982280731201, "logps/chosen": -416.9312744140625, "logps/rejected": -261.4740295410156, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": 0.22733309864997864, "rewards/margins": 3.6392250061035156, "rewards/rejected": -3.4118919372558594, "step": 364 }, { "epoch": 1.83, "learning_rate": 3.399128231042527e-07, "logits/chosen": -2.306899070739746, "logits/rejected": -1.6892657279968262, "logps/chosen": -408.30120849609375, "logps/rejected": -266.2808837890625, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.04006894305348396, "rewards/margins": 3.2564191818237305, "rewards/rejected": -3.296487808227539, "step": 365 }, { "epoch": 1.84, "learning_rate": 3.373843250562265e-07, "logits/chosen": -2.4144420623779297, "logits/rejected": -1.80048668384552, "logps/chosen": -439.11138916015625, "logps/rejected": -270.07037353515625, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 0.17007218301296234, "rewards/margins": 3.4678196907043457, "rewards/rejected": -3.2977473735809326, "step": 366 }, { "epoch": 1.84, "learning_rate": 3.348604690224166e-07, "logits/chosen": -2.3663125038146973, "logits/rejected": -1.7858319282531738, "logps/chosen": -387.385986328125, "logps/rejected": -273.0640563964844, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 0.09575895220041275, "rewards/margins": 3.5851335525512695, "rewards/rejected": -3.4893746376037598, "step": 367 }, { "epoch": 1.85, "learning_rate": 3.3234132704861786e-07, "logits/chosen": -2.36065673828125, "logits/rejected": -1.691451907157898, "logps/chosen": -408.43878173828125, "logps/rejected": -243.265380859375, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 0.16299033164978027, "rewards/margins": 3.305713176727295, "rewards/rejected": -3.1427228450775146, "step": 368 }, { "epoch": 1.85, "learning_rate": 3.298269710460585e-07, "logits/chosen": -2.3416857719421387, "logits/rejected": -1.8144396543502808, "logps/chosen": -386.6210632324219, "logps/rejected": -278.87841796875, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 0.12227732688188553, "rewards/margins": 3.5494277477264404, "rewards/rejected": -3.427150249481201, "step": 369 }, { "epoch": 1.86, "learning_rate": 3.2731747278934623e-07, "logits/chosen": -2.3129501342773438, "logits/rejected": -1.751501202583313, "logps/chosen": -376.982666015625, "logps/rejected": -268.9819641113281, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 0.07297618687152863, "rewards/margins": 3.662343978881836, "rewards/rejected": -3.5893681049346924, "step": 370 }, { "epoch": 1.86, "learning_rate": 3.248129039144203e-07, "logits/chosen": -2.3683881759643555, "logits/rejected": -1.819028615951538, "logps/chosen": -431.4526062011719, "logps/rejected": -267.4171142578125, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 0.10969366133213043, "rewards/margins": 3.296834945678711, "rewards/rejected": -3.1871414184570312, "step": 371 }, { "epoch": 1.87, "learning_rate": 3.223133359165056e-07, "logits/chosen": -2.3332748413085938, "logits/rejected": -1.6956508159637451, "logps/chosen": -421.7422790527344, "logps/rejected": -262.87353515625, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 0.1209205761551857, "rewards/margins": 3.601036310195923, "rewards/rejected": -3.4801156520843506, "step": 372 }, { "epoch": 1.87, "learning_rate": 3.1981884014807337e-07, "logits/chosen": -2.3548624515533447, "logits/rejected": -1.7960623502731323, "logps/chosen": -430.720947265625, "logps/rejected": -301.63446044921875, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 0.12174580991268158, "rewards/margins": 3.8920111656188965, "rewards/rejected": -3.7702651023864746, "step": 373 }, { "epoch": 1.88, "learning_rate": 3.173294878168025e-07, "logits/chosen": -2.3556814193725586, "logits/rejected": -1.7514936923980713, "logps/chosen": -405.23406982421875, "logps/rejected": -262.18780517578125, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 0.14618530869483948, "rewards/margins": 3.6190123558044434, "rewards/rejected": -3.472827196121216, "step": 374 }, { "epoch": 1.88, "learning_rate": 3.148453499835483e-07, "logits/chosen": -2.3198795318603516, "logits/rejected": -1.7371431589126587, "logps/chosen": -430.19927978515625, "logps/rejected": -249.78765869140625, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.04714508354663849, "rewards/margins": 3.298361301422119, "rewards/rejected": -3.2512168884277344, "step": 375 }, { "epoch": 1.89, "learning_rate": 3.1236649756031297e-07, "logits/chosen": -2.269211769104004, "logits/rejected": -1.7002761363983154, "logps/chosen": -417.3943176269531, "logps/rejected": -251.10452270507812, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 0.18305334448814392, "rewards/margins": 3.427292585372925, "rewards/rejected": -3.244239330291748, "step": 376 }, { "epoch": 1.89, "learning_rate": 3.098930013082223e-07, "logits/chosen": -2.286076545715332, "logits/rejected": -1.7661198377609253, "logps/chosen": -369.5989990234375, "logps/rejected": -242.8919677734375, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 0.016552099958062172, "rewards/margins": 2.8397443294525146, "rewards/rejected": -2.8231921195983887, "step": 377 }, { "epoch": 1.9, "learning_rate": 3.0742493183550454e-07, "logits/chosen": -2.340603828430176, "logits/rejected": -1.7560714483261108, "logps/chosen": -416.6077575683594, "logps/rejected": -274.50592041015625, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 0.07405523210763931, "rewards/margins": 3.3490467071533203, "rewards/rejected": -3.274991750717163, "step": 378 }, { "epoch": 1.9, "learning_rate": 3.049623595954766e-07, "logits/chosen": -2.31630802154541, "logits/rejected": -1.8046125173568726, "logps/chosen": -385.2266845703125, "logps/rejected": -274.4510192871094, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -0.006834276020526886, "rewards/margins": 3.373891830444336, "rewards/rejected": -3.3807260990142822, "step": 379 }, { "epoch": 1.91, "learning_rate": 3.0250535488453073e-07, "logits/chosen": -2.3981449604034424, "logits/rejected": -1.792945146560669, "logps/chosen": -369.78216552734375, "logps/rejected": -256.83331298828125, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 0.12055398523807526, "rewards/margins": 3.4099745750427246, "rewards/rejected": -3.2894206047058105, "step": 380 }, { "epoch": 1.91, "learning_rate": 3.000539878401296e-07, "logits/chosen": -2.2984890937805176, "logits/rejected": -1.7400116920471191, "logps/chosen": -467.20819091796875, "logps/rejected": -294.92926025390625, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 0.13502071797847748, "rewards/margins": 3.9661505222320557, "rewards/rejected": -3.831129550933838, "step": 381 }, { "epoch": 1.92, "learning_rate": 2.976083284388031e-07, "logits/chosen": -2.3894083499908447, "logits/rejected": -1.7546674013137817, "logps/chosen": -423.69708251953125, "logps/rejected": -245.5006103515625, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 0.249348446726799, "rewards/margins": 3.4605207443237305, "rewards/rejected": -3.211172342300415, "step": 382 }, { "epoch": 1.92, "learning_rate": 2.951684464941516e-07, "logits/chosen": -2.297816753387451, "logits/rejected": -1.6849899291992188, "logps/chosen": -394.0736389160156, "logps/rejected": -244.73028564453125, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 0.18101859092712402, "rewards/margins": 3.3211042881011963, "rewards/rejected": -3.1400856971740723, "step": 383 }, { "epoch": 1.93, "learning_rate": 2.9273441165485225e-07, "logits/chosen": -2.2631728649139404, "logits/rejected": -1.6951870918273926, "logps/chosen": -395.10345458984375, "logps/rejected": -260.34735107421875, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 0.09220480173826218, "rewards/margins": 3.576016902923584, "rewards/rejected": -3.4838123321533203, "step": 384 }, { "epoch": 1.93, "learning_rate": 2.903062934026716e-07, "logits/chosen": -2.3323588371276855, "logits/rejected": -1.7334794998168945, "logps/chosen": -445.0806884765625, "logps/rejected": -270.95166015625, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 0.2552070617675781, "rewards/margins": 3.8895459175109863, "rewards/rejected": -3.634338855743408, "step": 385 }, { "epoch": 1.94, "learning_rate": 2.8788416105048117e-07, "logits/chosen": -2.34346604347229, "logits/rejected": -1.74708092212677, "logps/chosen": -431.78900146484375, "logps/rejected": -247.22415161132812, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 0.1859680414199829, "rewards/margins": 3.255077362060547, "rewards/rejected": -3.0691092014312744, "step": 386 }, { "epoch": 1.94, "learning_rate": 2.854680837402804e-07, "logits/chosen": -2.3314437866210938, "logits/rejected": -1.8017703294754028, "logps/chosen": -399.75555419921875, "logps/rejected": -294.3941955566406, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 0.08767624199390411, "rewards/margins": 3.6176881790161133, "rewards/rejected": -3.5300118923187256, "step": 387 }, { "epoch": 1.95, "learning_rate": 2.8305813044122093e-07, "logits/chosen": -2.26304030418396, "logits/rejected": -1.700200080871582, "logps/chosen": -445.2792663574219, "logps/rejected": -270.3061828613281, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 0.06615829467773438, "rewards/margins": 3.6837263107299805, "rewards/rejected": -3.617568016052246, "step": 388 }, { "epoch": 1.95, "learning_rate": 2.806543699476396e-07, "logits/chosen": -2.314140796661377, "logits/rejected": -1.7564080953598022, "logps/chosen": -437.03662109375, "logps/rejected": -294.1673583984375, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 0.00982004776597023, "rewards/margins": 3.792118787765503, "rewards/rejected": -3.7822985649108887, "step": 389 }, { "epoch": 1.96, "learning_rate": 2.782568708770933e-07, "logits/chosen": -2.273528575897217, "logits/rejected": -1.7875734567642212, "logps/chosen": -376.58538818359375, "logps/rejected": -269.906494140625, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 0.21372833847999573, "rewards/margins": 3.7537269592285156, "rewards/rejected": -3.539998769760132, "step": 390 }, { "epoch": 1.96, "learning_rate": 2.758657016684015e-07, "logits/chosen": -2.296980142593384, "logits/rejected": -1.7033805847167969, "logps/chosen": -364.1384582519531, "logps/rejected": -232.78109741210938, "loss": 0.0524, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.05850372463464737, "rewards/margins": 3.264252185821533, "rewards/rejected": -3.2057485580444336, "step": 391 }, { "epoch": 1.97, "learning_rate": 2.734809305796915e-07, "logits/chosen": -2.205900192260742, "logits/rejected": -1.6673946380615234, "logps/chosen": -373.4482727050781, "logps/rejected": -265.5444641113281, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 0.019626427441835403, "rewards/margins": 3.343919515609741, "rewards/rejected": -3.3242931365966797, "step": 392 }, { "epoch": 1.97, "learning_rate": 2.7110262568645057e-07, "logits/chosen": -2.3122575283050537, "logits/rejected": -1.7881789207458496, "logps/chosen": -434.0072021484375, "logps/rejected": -269.29412841796875, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 0.16516494750976562, "rewards/margins": 3.523787498474121, "rewards/rejected": -3.3586223125457764, "step": 393 }, { "epoch": 1.98, "learning_rate": 2.687308548795825e-07, "logits/chosen": -2.3441076278686523, "logits/rejected": -1.7119675874710083, "logps/chosen": -417.5376281738281, "logps/rejected": -246.445556640625, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 0.02519378624856472, "rewards/margins": 3.4510011672973633, "rewards/rejected": -3.425806999206543, "step": 394 }, { "epoch": 1.98, "learning_rate": 2.6636568586346897e-07, "logits/chosen": -2.3571932315826416, "logits/rejected": -1.7105283737182617, "logps/chosen": -455.7955017089844, "logps/rejected": -267.76788330078125, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 0.17906291782855988, "rewards/margins": 3.746137857437134, "rewards/rejected": -3.56707501411438, "step": 395 }, { "epoch": 1.99, "learning_rate": 2.640071861540385e-07, "logits/chosen": -2.352797269821167, "logits/rejected": -1.7296719551086426, "logps/chosen": -424.06121826171875, "logps/rejected": -278.6806640625, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 0.1542510986328125, "rewards/margins": 3.8389341831207275, "rewards/rejected": -3.684682846069336, "step": 396 }, { "epoch": 1.99, "learning_rate": 2.616554230768374e-07, "logits/chosen": -2.4059829711914062, "logits/rejected": -1.781857967376709, "logps/chosen": -434.9884033203125, "logps/rejected": -295.478515625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 0.11314164102077484, "rewards/margins": 3.8363025188446045, "rewards/rejected": -3.723160982131958, "step": 397 }, { "epoch": 2.0, "learning_rate": 2.593104637651087e-07, "logits/chosen": -2.2264084815979004, "logits/rejected": -1.6888924837112427, "logps/chosen": -338.0196533203125, "logps/rejected": -236.68585205078125, "loss": 0.0586, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.012078093364834785, "rewards/margins": 3.042238712310791, "rewards/rejected": -3.054316759109497, "step": 398 }, { "epoch": 2.0, "learning_rate": 2.569723751578756e-07, "logits/chosen": -2.437995672225952, "logits/rejected": -1.8169077634811401, "logps/chosen": -445.16070556640625, "logps/rejected": -277.7601623535156, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 0.1816602200269699, "rewards/margins": 3.7010343074798584, "rewards/rejected": -3.519374132156372, "step": 399 }, { "epoch": 2.01, "learning_rate": 2.5464122399803123e-07, "logits/chosen": -2.3263964653015137, "logits/rejected": -1.7180159091949463, "logps/chosen": -469.0960388183594, "logps/rejected": -283.8651428222656, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 0.048313647508621216, "rewards/margins": 3.5613222122192383, "rewards/rejected": -3.5130083560943604, "step": 400 } ], "logging_steps": 1, "max_steps": 598, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }