{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999375663357682, "eval_steps": 1000, "global_step": 4004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024973465692701506, "grad_norm": 0.609375, "learning_rate": 1.2468827930174565e-08, "logits/chosen": -0.33114343881607056, "logits/rejected": -0.24089118838310242, "logps/chosen": -44.38773727416992, "logps/rejected": -68.85894775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0024973465692701507, "grad_norm": 0.609375, "learning_rate": 1.2468827930174566e-07, "logits/chosen": -0.4296959638595581, "logits/rejected": -0.34308701753616333, "logps/chosen": -43.235145568847656, "logps/rejected": -80.90267944335938, "loss": 0.6931, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0005310365813784301, "rewards/margins": 9.072302782442421e-05, "rewards/rejected": 0.0004403134807944298, "step": 10 }, { "epoch": 0.004994693138540301, "grad_norm": 0.7890625, "learning_rate": 2.493765586034913e-07, "logits/chosen": -0.4125714898109436, "logits/rejected": -0.3169251084327698, "logps/chosen": -42.952693939208984, "logps/rejected": -78.09742736816406, "loss": 0.6928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.000491735409013927, "rewards/margins": 0.0007636584923602641, "rewards/rejected": -0.0012553940759971738, "step": 20 }, { "epoch": 0.0074920397078104516, "grad_norm": 0.50390625, "learning_rate": 3.7406483790523695e-07, "logits/chosen": -0.4181899130344391, "logits/rejected": -0.3332025110721588, "logps/chosen": -44.16044235229492, "logps/rejected": -71.77767181396484, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00046690466115251184, "rewards/margins": 0.0001037311740219593, "rewards/rejected": -0.0005706357769668102, "step": 30 }, { "epoch": 0.009989386277080603, "grad_norm": 0.66796875, "learning_rate": 4.987531172069826e-07, "logits/chosen": -0.43510785698890686, "logits/rejected": -0.34268879890441895, "logps/chosen": -43.815826416015625, "logps/rejected": -80.65787506103516, "loss": 0.6923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0002724333025980741, "rewards/margins": 0.0016190242022275925, "rewards/rejected": -0.0013465910451486707, "step": 40 }, { "epoch": 0.012486732846350752, "grad_norm": 0.96484375, "learning_rate": 6.234413965087283e-07, "logits/chosen": -0.4381956160068512, "logits/rejected": -0.32796674966812134, "logps/chosen": -43.25028610229492, "logps/rejected": -77.0926742553711, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0003348322061356157, "rewards/margins": 0.0009277343633584678, "rewards/rejected": -0.0005929021863266826, "step": 50 }, { "epoch": 0.014984079415620903, "grad_norm": 0.76171875, "learning_rate": 7.481296758104739e-07, "logits/chosen": -0.4058023989200592, "logits/rejected": -0.31743547320365906, "logps/chosen": -43.332618713378906, "logps/rejected": -81.47147369384766, "loss": 0.6923, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0009701108792796731, "rewards/margins": 0.0017434615874662995, "rewards/rejected": -0.0007733507081866264, "step": 60 }, { "epoch": 0.017481425984891052, "grad_norm": 0.419921875, "learning_rate": 8.728179551122195e-07, "logits/chosen": -0.40699687600135803, "logits/rejected": -0.33324044942855835, "logps/chosen": -42.81806182861328, "logps/rejected": -69.8255844116211, "loss": 0.6918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0005619878647848964, "rewards/margins": 0.0026690722443163395, "rewards/rejected": -0.0021070842631161213, "step": 70 }, { "epoch": 0.019978772554161205, "grad_norm": 1.1171875, "learning_rate": 9.975062344139653e-07, "logits/chosen": -0.4060027003288269, "logits/rejected": -0.31257936358451843, "logps/chosen": -43.606048583984375, "logps/rejected": -74.920654296875, "loss": 0.691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0015056885313242674, "rewards/margins": 0.004372184630483389, "rewards/rejected": -0.0028664960991591215, "step": 80 }, { "epoch": 0.022476119123431355, "grad_norm": 0.515625, "learning_rate": 1.1221945137157108e-06, "logits/chosen": -0.39966338872909546, "logits/rejected": -0.3299568295478821, "logps/chosen": -42.9066047668457, "logps/rejected": -67.96953582763672, "loss": 0.6899, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0013856906443834305, "rewards/margins": 0.00656374916434288, "rewards/rejected": -0.00517805851995945, "step": 90 }, { "epoch": 0.024973465692701504, "grad_norm": 0.73046875, "learning_rate": 1.2468827930174565e-06, "logits/chosen": -0.4090539515018463, "logits/rejected": -0.31136512756347656, "logps/chosen": -44.48310089111328, "logps/rejected": -84.36518096923828, "loss": 0.688, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0026647746562957764, "rewards/margins": 0.010295169427990913, "rewards/rejected": -0.00763039430603385, "step": 100 }, { "epoch": 0.027470812261971653, "grad_norm": 0.6484375, "learning_rate": 1.3715710723192023e-06, "logits/chosen": -0.40336164832115173, "logits/rejected": -0.3107188642024994, "logps/chosen": -43.532264709472656, "logps/rejected": -78.85313415527344, "loss": 0.6869, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0021588439121842384, "rewards/margins": 0.012583871372044086, "rewards/rejected": -0.010425028391182423, "step": 110 }, { "epoch": 0.029968158831241806, "grad_norm": 0.62109375, "learning_rate": 1.4962593516209478e-06, "logits/chosen": -0.44534754753112793, "logits/rejected": -0.3525586724281311, "logps/chosen": -43.2638053894043, "logps/rejected": -73.06291961669922, "loss": 0.6844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0026553391944617033, "rewards/margins": 0.017566664144396782, "rewards/rejected": -0.014911326579749584, "step": 120 }, { "epoch": 0.032465505400511956, "grad_norm": 0.734375, "learning_rate": 1.6209476309226935e-06, "logits/chosen": -0.4450170397758484, "logits/rejected": -0.356881707906723, "logps/chosen": -43.020606994628906, "logps/rejected": -74.61629486083984, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.004791858606040478, "rewards/margins": 0.030012447386980057, "rewards/rejected": -0.025220584124326706, "step": 130 }, { "epoch": 0.034962851969782105, "grad_norm": 0.52734375, "learning_rate": 1.745635910224439e-06, "logits/chosen": -0.4518131613731384, "logits/rejected": -0.35589107871055603, "logps/chosen": -44.260887145996094, "logps/rejected": -89.10530853271484, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.0043911924585700035, "rewards/margins": 0.03277025744318962, "rewards/rejected": -0.02837906777858734, "step": 140 }, { "epoch": 0.037460198539052254, "grad_norm": 0.470703125, "learning_rate": 1.8703241895261848e-06, "logits/chosen": -0.39934635162353516, "logits/rejected": -0.32321810722351074, "logps/chosen": -41.51782989501953, "logps/rejected": -74.04359436035156, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.009082725271582603, "rewards/margins": 0.04699797183275223, "rewards/rejected": -0.037915244698524475, "step": 150 }, { "epoch": 0.03995754510832241, "grad_norm": 0.703125, "learning_rate": 1.9950124688279305e-06, "logits/chosen": -0.40700763463974, "logits/rejected": -0.33096417784690857, "logps/chosen": -42.31119918823242, "logps/rejected": -82.06526947021484, "loss": 0.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.010724621824920177, "rewards/margins": 0.059142522513866425, "rewards/rejected": -0.04841790720820427, "step": 160 }, { "epoch": 0.04245489167759256, "grad_norm": 0.443359375, "learning_rate": 2.119700748129676e-06, "logits/chosen": -0.4063618779182434, "logits/rejected": -0.31147629022598267, "logps/chosen": -43.24675750732422, "logps/rejected": -74.64862060546875, "loss": 0.6545, "rewards/accuracies": 1.0, "rewards/chosen": 0.009759762324392796, "rewards/margins": 0.07911679148674011, "rewards/rejected": -0.0693570226430893, "step": 170 }, { "epoch": 0.04495223824686271, "grad_norm": 0.369140625, "learning_rate": 2.2443890274314216e-06, "logits/chosen": -0.3992343842983246, "logits/rejected": -0.30265265703201294, "logps/chosen": -42.563804626464844, "logps/rejected": -89.08007049560547, "loss": 0.6496, "rewards/accuracies": 1.0, "rewards/chosen": 0.016897384077310562, "rewards/margins": 0.08930385112762451, "rewards/rejected": -0.07240646332502365, "step": 180 }, { "epoch": 0.04744958481613286, "grad_norm": 0.388671875, "learning_rate": 2.3690773067331675e-06, "logits/chosen": -0.393463671207428, "logits/rejected": -0.29836633801460266, "logps/chosen": -42.574581146240234, "logps/rejected": -81.52467346191406, "loss": 0.6448, "rewards/accuracies": 1.0, "rewards/chosen": 0.01014000829309225, "rewards/margins": 0.09943968802690506, "rewards/rejected": -0.08929967135190964, "step": 190 }, { "epoch": 0.04994693138540301, "grad_norm": 0.423828125, "learning_rate": 2.493765586034913e-06, "logits/chosen": -0.3705541491508484, "logits/rejected": -0.2776363492012024, "logps/chosen": -43.70671844482422, "logps/rejected": -79.51457214355469, "loss": 0.635, "rewards/accuracies": 1.0, "rewards/chosen": 0.011818965896964073, "rewards/margins": 0.12041501700878143, "rewards/rejected": -0.10859604924917221, "step": 200 }, { "epoch": 0.05244427795467316, "grad_norm": 0.400390625, "learning_rate": 2.6184538653366586e-06, "logits/chosen": -0.3588668704032898, "logits/rejected": -0.27049878239631653, "logps/chosen": -41.42917251586914, "logps/rejected": -80.90901947021484, "loss": 0.6249, "rewards/accuracies": 1.0, "rewards/chosen": 0.012408060021698475, "rewards/margins": 0.14213527739048004, "rewards/rejected": -0.12972721457481384, "step": 210 }, { "epoch": 0.05494162452394331, "grad_norm": 0.376953125, "learning_rate": 2.7431421446384045e-06, "logits/chosen": -0.32904312014579773, "logits/rejected": -0.24068386852741241, "logps/chosen": -42.47250747680664, "logps/rejected": -89.7323226928711, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 0.011426225304603577, "rewards/margins": 0.13695809245109558, "rewards/rejected": -0.125531867146492, "step": 220 }, { "epoch": 0.05743897109321346, "grad_norm": 0.4921875, "learning_rate": 2.86783042394015e-06, "logits/chosen": -0.3509990870952606, "logits/rejected": -0.26932188868522644, "logps/chosen": -41.524696350097656, "logps/rejected": -84.92623901367188, "loss": 0.6128, "rewards/accuracies": 1.0, "rewards/chosen": 0.016151348128914833, "rewards/margins": 0.16833417117595673, "rewards/rejected": -0.15218281745910645, "step": 230 }, { "epoch": 0.05993631766248361, "grad_norm": 0.494140625, "learning_rate": 2.9925187032418956e-06, "logits/chosen": -0.3635261356830597, "logits/rejected": -0.26545146107673645, "logps/chosen": -43.126625061035156, "logps/rejected": -89.85209655761719, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 0.004656613804399967, "rewards/margins": 0.19621676206588745, "rewards/rejected": -0.19156016409397125, "step": 240 }, { "epoch": 0.06243366423175376, "grad_norm": 0.435546875, "learning_rate": 3.117206982543641e-06, "logits/chosen": -0.3245137929916382, "logits/rejected": -0.22121305763721466, "logps/chosen": -42.189552307128906, "logps/rejected": -91.37117767333984, "loss": 0.5891, "rewards/accuracies": 1.0, "rewards/chosen": 0.009633781388401985, "rewards/margins": 0.22138457000255585, "rewards/rejected": -0.2117508202791214, "step": 250 }, { "epoch": 0.06493101080102391, "grad_norm": 0.76171875, "learning_rate": 3.241895261845387e-06, "logits/chosen": -0.3298066258430481, "logits/rejected": -0.2243480682373047, "logps/chosen": -41.658103942871094, "logps/rejected": -97.98243713378906, "loss": 0.5608, "rewards/accuracies": 1.0, "rewards/chosen": 0.014573690481483936, "rewards/margins": 0.28632912039756775, "rewards/rejected": -0.2717553973197937, "step": 260 }, { "epoch": 0.06742835737029407, "grad_norm": 0.84765625, "learning_rate": 3.3665835411471326e-06, "logits/chosen": -0.274959921836853, "logits/rejected": -0.15127086639404297, "logps/chosen": -42.591773986816406, "logps/rejected": -109.750732421875, "loss": 0.5226, "rewards/accuracies": 1.0, "rewards/chosen": 0.010403521358966827, "rewards/margins": 0.3793814182281494, "rewards/rejected": -0.36897793412208557, "step": 270 }, { "epoch": 0.06992570393956421, "grad_norm": 1.265625, "learning_rate": 3.491271820448878e-06, "logits/chosen": -0.26599782705307007, "logits/rejected": -0.14541617035865784, "logps/chosen": -43.68675994873047, "logps/rejected": -131.44851684570312, "loss": 0.4865, "rewards/accuracies": 1.0, "rewards/chosen": 0.009956231340765953, "rewards/margins": 0.4785284399986267, "rewards/rejected": -0.4685722291469574, "step": 280 }, { "epoch": 0.07242305050883437, "grad_norm": 1.8828125, "learning_rate": 3.615960099750624e-06, "logits/chosen": -0.2336564064025879, "logits/rejected": -0.09945651143789291, "logps/chosen": -42.83462142944336, "logps/rejected": -155.51699829101562, "loss": 0.3811, "rewards/accuracies": 1.0, "rewards/chosen": 0.010509507730603218, "rewards/margins": 0.8152663111686707, "rewards/rejected": -0.804756760597229, "step": 290 }, { "epoch": 0.07492039707810451, "grad_norm": 1.484375, "learning_rate": 3.7406483790523696e-06, "logits/chosen": -0.187991201877594, "logits/rejected": -0.015538264997303486, "logps/chosen": -53.440765380859375, "logps/rejected": -225.8020477294922, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": -0.09434106200933456, "rewards/margins": 1.4534434080123901, "rewards/rejected": -1.5477845668792725, "step": 300 }, { "epoch": 0.07741774364737466, "grad_norm": 0.94140625, "learning_rate": 3.8653366583541155e-06, "logits/chosen": -0.10128624737262726, "logits/rejected": 0.10228855907917023, "logps/chosen": -66.81230163574219, "logps/rejected": -350.84722900390625, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": -0.22738003730773926, "rewards/margins": 2.590130567550659, "rewards/rejected": -2.8175110816955566, "step": 310 }, { "epoch": 0.07991509021664482, "grad_norm": 0.66015625, "learning_rate": 3.990024937655861e-06, "logits/chosen": -0.06341538578271866, "logits/rejected": 0.19574430584907532, "logps/chosen": -63.102256774902344, "logps/rejected": -506.8706970214844, "loss": 0.1113, "rewards/accuracies": 1.0, "rewards/chosen": -0.19701920449733734, "rewards/margins": 4.0467329025268555, "rewards/rejected": -4.2437520027160645, "step": 320 }, { "epoch": 0.08241243678591496, "grad_norm": 0.703125, "learning_rate": 4.114713216957607e-06, "logits/chosen": 0.03162340074777603, "logits/rejected": 0.2972305417060852, "logps/chosen": -55.010414123535156, "logps/rejected": -444.8564453125, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": -0.1186632290482521, "rewards/margins": 3.6549084186553955, "rewards/rejected": -3.773571729660034, "step": 330 }, { "epoch": 0.08490978335518512, "grad_norm": 0.48046875, "learning_rate": 4.239401496259352e-06, "logits/chosen": 0.05119480937719345, "logits/rejected": 0.35537463426589966, "logps/chosen": -52.077064514160156, "logps/rejected": -547.7008056640625, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": -0.08894447982311249, "rewards/margins": 4.584813117980957, "rewards/rejected": -4.673757076263428, "step": 340 }, { "epoch": 0.08740712992445526, "grad_norm": 0.1796875, "learning_rate": 4.364089775561098e-06, "logits/chosen": 0.17311367392539978, "logits/rejected": 0.49543648958206177, "logps/chosen": -59.5765380859375, "logps/rejected": -548.73876953125, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": -0.1556529700756073, "rewards/margins": 4.651310443878174, "rewards/rejected": -4.806963920593262, "step": 350 }, { "epoch": 0.08990447649372542, "grad_norm": 0.216796875, "learning_rate": 4.488778054862843e-06, "logits/chosen": 0.13120940327644348, "logits/rejected": 0.5331201553344727, "logps/chosen": -59.65636444091797, "logps/rejected": -660.1800537109375, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -0.15369465947151184, "rewards/margins": 5.723788738250732, "rewards/rejected": -5.877484321594238, "step": 360 }, { "epoch": 0.09240182306299556, "grad_norm": 0.90625, "learning_rate": 4.6134663341645895e-06, "logits/chosen": 0.25265592336654663, "logits/rejected": 0.6620725989341736, "logps/chosen": -61.8662109375, "logps/rejected": -688.4534912109375, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": -0.18499073386192322, "rewards/margins": 5.994035720825195, "rewards/rejected": -6.179026126861572, "step": 370 }, { "epoch": 0.09489916963226572, "grad_norm": 1.0703125, "learning_rate": 4.738154613466335e-06, "logits/chosen": 0.21470895409584045, "logits/rejected": 0.7127342224121094, "logps/chosen": -76.92805480957031, "logps/rejected": -1027.8746337890625, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.3377203345298767, "rewards/margins": 9.173759460449219, "rewards/rejected": -9.511480331420898, "step": 380 }, { "epoch": 0.09739651620153587, "grad_norm": 0.013671875, "learning_rate": 4.862842892768081e-06, "logits/chosen": 0.2577429413795471, "logits/rejected": 0.7398630380630493, "logps/chosen": -87.82744598388672, "logps/rejected": -828.4269409179688, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -0.44494834542274475, "rewards/margins": 7.14224100112915, "rewards/rejected": -7.5871901512146, "step": 390 }, { "epoch": 0.09989386277080602, "grad_norm": 0.326171875, "learning_rate": 4.987531172069826e-06, "logits/chosen": 0.312338262796402, "logits/rejected": 0.8188611268997192, "logps/chosen": -83.17652130126953, "logps/rejected": -929.5695190429688, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -0.39534783363342285, "rewards/margins": 8.192750930786133, "rewards/rejected": -8.588098526000977, "step": 400 }, { "epoch": 0.10239120934007617, "grad_norm": 0.2412109375, "learning_rate": 4.999923022460671e-06, "logits/chosen": 0.2771604657173157, "logits/rejected": 0.875481903553009, "logps/chosen": -71.91412353515625, "logps/rejected": -1142.7008056640625, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.29563969373703003, "rewards/margins": 10.368195533752441, "rewards/rejected": -10.663835525512695, "step": 410 }, { "epoch": 0.10488855590934631, "grad_norm": 0.361328125, "learning_rate": 4.999656933348981e-06, "logits/chosen": 0.3595578372478485, "logits/rejected": 0.8383792638778687, "logps/chosen": -77.22844696044922, "logps/rejected": -818.2454833984375, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -0.3375477194786072, "rewards/margins": 7.169804573059082, "rewards/rejected": -7.50735330581665, "step": 420 }, { "epoch": 0.10738590247861647, "grad_norm": 0.006805419921875, "learning_rate": 4.99920080255011e-06, "logits/chosen": 0.3072226047515869, "logits/rejected": 0.9365525245666504, "logps/chosen": -69.96758270263672, "logps/rejected": -1107.283447265625, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -0.24378785490989685, "rewards/margins": 9.968598365783691, "rewards/rejected": -10.212385177612305, "step": 430 }, { "epoch": 0.10988324904788661, "grad_norm": 0.9375, "learning_rate": 4.998554664742362e-06, "logits/chosen": 0.386096328496933, "logits/rejected": 0.9298421740531921, "logps/chosen": -77.93641662597656, "logps/rejected": -925.2971801757812, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -0.3427829146385193, "rewards/margins": 8.217004776000977, "rewards/rejected": -8.559788703918457, "step": 440 }, { "epoch": 0.11238059561715677, "grad_norm": 0.162109375, "learning_rate": 4.997718569049726e-06, "logits/chosen": 0.38062307238578796, "logits/rejected": 0.9510132670402527, "logps/chosen": -62.76348876953125, "logps/rejected": -1039.5853271484375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -0.19411073625087738, "rewards/margins": 9.47815227508545, "rewards/rejected": -9.672263145446777, "step": 450 }, { "epoch": 0.11487794218642693, "grad_norm": 0.0625, "learning_rate": 4.9966925790381404e-06, "logits/chosen": 0.4757159352302551, "logits/rejected": 1.018425703048706, "logps/chosen": -81.46342468261719, "logps/rejected": -907.7916870117188, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -0.38752180337905884, "rewards/margins": 7.994225978851318, "rewards/rejected": -8.381747245788574, "step": 460 }, { "epoch": 0.11737528875569707, "grad_norm": 0.39453125, "learning_rate": 4.995476772710657e-06, "logits/chosen": 0.40233319997787476, "logits/rejected": 1.0515995025634766, "logps/chosen": -82.72390747070312, "logps/rejected": -1207.831787109375, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -0.37652480602264404, "rewards/margins": 10.886858940124512, "rewards/rejected": -11.263383865356445, "step": 470 }, { "epoch": 0.11987263532496722, "grad_norm": 0.109375, "learning_rate": 4.994071242501516e-06, "logits/chosen": 0.4317776560783386, "logits/rejected": 1.0796253681182861, "logps/chosen": -62.16728591918945, "logps/rejected": -1022.6038208007812, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -0.18862931430339813, "rewards/margins": 9.357450485229492, "rewards/rejected": -9.546079635620117, "step": 480 }, { "epoch": 0.12236998189423737, "grad_norm": 3.3527612686157227e-06, "learning_rate": 4.992476095269112e-06, "logits/chosen": 0.4001534581184387, "logits/rejected": 0.9869491457939148, "logps/chosen": -64.50323486328125, "logps/rejected": -1058.342041015625, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -0.21038322150707245, "rewards/margins": 9.569865226745605, "rewards/rejected": -9.780248641967773, "step": 490 }, { "epoch": 0.12486732846350752, "grad_norm": 0.166015625, "learning_rate": 4.990691452287877e-06, "logits/chosen": 0.513416051864624, "logits/rejected": 1.122924566268921, "logps/chosen": -86.93208312988281, "logps/rejected": -1010.1483154296875, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -0.43185362219810486, "rewards/margins": 8.921982765197754, "rewards/rejected": -9.353837966918945, "step": 500 }, { "epoch": 0.12736467503277768, "grad_norm": 0.1806640625, "learning_rate": 4.988717449239056e-06, "logits/chosen": 0.5288435220718384, "logits/rejected": 1.184326410293579, "logps/chosen": -75.24764251708984, "logps/rejected": -1083.439697265625, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -0.31985238194465637, "rewards/margins": 9.819514274597168, "rewards/rejected": -10.139368057250977, "step": 510 }, { "epoch": 0.12986202160204782, "grad_norm": 0.05126953125, "learning_rate": 4.98655423620039e-06, "logits/chosen": 0.45140591263771057, "logits/rejected": 1.135999083518982, "logps/chosen": -66.84233093261719, "logps/rejected": -1121.6982421875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.23821225762367249, "rewards/margins": 10.265534400939941, "rewards/rejected": -10.503746032714844, "step": 520 }, { "epoch": 0.13235936817131796, "grad_norm": 0.0130615234375, "learning_rate": 4.984201977634711e-06, "logits/chosen": 0.44299745559692383, "logits/rejected": 1.2149170637130737, "logps/chosen": -74.1891860961914, "logps/rejected": -1348.673095703125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -0.30132222175598145, "rewards/margins": 12.441837310791016, "rewards/rejected": -12.74316120147705, "step": 530 }, { "epoch": 0.13485671474058814, "grad_norm": 0.1982421875, "learning_rate": 4.9816608523774345e-06, "logits/chosen": 0.4906342625617981, "logits/rejected": 1.1866085529327393, "logps/chosen": -61.67924880981445, "logps/rejected": -1052.905029296875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.1749764382839203, "rewards/margins": 9.563148498535156, "rewards/rejected": -9.73812484741211, "step": 540 }, { "epoch": 0.13735406130985828, "grad_norm": 0.02099609375, "learning_rate": 4.978931053622964e-06, "logits/chosen": 0.5177958607673645, "logits/rejected": 1.2569612264633179, "logps/chosen": -70.76200866699219, "logps/rejected": -1278.626708984375, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.27674826979637146, "rewards/margins": 11.792952537536621, "rewards/rejected": -12.069701194763184, "step": 550 }, { "epoch": 0.13985140787912842, "grad_norm": 0.0003528594970703125, "learning_rate": 4.9760127889100044e-06, "logits/chosen": 0.5248929262161255, "logits/rejected": 1.2501459121704102, "logps/chosen": -81.47541809082031, "logps/rejected": -1154.671630859375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.3913446366786957, "rewards/margins": 10.44621467590332, "rewards/rejected": -10.837559700012207, "step": 560 }, { "epoch": 0.1423487544483986, "grad_norm": 0.09423828125, "learning_rate": 4.972906280105781e-06, "logits/chosen": 0.5316249132156372, "logits/rejected": 1.3082139492034912, "logps/chosen": -88.09521484375, "logps/rejected": -1197.23486328125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -0.4347292482852936, "rewards/margins": 10.762666702270508, "rewards/rejected": -11.197395324707031, "step": 570 }, { "epoch": 0.14484610101766873, "grad_norm": 0.0106201171875, "learning_rate": 4.969611763389175e-06, "logits/chosen": 0.5327505469322205, "logits/rejected": 1.3031514883041382, "logps/chosen": -73.19583129882812, "logps/rejected": -1114.89990234375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -0.30240899324417114, "rewards/margins": 10.148442268371582, "rewards/rejected": -10.450851440429688, "step": 580 }, { "epoch": 0.14734344758693887, "grad_norm": 0.0751953125, "learning_rate": 4.966129489232762e-06, "logits/chosen": 0.47109970450401306, "logits/rejected": 1.3012760877609253, "logps/chosen": -71.85210418701172, "logps/rejected": -1336.1424560546875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.277540385723114, "rewards/margins": 12.241964340209961, "rewards/rejected": -12.51950454711914, "step": 590 }, { "epoch": 0.14984079415620902, "grad_norm": 0.193359375, "learning_rate": 4.962459722383775e-06, "logits/chosen": 0.4269895553588867, "logits/rejected": 1.1828067302703857, "logps/chosen": -71.79056549072266, "logps/rejected": -1337.454345703125, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -0.2726261019706726, "rewards/margins": 12.248571395874023, "rewards/rejected": -12.521197319030762, "step": 600 }, { "epoch": 0.1523381407254792, "grad_norm": 0.09619140625, "learning_rate": 4.958602741843975e-06, "logits/chosen": 0.4595261216163635, "logits/rejected": 1.3015968799591064, "logps/chosen": -78.28643035888672, "logps/rejected": -1252.5814208984375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.3374534249305725, "rewards/margins": 11.378314018249512, "rewards/rejected": -11.715767860412598, "step": 610 }, { "epoch": 0.15483548729474933, "grad_norm": 0.048583984375, "learning_rate": 4.954558840848437e-06, "logits/chosen": 0.5825181007385254, "logits/rejected": 1.3565789461135864, "logps/chosen": -76.19590759277344, "logps/rejected": -1119.151611328125, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.3366764783859253, "rewards/margins": 10.178030967712402, "rewards/rejected": -10.514707565307617, "step": 620 }, { "epoch": 0.15733283386401947, "grad_norm": 0.1572265625, "learning_rate": 4.950328326843258e-06, "logits/chosen": 0.5459114909172058, "logits/rejected": 1.3613156080245972, "logps/chosen": -82.85084533691406, "logps/rejected": -1277.3514404296875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.3968349099159241, "rewards/margins": 11.52591323852539, "rewards/rejected": -11.922747611999512, "step": 630 }, { "epoch": 0.15983018043328964, "grad_norm": 0.005096435546875, "learning_rate": 4.945911521462182e-06, "logits/chosen": 0.5720694065093994, "logits/rejected": 1.411368727684021, "logps/chosen": -80.45169067382812, "logps/rejected": -1358.6873779296875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3696327209472656, "rewards/margins": 12.431825637817383, "rewards/rejected": -12.801457405090332, "step": 640 }, { "epoch": 0.16232752700255978, "grad_norm": 0.1708984375, "learning_rate": 4.941308760502149e-06, "logits/chosen": 0.5064912438392639, "logits/rejected": 1.2029752731323242, "logps/chosen": -67.90926361083984, "logps/rejected": -1026.6124267578125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -0.2300504744052887, "rewards/margins": 9.079760551452637, "rewards/rejected": -9.30981159210205, "step": 650 }, { "epoch": 0.16482487357182993, "grad_norm": 0.1787109375, "learning_rate": 4.936520393897762e-06, "logits/chosen": 0.4909030497074127, "logits/rejected": 1.2922523021697998, "logps/chosen": -69.4020004272461, "logps/rejected": -1298.668212890625, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -0.25019845366477966, "rewards/margins": 11.917684555053711, "rewards/rejected": -12.167882919311523, "step": 660 }, { "epoch": 0.16732222014110007, "grad_norm": 0.00469970703125, "learning_rate": 4.931546785694684e-06, "logits/chosen": 0.5053218007087708, "logits/rejected": 1.4669103622436523, "logps/chosen": -86.32283782958984, "logps/rejected": -1483.82763671875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.42769041657447815, "rewards/margins": 13.680854797363281, "rewards/rejected": -14.108546257019043, "step": 670 }, { "epoch": 0.16981956671037024, "grad_norm": 0.0238037109375, "learning_rate": 4.926388314021964e-06, "logits/chosen": 0.6257452368736267, "logits/rejected": 1.5271151065826416, "logps/chosen": -92.75479888916016, "logps/rejected": -1237.61474609375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.4862573742866516, "rewards/margins": 11.188620567321777, "rewards/rejected": -11.67487907409668, "step": 680 }, { "epoch": 0.17231691327964038, "grad_norm": 0.000385284423828125, "learning_rate": 4.921045371063283e-06, "logits/chosen": 0.584161102771759, "logits/rejected": 1.478125810623169, "logps/chosen": -89.634033203125, "logps/rejected": -1360.25537109375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.46992653608322144, "rewards/margins": 12.42024040222168, "rewards/rejected": -12.890167236328125, "step": 690 }, { "epoch": 0.17481425984891052, "grad_norm": 0.1318359375, "learning_rate": 4.915518363027142e-06, "logits/chosen": 0.5938104391098022, "logits/rejected": 1.4910205602645874, "logps/chosen": -73.86201477050781, "logps/rejected": -1182.0716552734375, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.3045103847980499, "rewards/margins": 10.839627265930176, "rewards/rejected": -11.144137382507324, "step": 700 }, { "epoch": 0.1773116064181807, "grad_norm": 0.08740234375, "learning_rate": 4.909807710115977e-06, "logits/chosen": 0.543526828289032, "logits/rejected": 1.467707633972168, "logps/chosen": -81.62191009521484, "logps/rejected": -1380.295166015625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.38441941142082214, "rewards/margins": 12.690566062927246, "rewards/rejected": -13.074984550476074, "step": 710 }, { "epoch": 0.17980895298745084, "grad_norm": 0.025146484375, "learning_rate": 4.903913846494211e-06, "logits/chosen": 0.4768219590187073, "logits/rejected": 1.4783326387405396, "logps/chosen": -79.43184661865234, "logps/rejected": -1673.644287109375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3560616374015808, "rewards/margins": 15.518526077270508, "rewards/rejected": -15.87458610534668, "step": 720 }, { "epoch": 0.18230629955672098, "grad_norm": 0.08154296875, "learning_rate": 4.897837220255251e-06, "logits/chosen": 0.5687042474746704, "logits/rejected": 1.4331896305084229, "logps/chosen": -86.38923645019531, "logps/rejected": -1329.374755859375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.4307560920715332, "rewards/margins": 12.039878845214844, "rewards/rejected": -12.470634460449219, "step": 730 }, { "epoch": 0.18480364612599112, "grad_norm": 0.0101318359375, "learning_rate": 4.891578293387413e-06, "logits/chosen": 0.604946494102478, "logits/rejected": 1.5590946674346924, "logps/chosen": -80.6954345703125, "logps/rejected": -1429.257080078125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -0.37448957562446594, "rewards/margins": 13.214704513549805, "rewards/rejected": -13.589195251464844, "step": 740 }, { "epoch": 0.1873009926952613, "grad_norm": 0.07177734375, "learning_rate": 4.885137541738808e-06, "logits/chosen": 0.5679504871368408, "logits/rejected": 1.4625308513641357, "logps/chosen": -74.07333374023438, "logps/rejected": -1173.9857177734375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.3022187054157257, "rewards/margins": 10.615083694458008, "rewards/rejected": -10.917302131652832, "step": 750 }, { "epoch": 0.18979833926453143, "grad_norm": 0.000659942626953125, "learning_rate": 4.878515454981153e-06, "logits/chosen": 0.5600544214248657, "logits/rejected": 1.5554611682891846, "logps/chosen": -95.30448913574219, "logps/rejected": -1539.63427734375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.5141724348068237, "rewards/margins": 14.041679382324219, "rewards/rejected": -14.555851936340332, "step": 760 }, { "epoch": 0.19229568583380158, "grad_norm": 0.154296875, "learning_rate": 4.8717125365725545e-06, "logits/chosen": 0.6704256534576416, "logits/rejected": 1.511325716972351, "logps/chosen": -78.95833587646484, "logps/rejected": -1096.5010986328125, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.35167983174324036, "rewards/margins": 9.9315185546875, "rewards/rejected": -10.283197402954102, "step": 770 }, { "epoch": 0.19479303240307175, "grad_norm": 0.04443359375, "learning_rate": 4.864729303719221e-06, "logits/chosen": 0.49029749631881714, "logits/rejected": 1.4827202558517456, "logps/chosen": -78.9704360961914, "logps/rejected": -1541.809326171875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.34706807136535645, "rewards/margins": 14.254959106445312, "rewards/rejected": -14.602025985717773, "step": 780 }, { "epoch": 0.1972903789723419, "grad_norm": 0.11328125, "learning_rate": 4.857566287336152e-06, "logits/chosen": 0.5910658836364746, "logits/rejected": 1.5483187437057495, "logps/chosen": -99.37945556640625, "logps/rejected": -1432.4755859375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -0.5464431047439575, "rewards/margins": 12.95887565612793, "rewards/rejected": -13.505319595336914, "step": 790 }, { "epoch": 0.19978772554161203, "grad_norm": 0.0011444091796875, "learning_rate": 4.850224032006765e-06, "logits/chosen": 0.6179195642471313, "logits/rejected": 1.5901352167129517, "logps/chosen": -81.73147583007812, "logps/rejected": -1412.5142822265625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.38581180572509766, "rewards/margins": 13.030293464660645, "rewards/rejected": -13.416107177734375, "step": 800 }, { "epoch": 0.20228507211088217, "grad_norm": 0.1357421875, "learning_rate": 4.8427030959414984e-06, "logits/chosen": 0.5971755385398865, "logits/rejected": 1.6486015319824219, "logps/chosen": -74.70821380615234, "logps/rejected": -1532.8193359375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.3184259831905365, "rewards/margins": 14.300427436828613, "rewards/rejected": -14.618852615356445, "step": 810 }, { "epoch": 0.20478241868015234, "grad_norm": 0.1689453125, "learning_rate": 4.835004050935369e-06, "logits/chosen": 0.6013739705085754, "logits/rejected": 1.4875710010528564, "logps/chosen": -74.9230728149414, "logps/rejected": -1462.062255859375, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -0.31746476888656616, "rewards/margins": 13.55724811553955, "rewards/rejected": -13.874712944030762, "step": 820 }, { "epoch": 0.2072797652494225, "grad_norm": 0.0279541015625, "learning_rate": 4.8271274823245e-06, "logits/chosen": 0.6184748411178589, "logits/rejected": 1.5838812589645386, "logps/chosen": -74.03543853759766, "logps/rejected": -1436.349853515625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.29689908027648926, "rewards/margins": 13.336532592773438, "rewards/rejected": -13.63343334197998, "step": 830 }, { "epoch": 0.20977711181869263, "grad_norm": 0.0888671875, "learning_rate": 4.8190739889416264e-06, "logits/chosen": 0.6181553602218628, "logits/rejected": 1.6880038976669312, "logps/chosen": -73.8195571899414, "logps/rejected": -1627.819091796875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3135436177253723, "rewards/margins": 15.254382133483887, "rewards/rejected": -15.567927360534668, "step": 840 }, { "epoch": 0.2122744583879628, "grad_norm": 0.0703125, "learning_rate": 4.810844183070553e-06, "logits/chosen": 0.5540085434913635, "logits/rejected": 1.6211318969726562, "logps/chosen": -72.0704116821289, "logps/rejected": -1341.570068359375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.29530271887779236, "rewards/margins": 12.359551429748535, "rewards/rejected": -12.654852867126465, "step": 850 }, { "epoch": 0.21477180495723294, "grad_norm": 0.1435546875, "learning_rate": 4.802438690399622e-06, "logits/chosen": 0.600739598274231, "logits/rejected": 1.643431305885315, "logps/chosen": -70.41534423828125, "logps/rejected": -1476.7939453125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.26624736189842224, "rewards/margins": 13.719133377075195, "rewards/rejected": -13.985379219055176, "step": 860 }, { "epoch": 0.21726915152650308, "grad_norm": 0.000701904296875, "learning_rate": 4.793858149974129e-06, "logits/chosen": 0.6058120727539062, "logits/rejected": 1.721599817276001, "logps/chosen": -79.14794921875, "logps/rejected": -1739.1048583984375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.3524496853351593, "rewards/margins": 16.286325454711914, "rewards/rejected": -16.638776779174805, "step": 870 }, { "epoch": 0.21976649809577323, "grad_norm": 0.047119140625, "learning_rate": 4.785103214147747e-06, "logits/chosen": 0.6141168475151062, "logits/rejected": 1.7174230813980103, "logps/chosen": -77.48551940917969, "logps/rejected": -1538.275634765625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.34762924909591675, "rewards/margins": 14.33294677734375, "rewards/rejected": -14.680575370788574, "step": 880 }, { "epoch": 0.2222638446650434, "grad_norm": 0.000667572021484375, "learning_rate": 4.776174548532926e-06, "logits/chosen": 0.6287072896957397, "logits/rejected": 1.6851770877838135, "logps/chosen": -77.02873229980469, "logps/rejected": -1560.05810546875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3427741825580597, "rewards/margins": 14.53984260559082, "rewards/rejected": -14.882616996765137, "step": 890 }, { "epoch": 0.22476119123431354, "grad_norm": 0.07080078125, "learning_rate": 4.767072831950288e-06, "logits/chosen": 0.617357611656189, "logits/rejected": 1.7079490423202515, "logps/chosen": -77.236572265625, "logps/rejected": -1560.383544921875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3377893567085266, "rewards/margins": 14.535722732543945, "rewards/rejected": -14.873510360717773, "step": 900 }, { "epoch": 0.22725853780358368, "grad_norm": 0.01129150390625, "learning_rate": 4.7577987563770226e-06, "logits/chosen": 0.6263229250907898, "logits/rejected": 1.674384355545044, "logps/chosen": -81.606201171875, "logps/rejected": -1556.177490234375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.37518563866615295, "rewards/margins": 14.350992202758789, "rewards/rejected": -14.726178169250488, "step": 910 }, { "epoch": 0.22975588437285385, "grad_norm": 0.00042724609375, "learning_rate": 4.748353026894273e-06, "logits/chosen": 0.63312166929245, "logits/rejected": 1.7262632846832275, "logps/chosen": -73.91940307617188, "logps/rejected": -1473.6871337890625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.29600244760513306, "rewards/margins": 13.644391059875488, "rewards/rejected": -13.940393447875977, "step": 920 }, { "epoch": 0.232253230942124, "grad_norm": 0.00836181640625, "learning_rate": 4.738736361633532e-06, "logits/chosen": 0.6512018442153931, "logits/rejected": 1.6386451721191406, "logps/chosen": -75.5640869140625, "logps/rejected": -1397.0894775390625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.3240968585014343, "rewards/margins": 12.940801620483398, "rewards/rejected": -13.264900207519531, "step": 930 }, { "epoch": 0.23475057751139414, "grad_norm": 0.00250244140625, "learning_rate": 4.728949491722046e-06, "logits/chosen": 0.6666821837425232, "logits/rejected": 1.6732994318008423, "logps/chosen": -76.89160919189453, "logps/rejected": -1383.048095703125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.3491043150424957, "rewards/margins": 12.806585311889648, "rewards/rejected": -13.155691146850586, "step": 940 }, { "epoch": 0.2372479240806643, "grad_norm": 0.07421875, "learning_rate": 4.718993161227231e-06, "logits/chosen": 0.5883976221084595, "logits/rejected": 1.7654300928115845, "logps/chosen": -84.6438217163086, "logps/rejected": -1749.6275634765625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.3954206109046936, "rewards/margins": 16.34560775756836, "rewards/rejected": -16.74102783203125, "step": 950 }, { "epoch": 0.23974527064993445, "grad_norm": 0.006500244140625, "learning_rate": 4.708868127100098e-06, "logits/chosen": 0.666793167591095, "logits/rejected": 1.6790577173233032, "logps/chosen": -75.14250183105469, "logps/rejected": -1408.4847412109375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.3262697458267212, "rewards/margins": 13.068046569824219, "rewards/rejected": -13.394315719604492, "step": 960 }, { "epoch": 0.2422426172192046, "grad_norm": 0.0001659393310546875, "learning_rate": 4.6985751591177075e-06, "logits/chosen": 0.62273108959198, "logits/rejected": 1.7520809173583984, "logps/chosen": -87.58243560791016, "logps/rejected": -1661.821533203125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.4351579546928406, "rewards/margins": 15.40393352508545, "rewards/rejected": -15.839093208312988, "step": 970 }, { "epoch": 0.24473996378847473, "grad_norm": 0.00885009765625, "learning_rate": 4.688115039824648e-06, "logits/chosen": 0.6803555488586426, "logits/rejected": 1.7338272333145142, "logps/chosen": -88.861328125, "logps/rejected": -1561.404541015625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.4565040171146393, "rewards/margins": 14.439603805541992, "rewards/rejected": -14.896108627319336, "step": 980 }, { "epoch": 0.2472373103577449, "grad_norm": 5.424022674560547e-06, "learning_rate": 4.677488564473535e-06, "logits/chosen": 0.6470680832862854, "logits/rejected": 1.818428635597229, "logps/chosen": -92.59998321533203, "logps/rejected": -1680.185791015625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.4968494772911072, "rewards/margins": 15.579751968383789, "rewards/rejected": -16.076602935791016, "step": 990 }, { "epoch": 0.24973465692701505, "grad_norm": 0.03173828125, "learning_rate": 4.666696540964556e-06, "logits/chosen": 0.7138900756835938, "logits/rejected": 1.7771879434585571, "logps/chosen": -99.16166687011719, "logps/rejected": -1503.458984375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.551328182220459, "rewards/margins": 13.802647590637207, "rewards/rejected": -14.353976249694824, "step": 1000 }, { "epoch": 0.24973465692701505, "eval_logits/chosen": 0.7451997995376587, "eval_logits/rejected": 1.5489047765731812, "eval_logps/chosen": -80.45184326171875, "eval_logps/rejected": -830.6234130859375, "eval_loss": 0.006477854214608669, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.36095961928367615, "eval_rewards/margins": 7.334376335144043, "eval_rewards/rejected": -7.695336818695068, "eval_runtime": 0.619, "eval_samples_per_second": 8.077, "eval_steps_per_second": 8.077, "step": 1000 }, { "epoch": 0.2522320034962852, "grad_norm": 0.0003871917724609375, "learning_rate": 4.6557397897840454e-06, "logits/chosen": 0.693498969078064, "logits/rejected": 1.753259301185608, "logps/chosen": -102.29890441894531, "logps/rejected": -1543.0269775390625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.583904504776001, "rewards/margins": 14.051568984985352, "rewards/rejected": -14.635473251342773, "step": 1010 }, { "epoch": 0.25472935006555536, "grad_norm": 0.0003032684326171875, "learning_rate": 4.644619143942108e-06, "logits/chosen": 0.5707821249961853, "logits/rejected": 1.7276995182037354, "logps/chosen": -88.48922729492188, "logps/rejected": -1698.140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.44621315598487854, "rewards/margins": 15.652883529663086, "rewards/rejected": -16.099096298217773, "step": 1020 }, { "epoch": 0.2572266966348255, "grad_norm": 0.076171875, "learning_rate": 4.633335448909284e-06, "logits/chosen": 0.658003032207489, "logits/rejected": 1.7504956722259521, "logps/chosen": -89.60042572021484, "logps/rejected": -1624.2664794921875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4533475339412689, "rewards/margins": 14.999029159545898, "rewards/rejected": -15.452377319335938, "step": 1030 }, { "epoch": 0.25972404320409564, "grad_norm": 0.0308837890625, "learning_rate": 4.621889562552272e-06, "logits/chosen": 0.6270695924758911, "logits/rejected": 1.8431812524795532, "logps/chosen": -105.17597961425781, "logps/rejected": -1880.988037109375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.6098282933235168, "rewards/margins": 17.456111907958984, "rewards/rejected": -18.065940856933594, "step": 1040 }, { "epoch": 0.2622213897733658, "grad_norm": 3.886222839355469e-05, "learning_rate": 4.610282355068707e-06, "logits/chosen": 0.577286422252655, "logits/rejected": 1.723755121231079, "logps/chosen": -109.06168365478516, "logps/rejected": -1825.4671630859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6522443890571594, "rewards/margins": 16.784276962280273, "rewards/rejected": -17.4365234375, "step": 1050 }, { "epoch": 0.26471873634263593, "grad_norm": 0.078125, "learning_rate": 4.598514708921006e-06, "logits/chosen": 0.6790138483047485, "logits/rejected": 1.8542677164077759, "logps/chosen": -101.43669128417969, "logps/rejected": -1781.3316650390625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.5860346555709839, "rewards/margins": 16.492624282836914, "rewards/rejected": -17.078659057617188, "step": 1060 }, { "epoch": 0.26721608291190607, "grad_norm": 0.0002994537353515625, "learning_rate": 4.5865875187692695e-06, "logits/chosen": 0.6998518109321594, "logits/rejected": 1.8515506982803345, "logps/chosen": -90.31967163085938, "logps/rejected": -1559.104736328125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.46949252486228943, "rewards/margins": 14.420549392700195, "rewards/rejected": -14.890042304992676, "step": 1070 }, { "epoch": 0.26971342948117627, "grad_norm": 0.04296875, "learning_rate": 4.57450169140327e-06, "logits/chosen": 0.6541129350662231, "logits/rejected": 1.944573163986206, "logps/chosen": -94.61952209472656, "logps/rejected": -1980.4349365234375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.5051703453063965, "rewards/margins": 18.555133819580078, "rewards/rejected": -19.060306549072266, "step": 1080 }, { "epoch": 0.2722107760504464, "grad_norm": 0.0250244140625, "learning_rate": 4.562258145673507e-06, "logits/chosen": 0.624032199382782, "logits/rejected": 1.8918708562850952, "logps/chosen": -105.9957275390625, "logps/rejected": -1974.2584228515625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.6295837759971619, "rewards/margins": 18.393098831176758, "rewards/rejected": -19.022680282592773, "step": 1090 }, { "epoch": 0.27470812261971655, "grad_norm": 0.0026092529296875, "learning_rate": 4.549857812421353e-06, "logits/chosen": 0.64922696352005, "logits/rejected": 1.8475501537322998, "logps/chosen": -89.56166076660156, "logps/rejected": -1663.010986328125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.46423882246017456, "rewards/margins": 15.462870597839355, "rewards/rejected": -15.92711067199707, "step": 1100 }, { "epoch": 0.2772054691889867, "grad_norm": 0.0084228515625, "learning_rate": 4.537301634408281e-06, "logits/chosen": 0.6925086975097656, "logits/rejected": 1.7738326787948608, "logps/chosen": -85.5665512084961, "logps/rejected": -1501.200439453125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4182400703430176, "rewards/margins": 13.875646591186523, "rewards/rejected": -14.293886184692383, "step": 1110 }, { "epoch": 0.27970281575825684, "grad_norm": 0.0732421875, "learning_rate": 4.52459056624419e-06, "logits/chosen": 0.7249046564102173, "logits/rejected": 1.795248031616211, "logps/chosen": -102.11865997314453, "logps/rejected": -1670.522216796875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.5903103351593018, "rewards/margins": 15.393072128295898, "rewards/rejected": -15.983380317687988, "step": 1120 }, { "epoch": 0.282200162327527, "grad_norm": 0.06494140625, "learning_rate": 4.51172557431483e-06, "logits/chosen": 0.6399365663528442, "logits/rejected": 1.7282390594482422, "logps/chosen": -105.56150817871094, "logps/rejected": -1699.7261962890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6241176724433899, "rewards/margins": 15.525866508483887, "rewards/rejected": -16.14998435974121, "step": 1130 }, { "epoch": 0.2846975088967972, "grad_norm": 0.054443359375, "learning_rate": 4.49870763670833e-06, "logits/chosen": 0.6207230687141418, "logits/rejected": 1.878230333328247, "logps/chosen": -95.18315124511719, "logps/rejected": -1844.5869140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5091260671615601, "rewards/margins": 17.207239151000977, "rewards/rejected": -17.71636390686035, "step": 1140 }, { "epoch": 0.2871948554660673, "grad_norm": 0.06591796875, "learning_rate": 4.4855377431408335e-06, "logits/chosen": 0.6628460884094238, "logits/rejected": 1.7432994842529297, "logps/chosen": -109.8287124633789, "logps/rejected": -1642.527099609375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6535288095474243, "rewards/margins": 14.924878120422363, "rewards/rejected": -15.578405380249023, "step": 1150 }, { "epoch": 0.28969220203533746, "grad_norm": 0.005096435546875, "learning_rate": 4.472216894881261e-06, "logits/chosen": 0.6672796010971069, "logits/rejected": 1.738856315612793, "logps/chosen": -89.06207275390625, "logps/rejected": -1540.048583984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.46248659491539, "rewards/margins": 14.235052108764648, "rewards/rejected": -14.697538375854492, "step": 1160 }, { "epoch": 0.2921895486046076, "grad_norm": 0.047607421875, "learning_rate": 4.4587461046751815e-06, "logits/chosen": 0.6774252653121948, "logits/rejected": 1.7696081399917603, "logps/chosen": -81.09745788574219, "logps/rejected": -1620.4942626953125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.38098251819610596, "rewards/margins": 15.126324653625488, "rewards/rejected": -15.507307052612305, "step": 1170 }, { "epoch": 0.29468689517387775, "grad_norm": 0.008544921875, "learning_rate": 4.44512639666781e-06, "logits/chosen": 0.6951876878738403, "logits/rejected": 1.7908748388290405, "logps/chosen": -76.070068359375, "logps/rejected": -1503.576416015625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.32203441858291626, "rewards/margins": 14.029817581176758, "rewards/rejected": -14.351852416992188, "step": 1180 }, { "epoch": 0.2971842417431479, "grad_norm": 0.890625, "learning_rate": 4.431358806326158e-06, "logits/chosen": 0.5664582848548889, "logits/rejected": 1.6636062860488892, "logps/chosen": -77.29322814941406, "logps/rejected": -1649.8541259765625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.32812389731407166, "rewards/margins": 15.302160263061523, "rewards/rejected": -15.630284309387207, "step": 1190 }, { "epoch": 0.29968158831241803, "grad_norm": 0.765625, "learning_rate": 4.4174443803603e-06, "logits/chosen": 0.7006584405899048, "logits/rejected": 1.8050626516342163, "logps/chosen": -101.17628479003906, "logps/rejected": -1638.1669921875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.5764530301094055, "rewards/margins": 15.088088989257812, "rewards/rejected": -15.664541244506836, "step": 1200 }, { "epoch": 0.30217893488168823, "grad_norm": 0.045166015625, "learning_rate": 4.4033841766438e-06, "logits/chosen": 0.5828436613082886, "logits/rejected": 1.6060895919799805, "logps/chosen": -79.80711364746094, "logps/rejected": -1499.616943359375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.35304346680641174, "rewards/margins": 13.869488716125488, "rewards/rejected": -14.222529411315918, "step": 1210 }, { "epoch": 0.3046762814509584, "grad_norm": 0.0005035400390625, "learning_rate": 4.389179264133281e-06, "logits/chosen": 0.6191063523292542, "logits/rejected": 1.7791473865509033, "logps/chosen": -74.18888854980469, "logps/rejected": -1617.736572265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.31279683113098145, "rewards/margins": 15.179719924926758, "rewards/rejected": -15.492517471313477, "step": 1220 }, { "epoch": 0.3071736280202285, "grad_norm": 0.10205078125, "learning_rate": 4.374830722787159e-06, "logits/chosen": 0.5632847547531128, "logits/rejected": 1.709839105606079, "logps/chosen": -71.81803894042969, "logps/rejected": -1728.1201171875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.2758982479572296, "rewards/margins": 16.288021087646484, "rewards/rejected": -16.563919067382812, "step": 1230 }, { "epoch": 0.30967097458949866, "grad_norm": 0.08984375, "learning_rate": 4.360339643483533e-06, "logits/chosen": 0.5613077878952026, "logits/rejected": 1.6441980600357056, "logps/chosen": -71.94982147216797, "logps/rejected": -1693.345947265625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.27554553747177124, "rewards/margins": 15.830523490905762, "rewards/rejected": -16.106069564819336, "step": 1240 }, { "epoch": 0.3121683211587688, "grad_norm": 0.00022411346435546875, "learning_rate": 4.345707127937253e-06, "logits/chosen": 0.5210096836090088, "logits/rejected": 1.81972336769104, "logps/chosen": -72.21741485595703, "logps/rejected": -1985.244873046875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.2866365611553192, "rewards/margins": 18.848371505737305, "rewards/rejected": -19.135007858276367, "step": 1250 }, { "epoch": 0.31466566772803894, "grad_norm": 0.0087890625, "learning_rate": 4.330934288616154e-06, "logits/chosen": 0.6370071172714233, "logits/rejected": 1.816506028175354, "logps/chosen": -77.37321472167969, "logps/rejected": -1700.324951171875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.33770519495010376, "rewards/margins": 15.96662712097168, "rewards/rejected": -16.304332733154297, "step": 1260 }, { "epoch": 0.3171630142973091, "grad_norm": 0.043701171875, "learning_rate": 4.316022248656485e-06, "logits/chosen": 0.5022410154342651, "logits/rejected": 1.5646175146102905, "logps/chosen": -71.90743255615234, "logps/rejected": -1543.3675537109375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2772095799446106, "rewards/margins": 14.190910339355469, "rewards/rejected": -14.468118667602539, "step": 1270 }, { "epoch": 0.3196603608665793, "grad_norm": 0.01177978515625, "learning_rate": 4.3009721417775166e-06, "logits/chosen": 0.5786353349685669, "logits/rejected": 1.7490533590316772, "logps/chosen": -76.0634536743164, "logps/rejected": -1822.3128662109375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.33201277256011963, "rewards/margins": 17.15824317932129, "rewards/rejected": -17.49025535583496, "step": 1280 }, { "epoch": 0.3221577074358494, "grad_norm": 0.006378173828125, "learning_rate": 4.285785112195346e-06, "logits/chosen": 0.5005044341087341, "logits/rejected": 1.6343857049942017, "logps/chosen": -79.80322265625, "logps/rejected": -1840.0863037109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.360287070274353, "rewards/margins": 17.2882080078125, "rewards/rejected": -17.648494720458984, "step": 1290 }, { "epoch": 0.32465505400511957, "grad_norm": 0.0218505859375, "learning_rate": 4.27046231453591e-06, "logits/chosen": 0.5454662442207336, "logits/rejected": 1.7699216604232788, "logps/chosen": -73.63867950439453, "logps/rejected": -1785.0687255859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3033478856086731, "rewards/margins": 16.751049041748047, "rewards/rejected": -17.054393768310547, "step": 1300 }, { "epoch": 0.3271524005743897, "grad_norm": 3.4458935260772705e-08, "learning_rate": 4.255004913747196e-06, "logits/chosen": 0.5776439905166626, "logits/rejected": 1.7879540920257568, "logps/chosen": -73.9155502319336, "logps/rejected": -1853.062744140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3109303414821625, "rewards/margins": 17.477283477783203, "rewards/rejected": -17.78821563720703, "step": 1310 }, { "epoch": 0.32964974714365985, "grad_norm": 0.0025787353515625, "learning_rate": 4.2394140850106825e-06, "logits/chosen": 0.5839067697525024, "logits/rejected": 1.7082710266113281, "logps/chosen": -79.85846710205078, "logps/rejected": -1769.608154296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.36045509576797485, "rewards/margins": 16.53636360168457, "rewards/rejected": -16.896818161010742, "step": 1320 }, { "epoch": 0.33214709371293, "grad_norm": 0.049072265625, "learning_rate": 4.223691013651986e-06, "logits/chosen": 0.4838981032371521, "logits/rejected": 1.5925347805023193, "logps/chosen": -78.32035827636719, "logps/rejected": -1797.3795166015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.33012980222702026, "rewards/margins": 16.690799713134766, "rewards/rejected": -17.02092933654785, "step": 1330 }, { "epoch": 0.33464444028220014, "grad_norm": 0.07421875, "learning_rate": 4.207836895050748e-06, "logits/chosen": 0.5183486342430115, "logits/rejected": 1.813838243484497, "logps/chosen": -78.78418731689453, "logps/rejected": -2234.604248046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.35720717906951904, "rewards/margins": 21.229446411132812, "rewards/rejected": -21.586654663085938, "step": 1340 }, { "epoch": 0.33714178685147034, "grad_norm": 0.034912109375, "learning_rate": 4.1918529345497525e-06, "logits/chosen": 0.6919676661491394, "logits/rejected": 1.7151187658309937, "logps/chosen": -73.4660415649414, "logps/rejected": -1405.526123046875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.2937282919883728, "rewards/margins": 13.013958930969238, "rewards/rejected": -13.307687759399414, "step": 1350 }, { "epoch": 0.3396391334207405, "grad_norm": 0.0673828125, "learning_rate": 4.175740347363289e-06, "logits/chosen": 0.58757483959198, "logits/rejected": 1.651389718055725, "logps/chosen": -76.53401947021484, "logps/rejected": -1477.0704345703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.3171566426753998, "rewards/margins": 13.620218276977539, "rewards/rejected": -13.93737506866455, "step": 1360 }, { "epoch": 0.3421364799900106, "grad_norm": 0.00014400482177734375, "learning_rate": 4.159500358484759e-06, "logits/chosen": 0.5347609519958496, "logits/rejected": 1.775714635848999, "logps/chosen": -80.72816467285156, "logps/rejected": -2101.271240234375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.37172675132751465, "rewards/margins": 19.874296188354492, "rewards/rejected": -20.246021270751953, "step": 1370 }, { "epoch": 0.34463382655928076, "grad_norm": 0.08544921875, "learning_rate": 4.143134202593549e-06, "logits/chosen": 0.624781608581543, "logits/rejected": 1.6769899129867554, "logps/chosen": -73.6835708618164, "logps/rejected": -1517.2562255859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.28604602813720703, "rewards/margins": 14.03973388671875, "rewards/rejected": -14.325779914855957, "step": 1380 }, { "epoch": 0.3471311731285509, "grad_norm": 0.000701904296875, "learning_rate": 4.126643123961158e-06, "logits/chosen": 0.5619300007820129, "logits/rejected": 1.733264684677124, "logps/chosen": -85.01399230957031, "logps/rejected": -1977.3433837890625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4228588938713074, "rewards/margins": 18.610515594482422, "rewards/rejected": -19.033374786376953, "step": 1390 }, { "epoch": 0.34962851969782105, "grad_norm": 0.054931640625, "learning_rate": 4.110028376356599e-06, "logits/chosen": 0.6396419405937195, "logits/rejected": 1.709763765335083, "logps/chosen": -78.08328247070312, "logps/rejected": -1361.017822265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.32667404413223267, "rewards/margins": 12.484647750854492, "rewards/rejected": -12.811322212219238, "step": 1400 }, { "epoch": 0.3521258662670912, "grad_norm": 0.0230712890625, "learning_rate": 4.093291222951079e-06, "logits/chosen": 0.59341961145401, "logits/rejected": 1.863669991493225, "logps/chosen": -88.14995574951172, "logps/rejected": -1872.766845703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.4488135874271393, "rewards/margins": 17.484996795654297, "rewards/rejected": -17.93381118774414, "step": 1410 }, { "epoch": 0.3546232128363614, "grad_norm": 0.058837890625, "learning_rate": 4.076432936221965e-06, "logits/chosen": 0.7002652287483215, "logits/rejected": 1.8483015298843384, "logps/chosen": -82.04905700683594, "logps/rejected": -1564.48388671875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.38699063658714294, "rewards/margins": 14.584707260131836, "rewards/rejected": -14.971699714660645, "step": 1420 }, { "epoch": 0.35712055940563153, "grad_norm": 0.0228271484375, "learning_rate": 4.059454797856039e-06, "logits/chosen": 0.6757210493087769, "logits/rejected": 1.8517526388168335, "logps/chosen": -79.04359436035156, "logps/rejected": -1546.5145263671875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.36757320165634155, "rewards/margins": 14.412260055541992, "rewards/rejected": -14.77983570098877, "step": 1430 }, { "epoch": 0.3596179059749017, "grad_norm": 2.86102294921875e-05, "learning_rate": 4.042358098652057e-06, "logits/chosen": 0.6149075627326965, "logits/rejected": 1.7819246053695679, "logps/chosen": -79.50230407714844, "logps/rejected": -1636.815673828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.36546218395233154, "rewards/margins": 15.302419662475586, "rewards/rejected": -15.667881965637207, "step": 1440 }, { "epoch": 0.3621152525441718, "grad_norm": 0.00115203857421875, "learning_rate": 4.025144138422615e-06, "logits/chosen": 0.6270621418952942, "logits/rejected": 1.8290207386016846, "logps/chosen": -94.74217224121094, "logps/rejected": -1927.5875244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.5115745663642883, "rewards/margins": 18.033849716186523, "rewards/rejected": -18.54542350769043, "step": 1450 }, { "epoch": 0.36461259911344196, "grad_norm": 0.01483154296875, "learning_rate": 4.007814225895321e-06, "logits/chosen": 0.6495813131332397, "logits/rejected": 1.905644178390503, "logps/chosen": -77.37786865234375, "logps/rejected": -1849.2958984375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.35049691796302795, "rewards/margins": 17.428863525390625, "rewards/rejected": -17.779361724853516, "step": 1460 }, { "epoch": 0.3671099456827121, "grad_norm": 0.030029296875, "learning_rate": 3.990369678613303e-06, "logits/chosen": 0.5495260953903198, "logits/rejected": 1.7052236795425415, "logps/chosen": -80.6564712524414, "logps/rejected": -1763.9212646484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3531542718410492, "rewards/margins": 16.24311637878418, "rewards/rejected": -16.596271514892578, "step": 1470 }, { "epoch": 0.36960729225198224, "grad_norm": 0.005859375, "learning_rate": 3.97281182283504e-06, "logits/chosen": 0.625984251499176, "logits/rejected": 1.8606735467910767, "logps/chosen": -79.91282653808594, "logps/rejected": -2002.6253662109375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.37286117672920227, "rewards/margins": 18.913448333740234, "rewards/rejected": -19.28631019592285, "step": 1480 }, { "epoch": 0.37210463882125244, "grad_norm": 0.0966796875, "learning_rate": 3.955141993433526e-06, "logits/chosen": 0.6155849695205688, "logits/rejected": 1.8059221506118774, "logps/chosen": -83.35047912597656, "logps/rejected": -1705.9954833984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.40526407957077026, "rewards/margins": 15.947067260742188, "rewards/rejected": -16.352331161499023, "step": 1490 }, { "epoch": 0.3746019853905226, "grad_norm": 0.028076171875, "learning_rate": 3.937361533794784e-06, "logits/chosen": 0.6340751647949219, "logits/rejected": 1.7404279708862305, "logps/chosen": -88.5922622680664, "logps/rejected": -1672.023193359375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.4479256570339203, "rewards/margins": 15.48670768737793, "rewards/rejected": -15.934633255004883, "step": 1500 }, { "epoch": 0.3770993319597927, "grad_norm": 0.005706787109375, "learning_rate": 3.919471795715738e-06, "logits/chosen": 0.6204045414924622, "logits/rejected": 1.7921810150146484, "logps/chosen": -76.07710266113281, "logps/rejected": -1581.0501708984375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3369273245334625, "rewards/margins": 14.815042495727539, "rewards/rejected": -15.151969909667969, "step": 1510 }, { "epoch": 0.37959667852906287, "grad_norm": 0.033203125, "learning_rate": 3.901474139301433e-06, "logits/chosen": 0.5973562002182007, "logits/rejected": 1.781730055809021, "logps/chosen": -83.27436065673828, "logps/rejected": -1706.674560546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3964506983757019, "rewards/margins": 15.901094436645508, "rewards/rejected": -16.297544479370117, "step": 1520 }, { "epoch": 0.382094025098333, "grad_norm": 0.020263671875, "learning_rate": 3.883369932861634e-06, "logits/chosen": 0.66780024766922, "logits/rejected": 1.8160803318023682, "logps/chosen": -88.08844757080078, "logps/rejected": -1613.07373046875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.44633254408836365, "rewards/margins": 15.01531982421875, "rewards/rejected": -15.461652755737305, "step": 1530 }, { "epoch": 0.38459137166760315, "grad_norm": 5.245208740234375e-05, "learning_rate": 3.865160552806796e-06, "logits/chosen": 0.6651610136032104, "logits/rejected": 1.8406972885131836, "logps/chosen": -79.48463439941406, "logps/rejected": -1628.993896484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.36581045389175415, "rewards/margins": 15.247782707214355, "rewards/rejected": -15.613592147827148, "step": 1540 }, { "epoch": 0.3870887182368733, "grad_norm": 1.7404556274414062e-05, "learning_rate": 3.84684738354342e-06, "logits/chosen": 0.6299249529838562, "logits/rejected": 1.8017246723175049, "logps/chosen": -78.25436401367188, "logps/rejected": -1699.4664306640625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.34191855788230896, "rewards/margins": 15.943153381347656, "rewards/rejected": -16.285072326660156, "step": 1550 }, { "epoch": 0.3895860648061435, "grad_norm": 0.043212890625, "learning_rate": 3.828431817368798e-06, "logits/chosen": 0.6114810705184937, "logits/rejected": 1.7776029109954834, "logps/chosen": -83.11959075927734, "logps/rejected": -1808.9710693359375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.39526110887527466, "rewards/margins": 16.893442153930664, "rewards/rejected": -17.2887020111084, "step": 1560 }, { "epoch": 0.39208341137541364, "grad_norm": 0.0859375, "learning_rate": 3.8099152543651684e-06, "logits/chosen": 0.5259889364242554, "logits/rejected": 1.8491640090942383, "logps/chosen": -76.72280883789062, "logps/rejected": -1907.3863525390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3375067710876465, "rewards/margins": 18.016918182373047, "rewards/rejected": -18.354427337646484, "step": 1570 }, { "epoch": 0.3945807579446838, "grad_norm": 0.07177734375, "learning_rate": 3.791299102293261e-06, "logits/chosen": 0.5979338884353638, "logits/rejected": 1.8080129623413086, "logps/chosen": -85.99974060058594, "logps/rejected": -1962.4896240234375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4262419641017914, "rewards/margins": 18.461589813232422, "rewards/rejected": -18.88783073425293, "step": 1580 }, { "epoch": 0.3970781045139539, "grad_norm": 0.0078125, "learning_rate": 3.7725847764852774e-06, "logits/chosen": 0.5477781891822815, "logits/rejected": 1.7578785419464111, "logps/chosen": -83.63060760498047, "logps/rejected": -1990.638671875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.39986157417297363, "rewards/margins": 18.64605140686035, "rewards/rejected": -19.045909881591797, "step": 1590 }, { "epoch": 0.39957545108322406, "grad_norm": 0.0067138671875, "learning_rate": 3.7537736997372833e-06, "logits/chosen": 0.5983849167823792, "logits/rejected": 1.6318690776824951, "logps/chosen": -74.38432312011719, "logps/rejected": -1474.78759765625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.30779850482940674, "rewards/margins": 13.598607063293457, "rewards/rejected": -13.906405448913574, "step": 1600 }, { "epoch": 0.4020727976524942, "grad_norm": 0.0035247802734375, "learning_rate": 3.734867302201038e-06, "logits/chosen": 0.620630145072937, "logits/rejected": 1.7149145603179932, "logps/chosen": -75.28178405761719, "logps/rejected": -1552.66650390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.32418856024742126, "rewards/margins": 14.512880325317383, "rewards/rejected": -14.837068557739258, "step": 1610 }, { "epoch": 0.40457014422176435, "grad_norm": 0.04833984375, "learning_rate": 3.7158670212752666e-06, "logits/chosen": 0.609667181968689, "logits/rejected": 1.8217569589614868, "logps/chosen": -75.36375427246094, "logps/rejected": -1846.893798828125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.31798630952835083, "rewards/margins": 17.42896842956543, "rewards/rejected": -17.746957778930664, "step": 1620 }, { "epoch": 0.40706749079103455, "grad_norm": 0.0034332275390625, "learning_rate": 3.696774301496376e-06, "logits/chosen": 0.6272271871566772, "logits/rejected": 1.8513765335083008, "logps/chosen": -76.99528503417969, "logps/rejected": -1668.758056640625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.33525028824806213, "rewards/margins": 15.664929389953613, "rewards/rejected": -16.000181198120117, "step": 1630 }, { "epoch": 0.4095648373603047, "grad_norm": 0.0020599365234375, "learning_rate": 3.677590594428629e-06, "logits/chosen": 0.6275375485420227, "logits/rejected": 1.746649980545044, "logps/chosen": -82.9039535522461, "logps/rejected": -1647.865478515625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.402643620967865, "rewards/margins": 15.365156173706055, "rewards/rejected": -15.767801284790039, "step": 1640 }, { "epoch": 0.41206218392957483, "grad_norm": 0.0001926422119140625, "learning_rate": 3.658317358553794e-06, "logits/chosen": 0.6051415205001831, "logits/rejected": 1.807227373123169, "logps/chosen": -78.21363830566406, "logps/rejected": -1698.2001953125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3519694209098816, "rewards/margins": 15.862031936645508, "rewards/rejected": -16.214000701904297, "step": 1650 }, { "epoch": 0.414559530498845, "grad_norm": 0.05908203125, "learning_rate": 3.638956059160252e-06, "logits/chosen": 0.659566342830658, "logits/rejected": 1.9395606517791748, "logps/chosen": -79.38732147216797, "logps/rejected": -1887.150390625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.369088351726532, "rewards/margins": 17.81894874572754, "rewards/rejected": -18.188034057617188, "step": 1660 }, { "epoch": 0.4170568770681151, "grad_norm": 0.0064697265625, "learning_rate": 3.6195081682315972e-06, "logits/chosen": 0.6888834834098816, "logits/rejected": 1.855298638343811, "logps/chosen": -87.92467498779297, "logps/rejected": -1717.685546875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.45286068320274353, "rewards/margins": 16.04346466064453, "rewards/rejected": -16.496326446533203, "step": 1670 }, { "epoch": 0.41955422363738526, "grad_norm": 0.026123046875, "learning_rate": 3.5999751643347342e-06, "logits/chosen": 0.5452974438667297, "logits/rejected": 1.710627794265747, "logps/chosen": -84.69573974609375, "logps/rejected": -1964.759033203125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4223009943962097, "rewards/margins": 18.474639892578125, "rewards/rejected": -18.896940231323242, "step": 1680 }, { "epoch": 0.4220515702066554, "grad_norm": 0.050537109375, "learning_rate": 3.5803585325074536e-06, "logits/chosen": 0.5890778303146362, "logits/rejected": 1.8013776540756226, "logps/chosen": -78.17984008789062, "logps/rejected": -1845.5576171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3491733968257904, "rewards/margins": 17.39130210876465, "rewards/rejected": -17.740474700927734, "step": 1690 }, { "epoch": 0.4245489167759256, "grad_norm": 0.0849609375, "learning_rate": 3.5606597641455387e-06, "logits/chosen": 0.6665171384811401, "logits/rejected": 1.7867825031280518, "logps/chosen": -82.6786117553711, "logps/rejected": -1745.9921875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.4073053300380707, "rewards/margins": 16.353519439697266, "rewards/rejected": -16.760822296142578, "step": 1700 }, { "epoch": 0.42704626334519574, "grad_norm": 0.054443359375, "learning_rate": 3.540880356889376e-06, "logits/chosen": 0.6666916012763977, "logits/rejected": 1.773199439048767, "logps/chosen": -83.08412170410156, "logps/rejected": -1565.4173583984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.39527103304862976, "rewards/margins": 14.483154296875, "rewards/rejected": -14.878425598144531, "step": 1710 }, { "epoch": 0.4295436099144659, "grad_norm": 0.036865234375, "learning_rate": 3.5210218145100934e-06, "logits/chosen": 0.6796804666519165, "logits/rejected": 1.826575517654419, "logps/chosen": -76.29814147949219, "logps/rejected": -1558.442626953125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.3252686560153961, "rewards/margins": 14.555532455444336, "rewards/rejected": -14.8808012008667, "step": 1720 }, { "epoch": 0.432040956483736, "grad_norm": 0.037841796875, "learning_rate": 3.5010856467952335e-06, "logits/chosen": 0.6157870292663574, "logits/rejected": 1.7099930047988892, "logps/chosen": -81.55091094970703, "logps/rejected": -1618.1546630859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.3764324188232422, "rewards/margins": 14.980290412902832, "rewards/rejected": -15.356722831726074, "step": 1730 }, { "epoch": 0.43453830305300617, "grad_norm": 0.1484375, "learning_rate": 3.4810733694339687e-06, "logits/chosen": 0.5888208150863647, "logits/rejected": 1.7958781719207764, "logps/chosen": -84.57051086425781, "logps/rejected": -1871.361083984375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.4017999768257141, "rewards/margins": 17.520339965820312, "rewards/rejected": -17.922138214111328, "step": 1740 }, { "epoch": 0.4370356496222763, "grad_norm": 0.032958984375, "learning_rate": 3.4609865039018676e-06, "logits/chosen": 0.682072639465332, "logits/rejected": 1.7670128345489502, "logps/chosen": -83.45240783691406, "logps/rejected": -1766.69140625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.40751171112060547, "rewards/margins": 16.532672882080078, "rewards/rejected": -16.940181732177734, "step": 1750 }, { "epoch": 0.43953299619154645, "grad_norm": 0.024658203125, "learning_rate": 3.4408265773452226e-06, "logits/chosen": 0.6357883214950562, "logits/rejected": 1.7647396326065063, "logps/chosen": -75.90863037109375, "logps/rejected": -1793.2620849609375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.32462552189826965, "rewards/margins": 16.89077377319336, "rewards/rejected": -17.21540069580078, "step": 1760 }, { "epoch": 0.44203034276081665, "grad_norm": 0.00022411346435546875, "learning_rate": 3.420595122464942e-06, "logits/chosen": 0.5758832693099976, "logits/rejected": 1.7814972400665283, "logps/chosen": -79.77733612060547, "logps/rejected": -1759.0218505859375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.36440029740333557, "rewards/margins": 16.545846939086914, "rewards/rejected": -16.910245895385742, "step": 1770 }, { "epoch": 0.4445276893300868, "grad_norm": 0.0712890625, "learning_rate": 3.4002936774000284e-06, "logits/chosen": 0.5318555235862732, "logits/rejected": 1.8811362981796265, "logps/chosen": -77.92293548583984, "logps/rejected": -2195.729736328125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3443797826766968, "rewards/margins": 20.8763370513916, "rewards/rejected": -21.22071647644043, "step": 1780 }, { "epoch": 0.44702503589935694, "grad_norm": 0.035888671875, "learning_rate": 3.3799237856106348e-06, "logits/chosen": 0.5407482385635376, "logits/rejected": 1.725608229637146, "logps/chosen": -77.46542358398438, "logps/rejected": -1750.359619140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.34000295400619507, "rewards/margins": 16.431018829345703, "rewards/rejected": -16.77102279663086, "step": 1790 }, { "epoch": 0.4495223824686271, "grad_norm": 0.0003185272216796875, "learning_rate": 3.35948699576072e-06, "logits/chosen": 0.5788090825080872, "logits/rejected": 1.8735895156860352, "logps/chosen": -83.07856750488281, "logps/rejected": -2100.754638671875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3939378559589386, "rewards/margins": 19.880746841430664, "rewards/rejected": -20.274681091308594, "step": 1800 }, { "epoch": 0.4520197290378972, "grad_norm": 0.13671875, "learning_rate": 3.3389848616003085e-06, "logits/chosen": 0.5929907560348511, "logits/rejected": 1.6974430084228516, "logps/chosen": -79.56812286376953, "logps/rejected": -1787.6068115234375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.3681466281414032, "rewards/margins": 16.794055938720703, "rewards/rejected": -17.16220474243164, "step": 1810 }, { "epoch": 0.45451707560716736, "grad_norm": 0.0390625, "learning_rate": 3.3184189418473674e-06, "logits/chosen": 0.5751794576644897, "logits/rejected": 1.7812267541885376, "logps/chosen": -77.59693908691406, "logps/rejected": -1778.1439208984375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3408336043357849, "rewards/margins": 16.739145278930664, "rewards/rejected": -17.079978942871094, "step": 1820 }, { "epoch": 0.45701442217643756, "grad_norm": 0.001373291015625, "learning_rate": 3.2977908000692925e-06, "logits/chosen": 0.5487096905708313, "logits/rejected": 1.74447500705719, "logps/chosen": -80.25045013427734, "logps/rejected": -1946.0765380859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3701193928718567, "rewards/margins": 18.401947021484375, "rewards/rejected": -18.77206802368164, "step": 1830 }, { "epoch": 0.4595117687457077, "grad_norm": 0.048095703125, "learning_rate": 3.2771020045640435e-06, "logits/chosen": 0.6444208025932312, "logits/rejected": 1.6972076892852783, "logps/chosen": -78.00311279296875, "logps/rejected": -1579.7557373046875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3551621735095978, "rewards/margins": 14.743237495422363, "rewards/rejected": -15.09839916229248, "step": 1840 }, { "epoch": 0.46200911531497785, "grad_norm": 0.07177734375, "learning_rate": 3.256354128240907e-06, "logits/chosen": 0.6255194544792175, "logits/rejected": 1.7124531269073486, "logps/chosen": -85.12455749511719, "logps/rejected": -1608.01171875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.401846319437027, "rewards/margins": 14.84345531463623, "rewards/rejected": -15.245302200317383, "step": 1850 }, { "epoch": 0.464506461884248, "grad_norm": 1.191438059322536e-10, "learning_rate": 3.235548748500914e-06, "logits/chosen": 0.5620906352996826, "logits/rejected": 1.8212181329727173, "logps/chosen": -78.25764465332031, "logps/rejected": -1836.809326171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3476276397705078, "rewards/margins": 17.330211639404297, "rewards/rejected": -17.677841186523438, "step": 1860 }, { "epoch": 0.46700380845351813, "grad_norm": 0.038330078125, "learning_rate": 3.214687447116913e-06, "logits/chosen": 0.5774132609367371, "logits/rejected": 1.7261114120483398, "logps/chosen": -76.27984619140625, "logps/rejected": -1707.5220947265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3342064321041107, "rewards/margins": 15.912832260131836, "rewards/rejected": -16.247039794921875, "step": 1870 }, { "epoch": 0.4695011550227883, "grad_norm": 0.002532958984375, "learning_rate": 3.193771810113313e-06, "logits/chosen": 0.5532559752464294, "logits/rejected": 1.8629133701324463, "logps/chosen": -79.43685150146484, "logps/rejected": -2138.56884765625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3567715287208557, "rewards/margins": 20.29428482055664, "rewards/rejected": -20.65105628967285, "step": 1880 }, { "epoch": 0.4719985015920584, "grad_norm": 0.018310546875, "learning_rate": 3.1728034276455032e-06, "logits/chosen": 0.6407243609428406, "logits/rejected": 1.7773427963256836, "logps/chosen": -75.46717834472656, "logps/rejected": -1624.9876708984375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3326609432697296, "rewards/margins": 15.156428337097168, "rewards/rejected": -15.489087104797363, "step": 1890 }, { "epoch": 0.4744958481613286, "grad_norm": 0.001556396484375, "learning_rate": 3.1517838938789597e-06, "logits/chosen": 0.5151150822639465, "logits/rejected": 1.6990512609481812, "logps/chosen": -79.35011291503906, "logps/rejected": -1993.761962890625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.3504992127418518, "rewards/margins": 18.6396427154541, "rewards/rejected": -18.990140914916992, "step": 1900 }, { "epoch": 0.47699319473059876, "grad_norm": 0.036376953125, "learning_rate": 3.130714806868041e-06, "logits/chosen": 0.5437807440757751, "logits/rejected": 1.6498979330062866, "logps/chosen": -77.74958801269531, "logps/rejected": -1746.6015625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3441064655780792, "rewards/margins": 16.398990631103516, "rewards/rejected": -16.74309730529785, "step": 1910 }, { "epoch": 0.4794905412998689, "grad_norm": 0.0269775390625, "learning_rate": 3.1095977684344976e-06, "logits/chosen": 0.6197426319122314, "logits/rejected": 1.865501046180725, "logps/chosen": -83.05316162109375, "logps/rejected": -1912.490234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.40501928329467773, "rewards/margins": 18.013139724731445, "rewards/rejected": -18.41815757751465, "step": 1920 }, { "epoch": 0.48198788786913904, "grad_norm": 0.004058837890625, "learning_rate": 3.0884343840456874e-06, "logits/chosen": 0.5581328868865967, "logits/rejected": 1.818427324295044, "logps/chosen": -82.58245849609375, "logps/rejected": -2075.47509765625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.39019304513931274, "rewards/margins": 19.626462936401367, "rewards/rejected": -20.016658782958984, "step": 1930 }, { "epoch": 0.4844852344384092, "grad_norm": 5.0067901611328125e-06, "learning_rate": 3.0672262626925174e-06, "logits/chosen": 0.49209919571876526, "logits/rejected": 1.718467354774475, "logps/chosen": -82.48509216308594, "logps/rejected": -1921.3333740234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.35821837186813354, "rewards/margins": 18.051937103271484, "rewards/rejected": -18.410158157348633, "step": 1940 }, { "epoch": 0.4869825810076793, "grad_norm": 0.0255126953125, "learning_rate": 3.0459750167671147e-06, "logits/chosen": 0.4969088137149811, "logits/rejected": 1.7654139995574951, "logps/chosen": -79.9859390258789, "logps/rejected": -2075.444580078125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3472353518009186, "rewards/margins": 19.543991088867188, "rewards/rejected": -19.89122772216797, "step": 1950 }, { "epoch": 0.48947992757694947, "grad_norm": 0.1552734375, "learning_rate": 3.024682261940247e-06, "logits/chosen": 0.5588921904563904, "logits/rejected": 1.6852495670318604, "logps/chosen": -83.98374938964844, "logps/rejected": -1691.5599365234375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.39910078048706055, "rewards/margins": 15.737649917602539, "rewards/rejected": -16.136751174926758, "step": 1960 }, { "epoch": 0.49197727414621967, "grad_norm": 1.4185905456542969e-05, "learning_rate": 3.0033496170384803e-06, "logits/chosen": 0.6266374588012695, "logits/rejected": 1.8179903030395508, "logps/chosen": -77.69737243652344, "logps/rejected": -1697.776123046875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3449680805206299, "rewards/margins": 15.93467903137207, "rewards/rejected": -16.279645919799805, "step": 1970 }, { "epoch": 0.4944746207154898, "grad_norm": 0.0458984375, "learning_rate": 2.9819787039211068e-06, "logits/chosen": 0.5513324737548828, "logits/rejected": 1.6900783777236938, "logps/chosen": -76.17434692382812, "logps/rejected": -1829.5364990234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.31986111402511597, "rewards/margins": 17.185441970825195, "rewards/rejected": -17.50530242919922, "step": 1980 }, { "epoch": 0.49697196728475995, "grad_norm": 0.002777099609375, "learning_rate": 2.960571147356845e-06, "logits/chosen": 0.5562096834182739, "logits/rejected": 1.8595008850097656, "logps/chosen": -83.13392639160156, "logps/rejected": -2010.169921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.395429790019989, "rewards/margins": 18.9808292388916, "rewards/rejected": -19.37626075744629, "step": 1990 }, { "epoch": 0.4994693138540301, "grad_norm": 0.000293731689453125, "learning_rate": 2.9391285749003046e-06, "logits/chosen": 0.5312787294387817, "logits/rejected": 1.7356303930282593, "logps/chosen": -95.15312194824219, "logps/rejected": -2160.408935546875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5149081349372864, "rewards/margins": 20.321773529052734, "rewards/rejected": -20.836681365966797, "step": 2000 }, { "epoch": 0.4994693138540301, "eval_logits/chosen": 0.6455119848251343, "eval_logits/rejected": 1.5546293258666992, "eval_logps/chosen": -82.33647155761719, "eval_logps/rejected": -980.0130615234375, "eval_loss": 0.0030529608484357595, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.37980592250823975, "eval_rewards/margins": 8.809426307678223, "eval_rewards/rejected": -9.189233779907227, "eval_runtime": 0.6247, "eval_samples_per_second": 8.004, "eval_steps_per_second": 8.004, "step": 2000 }, { "epoch": 0.5019666604233003, "grad_norm": 0.06494140625, "learning_rate": 2.9176526167682543e-06, "logits/chosen": 0.6404844522476196, "logits/rejected": 1.8602796792984009, "logps/chosen": -82.97758483886719, "logps/rejected": -1836.1568603515625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.4049321115016937, "rewards/margins": 17.25887107849121, "rewards/rejected": -17.663803100585938, "step": 2010 }, { "epoch": 0.5044640069925704, "grad_norm": 0.0166015625, "learning_rate": 2.8961449057156775e-06, "logits/chosen": 0.5347205400466919, "logits/rejected": 1.700486421585083, "logps/chosen": -84.12736511230469, "logps/rejected": -1874.214111328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.3977189064025879, "rewards/margins": 17.567378997802734, "rewards/rejected": -17.965099334716797, "step": 2020 }, { "epoch": 0.5069613535618406, "grad_norm": 0.0013427734375, "learning_rate": 2.874607076911642e-06, "logits/chosen": 0.5987354516983032, "logits/rejected": 1.7991135120391846, "logps/chosen": -81.83995819091797, "logps/rejected": -1828.4136962890625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.38373640179634094, "rewards/margins": 17.181564331054688, "rewards/rejected": -17.5653018951416, "step": 2030 }, { "epoch": 0.5094587001311107, "grad_norm": 0.000850677490234375, "learning_rate": 2.8530407678149806e-06, "logits/chosen": 0.6027461886405945, "logits/rejected": 1.730103850364685, "logps/chosen": -81.79703521728516, "logps/rejected": -1646.1201171875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.37951746582984924, "rewards/margins": 15.314178466796875, "rewards/rejected": -15.693696975708008, "step": 2040 }, { "epoch": 0.5119560467003809, "grad_norm": 0.00055694580078125, "learning_rate": 2.8314476180498003e-06, "logits/chosen": 0.6401151418685913, "logits/rejected": 1.7924197912216187, "logps/chosen": -85.15666198730469, "logps/rejected": -1746.1314697265625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.421166330575943, "rewards/margins": 16.335796356201172, "rewards/rejected": -16.756961822509766, "step": 2050 }, { "epoch": 0.514453393269651, "grad_norm": 0.032958984375, "learning_rate": 2.8098292692808253e-06, "logits/chosen": 0.6529192328453064, "logits/rejected": 1.7113368511199951, "logps/chosen": -83.26264953613281, "logps/rejected": -1448.375732421875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3913651704788208, "rewards/margins": 13.431114196777344, "rewards/rejected": -13.822479248046875, "step": 2060 }, { "epoch": 0.5169507398389211, "grad_norm": 0.035400390625, "learning_rate": 2.7881873650885904e-06, "logits/chosen": 0.6235641241073608, "logits/rejected": 1.7722196578979492, "logps/chosen": -85.73550415039062, "logps/rejected": -1683.2877197265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.42536306381225586, "rewards/margins": 15.712361335754395, "rewards/rejected": -16.13772201538086, "step": 2070 }, { "epoch": 0.5194480864081913, "grad_norm": 0.07470703125, "learning_rate": 2.7665235508444772e-06, "logits/chosen": 0.5478901267051697, "logits/rejected": 1.8091357946395874, "logps/chosen": -79.84373474121094, "logps/rejected": -1996.998779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.368811696767807, "rewards/margins": 18.87316131591797, "rewards/rejected": -19.241975784301758, "step": 2080 }, { "epoch": 0.5219454329774614, "grad_norm": 0.000518798828125, "learning_rate": 2.7448394735856275e-06, "logits/chosen": 0.5016141533851624, "logits/rejected": 1.7318010330200195, "logps/chosen": -88.0289306640625, "logps/rejected": -2137.91552734375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.43785446882247925, "rewards/margins": 20.17669105529785, "rewards/rejected": -20.614543914794922, "step": 2090 }, { "epoch": 0.5244427795467316, "grad_norm": 0.041748046875, "learning_rate": 2.723136781889722e-06, "logits/chosen": 0.6009372472763062, "logits/rejected": 1.8194977045059204, "logps/chosen": -82.27381896972656, "logps/rejected": -1805.30859375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.38621821999549866, "rewards/margins": 16.96898651123047, "rewards/rejected": -17.355205535888672, "step": 2100 }, { "epoch": 0.5269401261160017, "grad_norm": 0.0242919921875, "learning_rate": 2.7014171257496414e-06, "logits/chosen": 0.5697668790817261, "logits/rejected": 1.7131723165512085, "logps/chosen": -84.2120132446289, "logps/rejected": -1700.6185302734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.3910997211933136, "rewards/margins": 15.752738952636719, "rewards/rejected": -16.14383888244629, "step": 2110 }, { "epoch": 0.5294374726852719, "grad_norm": 0.052978515625, "learning_rate": 2.6796821564480237e-06, "logits/chosen": 0.601753294467926, "logits/rejected": 1.729688048362732, "logps/chosen": -76.5484390258789, "logps/rejected": -1640.7828369140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.32075661420822144, "rewards/margins": 15.291044235229492, "rewards/rejected": -15.611801147460938, "step": 2120 }, { "epoch": 0.531934819254542, "grad_norm": 0.0012664794921875, "learning_rate": 2.6579335264317253e-06, "logits/chosen": 0.5816048383712769, "logits/rejected": 1.7906360626220703, "logps/chosen": -85.20054626464844, "logps/rejected": -1883.8203125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.40961331129074097, "rewards/margins": 17.597332000732422, "rewards/rejected": -18.006946563720703, "step": 2130 }, { "epoch": 0.5344321658238121, "grad_norm": 0.00101470947265625, "learning_rate": 2.6361728891861843e-06, "logits/chosen": 0.5752017498016357, "logits/rejected": 1.6957403421401978, "logps/chosen": -86.33828735351562, "logps/rejected": -1814.455322265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4084859788417816, "rewards/margins": 16.83043670654297, "rewards/rejected": -17.238922119140625, "step": 2140 }, { "epoch": 0.5369295123930824, "grad_norm": 0.009521484375, "learning_rate": 2.614401899109716e-06, "logits/chosen": 0.5796340703964233, "logits/rejected": 1.7896589040756226, "logps/chosen": -78.46866607666016, "logps/rejected": -1804.115234375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3526671528816223, "rewards/margins": 16.97123908996582, "rewards/rejected": -17.32390785217285, "step": 2150 }, { "epoch": 0.5394268589623525, "grad_norm": 0.0263671875, "learning_rate": 2.5926222113877282e-06, "logits/chosen": 0.5532792806625366, "logits/rejected": 1.7575023174285889, "logps/chosen": -86.9544906616211, "logps/rejected": -1865.7669677734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.4337650239467621, "rewards/margins": 17.272930145263672, "rewards/rejected": -17.706693649291992, "step": 2160 }, { "epoch": 0.5419242055316227, "grad_norm": 0.0419921875, "learning_rate": 2.570835481866889e-06, "logits/chosen": 0.6227487921714783, "logits/rejected": 1.7569319009780884, "logps/chosen": -83.56333923339844, "logps/rejected": -1739.373046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4034454822540283, "rewards/margins": 16.28359031677246, "rewards/rejected": -16.687036514282227, "step": 2170 }, { "epoch": 0.5444215521008928, "grad_norm": 0.0240478515625, "learning_rate": 2.5490433669292337e-06, "logits/chosen": 0.5318483114242554, "logits/rejected": 1.7802917957305908, "logps/chosen": -83.16242218017578, "logps/rejected": -2065.672119140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3906503915786743, "rewards/margins": 19.537628173828125, "rewards/rejected": -19.92827796936035, "step": 2180 }, { "epoch": 0.546918898670163, "grad_norm": 0.00081634521484375, "learning_rate": 2.527247523366232e-06, "logits/chosen": 0.55711829662323, "logits/rejected": 1.7834659814834595, "logps/chosen": -89.28174591064453, "logps/rejected": -1952.4468994140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.45531249046325684, "rewards/margins": 18.357410430908203, "rewards/rejected": -18.812725067138672, "step": 2190 }, { "epoch": 0.5494162452394331, "grad_norm": 0.0498046875, "learning_rate": 2.5054496082528336e-06, "logits/chosen": 0.6078628897666931, "logits/rejected": 1.8645546436309814, "logps/chosen": -78.72160339355469, "logps/rejected": -1901.3984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3558036983013153, "rewards/margins": 17.979564666748047, "rewards/rejected": -18.335365295410156, "step": 2200 }, { "epoch": 0.5519135918087033, "grad_norm": 0.030029296875, "learning_rate": 2.483651278821481e-06, "logits/chosen": 0.6357477903366089, "logits/rejected": 1.8168609142303467, "logps/chosen": -86.10234069824219, "logps/rejected": -1748.644775390625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.43243059515953064, "rewards/margins": 16.351675033569336, "rewards/rejected": -16.78410530090332, "step": 2210 }, { "epoch": 0.5544109383779734, "grad_norm": 0.01177978515625, "learning_rate": 2.4618541923361166e-06, "logits/chosen": 0.6292850971221924, "logits/rejected": 1.7243268489837646, "logps/chosen": -83.60933685302734, "logps/rejected": -1522.750244140625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.39073851704597473, "rewards/margins": 14.038189888000488, "rewards/rejected": -14.428926467895508, "step": 2220 }, { "epoch": 0.5569082849472435, "grad_norm": 0.00070953369140625, "learning_rate": 2.4400600059661836e-06, "logits/chosen": 0.5282065868377686, "logits/rejected": 1.8849893808364868, "logps/chosen": -86.94523620605469, "logps/rejected": -2129.426025390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.42179185152053833, "rewards/margins": 20.111438751220703, "rewards/rejected": -20.53322982788086, "step": 2230 }, { "epoch": 0.5594056315165137, "grad_norm": 0.008544921875, "learning_rate": 2.41827037666064e-06, "logits/chosen": 0.6786268353462219, "logits/rejected": 1.8249973058700562, "logps/chosen": -76.79341888427734, "logps/rejected": -1632.072265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.33309558033943176, "rewards/margins": 15.309873580932617, "rewards/rejected": -15.642970085144043, "step": 2240 }, { "epoch": 0.5619029780857838, "grad_norm": 0.018798828125, "learning_rate": 2.396486961021983e-06, "logits/chosen": 0.617296576499939, "logits/rejected": 1.860708236694336, "logps/chosen": -89.55140686035156, "logps/rejected": -1905.1920166015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4716108441352844, "rewards/margins": 17.87620735168457, "rewards/rejected": -18.347820281982422, "step": 2250 }, { "epoch": 0.564400324655054, "grad_norm": 0.00726318359375, "learning_rate": 2.3747114151802993e-06, "logits/chosen": 0.6001085638999939, "logits/rejected": 1.8411632776260376, "logps/chosen": -79.05329895019531, "logps/rejected": -1769.6693115234375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3584744334220886, "rewards/margins": 16.644119262695312, "rewards/rejected": -17.002593994140625, "step": 2260 }, { "epoch": 0.5668976712243241, "grad_norm": 0.038330078125, "learning_rate": 2.352945394667363e-06, "logits/chosen": 0.5482415556907654, "logits/rejected": 1.7739299535751343, "logps/chosen": -88.25926971435547, "logps/rejected": -2113.56884765625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.43419378995895386, "rewards/margins": 19.882293701171875, "rewards/rejected": -20.31648826599121, "step": 2270 }, { "epoch": 0.5693950177935944, "grad_norm": 0.06494140625, "learning_rate": 2.3311905542907627e-06, "logits/chosen": 0.6261372566223145, "logits/rejected": 1.787941336631775, "logps/chosen": -80.4935302734375, "logps/rejected": -1684.802001953125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.378140389919281, "rewards/margins": 15.795267105102539, "rewards/rejected": -16.17340660095215, "step": 2280 }, { "epoch": 0.5718923643628645, "grad_norm": 0.033935546875, "learning_rate": 2.30944854800809e-06, "logits/chosen": 0.6286464929580688, "logits/rejected": 1.8051517009735107, "logps/chosen": -80.61420440673828, "logps/rejected": -1804.312744140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.37235134840011597, "rewards/margins": 16.959814071655273, "rewards/rejected": -17.332164764404297, "step": 2290 }, { "epoch": 0.5743897109321346, "grad_norm": 0.0019683837890625, "learning_rate": 2.287721028801204e-06, "logits/chosen": 0.5823894739151001, "logits/rejected": 1.7553184032440186, "logps/chosen": -89.6335678100586, "logps/rejected": -1704.480712890625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.45537322759628296, "rewards/margins": 15.833767890930176, "rewards/rejected": -16.289142608642578, "step": 2300 }, { "epoch": 0.5768870575014048, "grad_norm": 0.021728515625, "learning_rate": 2.26600964855055e-06, "logits/chosen": 0.6238933205604553, "logits/rejected": 1.7979097366333008, "logps/chosen": -79.57666778564453, "logps/rejected": -1692.2340087890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3568623960018158, "rewards/margins": 15.8856201171875, "rewards/rejected": -16.242483139038086, "step": 2310 }, { "epoch": 0.5793844040706749, "grad_norm": 0.007476806640625, "learning_rate": 2.244316057909573e-06, "logits/chosen": 0.6190879344940186, "logits/rejected": 1.7797908782958984, "logps/chosen": -86.66450500488281, "logps/rejected": -1799.299560546875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4307888448238373, "rewards/margins": 16.876020431518555, "rewards/rejected": -17.30681037902832, "step": 2320 }, { "epoch": 0.5818817506399451, "grad_norm": 0.005279541015625, "learning_rate": 2.2226419061792282e-06, "logits/chosen": 0.5849915742874146, "logits/rejected": 1.8162180185317993, "logps/chosen": -85.55912017822266, "logps/rejected": -1866.6416015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.42593854665756226, "rewards/margins": 17.541982650756836, "rewards/rejected": -17.96792221069336, "step": 2330 }, { "epoch": 0.5843790972092152, "grad_norm": 0.002044677734375, "learning_rate": 2.200988841182589e-06, "logits/chosen": 0.6237704157829285, "logits/rejected": 1.8964016437530518, "logps/chosen": -95.87744140625, "logps/rejected": -2077.869140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5327093005180359, "rewards/margins": 19.543682098388672, "rewards/rejected": -20.076391220092773, "step": 2340 }, { "epoch": 0.5868764437784854, "grad_norm": 0.00469970703125, "learning_rate": 2.179358509139559e-06, "logits/chosen": 0.6188510060310364, "logits/rejected": 1.7551387548446655, "logps/chosen": -82.06452941894531, "logps/rejected": -1564.6396484375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.359652578830719, "rewards/margins": 14.480381965637207, "rewards/rejected": -14.840034484863281, "step": 2350 }, { "epoch": 0.5893737903477555, "grad_norm": 0.05322265625, "learning_rate": 2.1577525545417254e-06, "logits/chosen": 0.642662525177002, "logits/rejected": 1.8487951755523682, "logps/chosen": -85.49614715576172, "logps/rejected": -1861.0296630859375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.41231250762939453, "rewards/margins": 17.494285583496094, "rewards/rejected": -17.906597137451172, "step": 2360 }, { "epoch": 0.5918711369170256, "grad_norm": 0.059326171875, "learning_rate": 2.1361726200273293e-06, "logits/chosen": 0.6013755202293396, "logits/rejected": 1.8614768981933594, "logps/chosen": -82.80476379394531, "logps/rejected": -1880.697021484375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.38074877858161926, "rewards/margins": 17.66275978088379, "rewards/rejected": -18.043506622314453, "step": 2370 }, { "epoch": 0.5943684834862958, "grad_norm": 0.04638671875, "learning_rate": 2.1146203462563773e-06, "logits/chosen": 0.6672108769416809, "logits/rejected": 1.8760160207748413, "logps/chosen": -85.3284912109375, "logps/rejected": -1663.7691650390625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.42937812209129333, "rewards/margins": 15.528106689453125, "rewards/rejected": -15.957483291625977, "step": 2380 }, { "epoch": 0.5968658300555659, "grad_norm": 0.037109375, "learning_rate": 2.0930973717859117e-06, "logits/chosen": 0.5693127512931824, "logits/rejected": 1.8059355020523071, "logps/chosen": -86.84693145751953, "logps/rejected": -1841.8323974609375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.4406910836696625, "rewards/margins": 17.286680221557617, "rewards/rejected": -17.727371215820312, "step": 2390 }, { "epoch": 0.5993631766248361, "grad_norm": 4.8160552978515625e-05, "learning_rate": 2.0716053329454337e-06, "logits/chosen": 0.633589506149292, "logits/rejected": 1.8425014019012451, "logps/chosen": -84.12596130371094, "logps/rejected": -1987.7154541015625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.40977373719215393, "rewards/margins": 18.74795913696289, "rewards/rejected": -19.157733917236328, "step": 2400 }, { "epoch": 0.6018605231941062, "grad_norm": 0.020751953125, "learning_rate": 2.0501458637124963e-06, "logits/chosen": 0.6005128026008606, "logits/rejected": 1.9649635553359985, "logps/chosen": -89.92936706542969, "logps/rejected": -2303.25341796875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.46929341554641724, "rewards/margins": 21.84885597229004, "rewards/rejected": -22.31814956665039, "step": 2410 }, { "epoch": 0.6043578697633765, "grad_norm": 0.0059814453125, "learning_rate": 2.0287205955884812e-06, "logits/chosen": 0.5659859776496887, "logits/rejected": 1.7156604528427124, "logps/chosen": -82.71956634521484, "logps/rejected": -1716.356689453125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.37312421202659607, "rewards/margins": 15.841397285461426, "rewards/rejected": -16.214521408081055, "step": 2420 }, { "epoch": 0.6068552163326466, "grad_norm": 0.031494140625, "learning_rate": 2.0073311574745583e-06, "logits/chosen": 0.615592896938324, "logits/rejected": 1.8998111486434937, "logps/chosen": -83.0928726196289, "logps/rejected": -2058.5458984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.3958067297935486, "rewards/margins": 19.46796226501465, "rewards/rejected": -19.863767623901367, "step": 2430 }, { "epoch": 0.6093525629019167, "grad_norm": 0.0108642578125, "learning_rate": 1.9859791755478453e-06, "logits/chosen": 0.612500786781311, "logits/rejected": 1.7525627613067627, "logps/chosen": -79.78617858886719, "logps/rejected": -1609.921142578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3665863871574402, "rewards/margins": 15.052395820617676, "rewards/rejected": -15.418981552124023, "step": 2440 }, { "epoch": 0.6118499094711869, "grad_norm": 0.05419921875, "learning_rate": 1.9646662731377737e-06, "logits/chosen": 0.6692113876342773, "logits/rejected": 1.816349983215332, "logps/chosen": -84.40538787841797, "logps/rejected": -1675.842041015625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.40326443314552307, "rewards/margins": 15.622156143188477, "rewards/rejected": -16.025419235229492, "step": 2450 }, { "epoch": 0.614347256040457, "grad_norm": 0.00130462646484375, "learning_rate": 1.9433940706026743e-06, "logits/chosen": 0.5840574502944946, "logits/rejected": 1.8281028270721436, "logps/chosen": -86.7694320678711, "logps/rejected": -2069.530517578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.4303639531135559, "rewards/margins": 19.5331974029541, "rewards/rejected": -19.963563919067383, "step": 2460 }, { "epoch": 0.6168446026097272, "grad_norm": 0.00012493133544921875, "learning_rate": 1.9221641852065807e-06, "logits/chosen": 0.6755739450454712, "logits/rejected": 1.845654845237732, "logps/chosen": -88.67765045166016, "logps/rejected": -1714.5185546875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.46009987592697144, "rewards/margins": 15.997251510620117, "rewards/rejected": -16.457351684570312, "step": 2470 }, { "epoch": 0.6193419491789973, "grad_norm": 0.039794921875, "learning_rate": 1.9009782309962805e-06, "logits/chosen": 0.5677890181541443, "logits/rejected": 1.8127906322479248, "logps/chosen": -76.20381164550781, "logps/rejected": -1766.592529296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3160540759563446, "rewards/margins": 16.549949645996094, "rewards/rejected": -16.866003036499023, "step": 2480 }, { "epoch": 0.6218392957482675, "grad_norm": 0.037841796875, "learning_rate": 1.8798378186785979e-06, "logits/chosen": 0.6165963411331177, "logits/rejected": 1.7844451665878296, "logps/chosen": -80.53484344482422, "logps/rejected": -1799.0015869140625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.37462836503982544, "rewards/margins": 16.907718658447266, "rewards/rejected": -17.282346725463867, "step": 2490 }, { "epoch": 0.6243366423175376, "grad_norm": 0.01007080078125, "learning_rate": 1.8587445554979404e-06, "logits/chosen": 0.6141692399978638, "logits/rejected": 1.8811423778533936, "logps/chosen": -87.24476623535156, "logps/rejected": -2009.0035400390625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4570063054561615, "rewards/margins": 18.914798736572266, "rewards/rejected": -19.371807098388672, "step": 2500 }, { "epoch": 0.6268339888868077, "grad_norm": 0.00011968612670898438, "learning_rate": 1.8377000451141013e-06, "logits/chosen": 0.6391327977180481, "logits/rejected": 1.9281005859375, "logps/chosen": -86.39270782470703, "logps/rejected": -1954.652099609375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.4386569857597351, "rewards/margins": 18.369953155517578, "rewards/rejected": -18.808609008789062, "step": 2510 }, { "epoch": 0.6293313354560779, "grad_norm": 0.0277099609375, "learning_rate": 1.8167058874803405e-06, "logits/chosen": 0.5556064248085022, "logits/rejected": 1.718269944190979, "logps/chosen": -86.69864654541016, "logps/rejected": -1939.4332275390625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.4233566224575043, "rewards/margins": 18.154285430908203, "rewards/rejected": -18.5776424407959, "step": 2520 }, { "epoch": 0.631828682025348, "grad_norm": 0.0771484375, "learning_rate": 1.7957636787217451e-06, "logits/chosen": 0.5710119009017944, "logits/rejected": 1.7915983200073242, "logps/chosen": -79.21806335449219, "logps/rejected": -1942.8629150390625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3619151711463928, "rewards/margins": 18.352802276611328, "rewards/rejected": -18.71471405029297, "step": 2530 }, { "epoch": 0.6343260285946182, "grad_norm": 0.0155029296875, "learning_rate": 1.7748750110138768e-06, "logits/chosen": 0.5197774171829224, "logits/rejected": 1.737969160079956, "logps/chosen": -88.46708679199219, "logps/rejected": -2104.29296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.42627063393592834, "rewards/margins": 19.79252052307129, "rewards/rejected": -20.21879005432129, "step": 2540 }, { "epoch": 0.6368233751638883, "grad_norm": 0.021484375, "learning_rate": 1.7540414724617282e-06, "logits/chosen": 0.5759893655776978, "logits/rejected": 1.7303228378295898, "logps/chosen": -76.38877868652344, "logps/rejected": -1828.7796630859375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.32031676173210144, "rewards/margins": 17.18129539489746, "rewards/rejected": -17.50161361694336, "step": 2550 }, { "epoch": 0.6393207217331586, "grad_norm": 0.0196533203125, "learning_rate": 1.7332646469789827e-06, "logits/chosen": 0.6225037574768066, "logits/rejected": 1.779985785484314, "logps/chosen": -85.09439086914062, "logps/rejected": -1516.5433349609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4056459069252014, "rewards/margins": 14.090934753417969, "rewards/rejected": -14.496580123901367, "step": 2560 }, { "epoch": 0.6418180683024287, "grad_norm": 0.036376953125, "learning_rate": 1.7125461141675881e-06, "logits/chosen": 0.643700122833252, "logits/rejected": 1.8465179204940796, "logps/chosen": -80.20843505859375, "logps/rejected": -1810.455078125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3701106905937195, "rewards/margins": 16.961139678955078, "rewards/rejected": -17.331249237060547, "step": 2570 }, { "epoch": 0.6443154148716989, "grad_norm": 0.0289306640625, "learning_rate": 1.6918874491976744e-06, "logits/chosen": 0.5704053640365601, "logits/rejected": 1.6976230144500732, "logps/chosen": -84.80411529541016, "logps/rejected": -1765.397216796875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.4006038308143616, "rewards/margins": 16.490291595458984, "rewards/rejected": -16.89089584350586, "step": 2580 }, { "epoch": 0.646812761440969, "grad_norm": 0.000751495361328125, "learning_rate": 1.6712902226877917e-06, "logits/chosen": 0.6289549469947815, "logits/rejected": 1.850502610206604, "logps/chosen": -88.18513488769531, "logps/rejected": -1988.833984375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4497924745082855, "rewards/margins": 18.730804443359375, "rewards/rejected": -19.180593490600586, "step": 2590 }, { "epoch": 0.6493101080102391, "grad_norm": 0.0012664794921875, "learning_rate": 1.6507560005854977e-06, "logits/chosen": 0.5509642362594604, "logits/rejected": 1.7071406841278076, "logps/chosen": -83.9954833984375, "logps/rejected": -1830.5101318359375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3968578577041626, "rewards/margins": 17.073257446289062, "rewards/rejected": -17.470115661621094, "step": 2600 }, { "epoch": 0.6518074545795093, "grad_norm": 0.008544921875, "learning_rate": 1.6302863440483121e-06, "logits/chosen": 0.5004338026046753, "logits/rejected": 1.7150554656982422, "logps/chosen": -82.43248748779297, "logps/rejected": -1880.423828125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.36883944272994995, "rewards/margins": 17.624969482421875, "rewards/rejected": -17.99380874633789, "step": 2610 }, { "epoch": 0.6543048011487794, "grad_norm": 0.0341796875, "learning_rate": 1.6098828093250203e-06, "logits/chosen": 0.5160735845565796, "logits/rejected": 1.7385032176971436, "logps/chosen": -79.65149688720703, "logps/rejected": -2075.0966796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3529607653617859, "rewards/margins": 19.472675323486328, "rewards/rejected": -19.825634002685547, "step": 2620 }, { "epoch": 0.6568021477180496, "grad_norm": 0.027099609375, "learning_rate": 1.5895469476373545e-06, "logits/chosen": 0.5671316385269165, "logits/rejected": 1.722346305847168, "logps/chosen": -81.55863189697266, "logps/rejected": -1681.589111328125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3755113184452057, "rewards/margins": 15.65942096710205, "rewards/rejected": -16.03493309020996, "step": 2630 }, { "epoch": 0.6592994942873197, "grad_norm": 0.091796875, "learning_rate": 1.5692803050620642e-06, "logits/chosen": 0.6067586541175842, "logits/rejected": 1.7199970483779907, "logps/chosen": -83.18370056152344, "logps/rejected": -1680.830322265625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.38230761885643005, "rewards/margins": 15.612162590026855, "rewards/rejected": -15.994470596313477, "step": 2640 }, { "epoch": 0.6617968408565899, "grad_norm": 0.007720947265625, "learning_rate": 1.5490844224133717e-06, "logits/chosen": 0.6019744873046875, "logits/rejected": 1.8558555841445923, "logps/chosen": -80.30780792236328, "logps/rejected": -1929.556640625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.37255150079727173, "rewards/margins": 18.213796615600586, "rewards/rejected": -18.586347579956055, "step": 2650 }, { "epoch": 0.66429418742586, "grad_norm": 0.050537109375, "learning_rate": 1.528960835125822e-06, "logits/chosen": 0.6771095991134644, "logits/rejected": 1.8372013568878174, "logps/chosen": -80.95626068115234, "logps/rejected": -1653.2200927734375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3823738992214203, "rewards/margins": 15.478759765625, "rewards/rejected": -15.86113452911377, "step": 2660 }, { "epoch": 0.6667915339951301, "grad_norm": 0.0147705078125, "learning_rate": 1.5089110731375568e-06, "logits/chosen": 0.581728994846344, "logits/rejected": 1.7974720001220703, "logps/chosen": -79.06498718261719, "logps/rejected": -1824.8765869140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.35732579231262207, "rewards/margins": 17.178985595703125, "rewards/rejected": -17.53631019592285, "step": 2670 }, { "epoch": 0.6692888805644003, "grad_norm": 0.032470703125, "learning_rate": 1.4889366607739925e-06, "logits/chosen": 0.6092284917831421, "logits/rejected": 1.6554501056671143, "logps/chosen": -78.19693756103516, "logps/rejected": -1454.85107421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3566651940345764, "rewards/margins": 13.518289566040039, "rewards/rejected": -13.874954223632812, "step": 2680 }, { "epoch": 0.6717862271336704, "grad_norm": 0.017578125, "learning_rate": 1.4690391166319307e-06, "logits/chosen": 0.5935393571853638, "logits/rejected": 1.761783242225647, "logps/chosen": -85.10191345214844, "logps/rejected": -1834.072021484375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4155469536781311, "rewards/margins": 17.14066505432129, "rewards/rejected": -17.556209564208984, "step": 2690 }, { "epoch": 0.6742835737029407, "grad_norm": 0.036376953125, "learning_rate": 1.4492199534641055e-06, "logits/chosen": 0.6022766828536987, "logits/rejected": 1.8546326160430908, "logps/chosen": -84.67176055908203, "logps/rejected": -1870.0103759765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.41111135482788086, "rewards/margins": 17.593767166137695, "rewards/rejected": -18.0048770904541, "step": 2700 }, { "epoch": 0.6767809202722108, "grad_norm": 0.001617431640625, "learning_rate": 1.429480678064174e-06, "logits/chosen": 0.4885890483856201, "logits/rejected": 1.7658706903457642, "logps/chosen": -85.19550323486328, "logps/rejected": -2298.07373046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.4054490923881531, "rewards/margins": 21.81135368347168, "rewards/rejected": -22.2168025970459, "step": 2710 }, { "epoch": 0.679278266841481, "grad_norm": 0.021484375, "learning_rate": 1.4098227911521523e-06, "logits/chosen": 0.6109157204627991, "logits/rejected": 1.8104369640350342, "logps/chosen": -92.25324249267578, "logps/rejected": -1919.9534912109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.4862859845161438, "rewards/margins": 18.005590438842773, "rewards/rejected": -18.49187660217285, "step": 2720 }, { "epoch": 0.6817756134107511, "grad_norm": 0.0732421875, "learning_rate": 1.3902477872603295e-06, "logits/chosen": 0.6768110990524292, "logits/rejected": 1.7166074514389038, "logps/chosen": -80.58265686035156, "logps/rejected": -1554.531982421875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.36700159311294556, "rewards/margins": 14.375741958618164, "rewards/rejected": -14.742744445800781, "step": 2730 }, { "epoch": 0.6842729599800212, "grad_norm": 0.0478515625, "learning_rate": 1.370757154619638e-06, "logits/chosen": 0.5914765000343323, "logits/rejected": 1.823325753211975, "logps/chosen": -86.78914642333984, "logps/rejected": -1906.5355224609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.42601895332336426, "rewards/margins": 17.833972930908203, "rewards/rejected": -18.259990692138672, "step": 2740 }, { "epoch": 0.6867703065492914, "grad_norm": 0.005523681640625, "learning_rate": 1.3513523750465049e-06, "logits/chosen": 0.5616365075111389, "logits/rejected": 1.7267478704452515, "logps/chosen": -81.25291442871094, "logps/rejected": -1686.3486328125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.36524245142936707, "rewards/margins": 15.707315444946289, "rewards/rejected": -16.07255744934082, "step": 2750 }, { "epoch": 0.6892676531185615, "grad_norm": 0.00193023681640625, "learning_rate": 1.332034923830199e-06, "logits/chosen": 0.5695074200630188, "logits/rejected": 1.8329308032989502, "logps/chosen": -82.82709503173828, "logps/rejected": -1805.4775390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.39727023243904114, "rewards/margins": 16.94883918762207, "rewards/rejected": -17.346107482910156, "step": 2760 }, { "epoch": 0.6917649996878317, "grad_norm": 0.040771484375, "learning_rate": 1.31280626962067e-06, "logits/chosen": 0.6029590368270874, "logits/rejected": 1.6939897537231445, "logps/chosen": -86.7250747680664, "logps/rejected": -1587.8193359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.42630377411842346, "rewards/margins": 14.680148124694824, "rewards/rejected": -15.106452941894531, "step": 2770 }, { "epoch": 0.6942623462571018, "grad_norm": 0.0003566741943359375, "learning_rate": 1.2936678743168813e-06, "logits/chosen": 0.5795254707336426, "logits/rejected": 1.7682584524154663, "logps/chosen": -83.47227478027344, "logps/rejected": -1894.138916015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.39181336760520935, "rewards/margins": 17.82851791381836, "rewards/rejected": -18.22032928466797, "step": 2780 }, { "epoch": 0.696759692826372, "grad_norm": 0.01373291015625, "learning_rate": 1.2746211929556777e-06, "logits/chosen": 0.5124091506004333, "logits/rejected": 2.0397300720214844, "logps/chosen": -85.71356201171875, "logps/rejected": -2490.38232421875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.42116060853004456, "rewards/margins": 23.745744705200195, "rewards/rejected": -24.16690444946289, "step": 2790 }, { "epoch": 0.6992570393956421, "grad_norm": 1.30385160446167e-08, "learning_rate": 1.2556676736011558e-06, "logits/chosen": 0.6134932637214661, "logits/rejected": 1.816425085067749, "logps/chosen": -85.68560791015625, "logps/rejected": -1998.1363525390625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.43286705017089844, "rewards/margins": 18.802690505981445, "rewards/rejected": -19.235559463500977, "step": 2800 }, { "epoch": 0.7017543859649122, "grad_norm": 0.0089111328125, "learning_rate": 1.2368087572345772e-06, "logits/chosen": 0.6667296886444092, "logits/rejected": 1.7410768270492554, "logps/chosen": -84.29058837890625, "logps/rejected": -1482.312255859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4156969487667084, "rewards/margins": 13.738250732421875, "rewards/rejected": -14.153947830200195, "step": 2810 }, { "epoch": 0.7042517325341824, "grad_norm": 0.0927734375, "learning_rate": 1.2180458776448067e-06, "logits/chosen": 0.5982272028923035, "logits/rejected": 1.7856439352035522, "logps/chosen": -89.98011016845703, "logps/rejected": -1943.0396728515625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4640469551086426, "rewards/margins": 18.235332489013672, "rewards/rejected": -18.699377059936523, "step": 2820 }, { "epoch": 0.7067490791034525, "grad_norm": 0.02392578125, "learning_rate": 1.1993804613193158e-06, "logits/chosen": 0.6234604120254517, "logits/rejected": 1.765428900718689, "logps/chosen": -87.09599304199219, "logps/rejected": -1579.3837890625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.427617609500885, "rewards/margins": 14.572957038879395, "rewards/rejected": -15.000572204589844, "step": 2830 }, { "epoch": 0.7092464256727228, "grad_norm": 3.7401914596557617e-06, "learning_rate": 1.1808139273357232e-06, "logits/chosen": 0.5544342398643494, "logits/rejected": 1.7727603912353516, "logps/chosen": -83.8676528930664, "logps/rejected": -1906.515380859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.3836463987827301, "rewards/margins": 17.823671340942383, "rewards/rejected": -18.207317352294922, "step": 2840 }, { "epoch": 0.7117437722419929, "grad_norm": 5.5789947509765625e-05, "learning_rate": 1.1623476872539108e-06, "logits/chosen": 0.5153034925460815, "logits/rejected": 1.8462998867034912, "logps/chosen": -94.67253112792969, "logps/rejected": -2197.599609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.5202997326850891, "rewards/margins": 20.727157592773438, "rewards/rejected": -21.247455596923828, "step": 2850 }, { "epoch": 0.7142411188112631, "grad_norm": 0.042724609375, "learning_rate": 1.1439831450087032e-06, "logits/chosen": 0.580392062664032, "logits/rejected": 1.876275658607483, "logps/chosen": -87.93695068359375, "logps/rejected": -2129.860107421875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4418310225009918, "rewards/margins": 20.121156692504883, "rewards/rejected": -20.56298828125, "step": 2860 }, { "epoch": 0.7167384653805332, "grad_norm": 0.0458984375, "learning_rate": 1.1257216968031357e-06, "logits/chosen": 0.6597843170166016, "logits/rejected": 1.8998768329620361, "logps/chosen": -80.04630279541016, "logps/rejected": -1752.5791015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.37010782957077026, "rewards/margins": 16.46237564086914, "rewards/rejected": -16.832483291625977, "step": 2870 }, { "epoch": 0.7192358119498033, "grad_norm": 0.00104522705078125, "learning_rate": 1.1075647310022974e-06, "logits/chosen": 0.634041965007782, "logits/rejected": 1.8106848001480103, "logps/chosen": -78.93439483642578, "logps/rejected": -1525.8900146484375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.36263298988342285, "rewards/margins": 14.232080459594727, "rewards/rejected": -14.594714164733887, "step": 2880 }, { "epoch": 0.7217331585190735, "grad_norm": 0.002899169921875, "learning_rate": 1.0895136280277863e-06, "logits/chosen": 0.5515082478523254, "logits/rejected": 1.7851063013076782, "logps/chosen": -87.03413391113281, "logps/rejected": -2093.734375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4326532781124115, "rewards/margins": 19.687620162963867, "rewards/rejected": -20.120275497436523, "step": 2890 }, { "epoch": 0.7242305050883436, "grad_norm": 0.060546875, "learning_rate": 1.0715697602527542e-06, "logits/chosen": 0.5289216041564941, "logits/rejected": 1.7743902206420898, "logps/chosen": -85.22486114501953, "logps/rejected": -1992.9351806640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4111485481262207, "rewards/margins": 18.597354888916016, "rewards/rejected": -19.00850486755371, "step": 2900 }, { "epoch": 0.7267278516576138, "grad_norm": 0.04248046875, "learning_rate": 1.0537344918975708e-06, "logits/chosen": 0.654784083366394, "logits/rejected": 1.7333883047103882, "logps/chosen": -85.55310821533203, "logps/rejected": -1545.492919921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.4073718190193176, "rewards/margins": 14.238784790039062, "rewards/rejected": -14.646156311035156, "step": 2910 }, { "epoch": 0.7292251982268839, "grad_norm": 1.5079975128173828e-05, "learning_rate": 1.036009178926107e-06, "logits/chosen": 0.570530891418457, "logits/rejected": 1.8232314586639404, "logps/chosen": -87.81031799316406, "logps/rejected": -1891.7252197265625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.44701051712036133, "rewards/margins": 17.766794204711914, "rewards/rejected": -18.213804244995117, "step": 2920 }, { "epoch": 0.7317225447961541, "grad_norm": 0.016357421875, "learning_rate": 1.0183951689426438e-06, "logits/chosen": 0.5162047147750854, "logits/rejected": 1.80562424659729, "logps/chosen": -78.40940856933594, "logps/rejected": -2212.948974609375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.33928531408309937, "rewards/margins": 21.02674674987793, "rewards/rejected": -21.366031646728516, "step": 2930 }, { "epoch": 0.7342198913654242, "grad_norm": 0.01409912109375, "learning_rate": 1.0008938010894156e-06, "logits/chosen": 0.5077947974205017, "logits/rejected": 1.8344638347625732, "logps/chosen": -81.39566802978516, "logps/rejected": -2270.706298828125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3777496814727783, "rewards/margins": 21.60778045654297, "rewards/rejected": -21.98552894592285, "step": 2940 }, { "epoch": 0.7367172379346943, "grad_norm": 0.0198974609375, "learning_rate": 9.83506405944804e-07, "logits/chosen": 0.5673650503158569, "logits/rejected": 1.745111107826233, "logps/chosen": -77.10914611816406, "logps/rejected": -1838.6285400390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.34020406007766724, "rewards/margins": 17.217037200927734, "rewards/rejected": -17.55724334716797, "step": 2950 }, { "epoch": 0.7392145845039645, "grad_norm": 0.00946044921875, "learning_rate": 9.662343054221743e-07, "logits/chosen": 0.5164293050765991, "logits/rejected": 1.726947546005249, "logps/chosen": -88.59376525878906, "logps/rejected": -2064.08642578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.4376547336578369, "rewards/margins": 19.29999351501465, "rewards/rejected": -19.73764991760254, "step": 2960 }, { "epoch": 0.7417119310732347, "grad_norm": 0.0595703125, "learning_rate": 9.490788126693754e-07, "logits/chosen": 0.6247397661209106, "logits/rejected": 1.8680105209350586, "logps/chosen": -86.50829315185547, "logps/rejected": -1925.3023681640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4314423203468323, "rewards/margins": 17.977245330810547, "rewards/rejected": -18.408687591552734, "step": 2970 }, { "epoch": 0.7442092776425049, "grad_norm": 0.0030975341796875, "learning_rate": 9.32041231968904e-07, "logits/chosen": 0.582064151763916, "logits/rejected": 1.8307263851165771, "logps/chosen": -87.89469909667969, "logps/rejected": -2049.106689453125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.4329482614994049, "rewards/margins": 19.329082489013672, "rewards/rejected": -19.762033462524414, "step": 2980 }, { "epoch": 0.746706624211775, "grad_norm": 0.328125, "learning_rate": 9.151228586387464e-07, "logits/chosen": 0.6141242384910583, "logits/rejected": 1.747831106185913, "logps/chosen": -80.1594467163086, "logps/rejected": -1789.0576171875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.36252784729003906, "rewards/margins": 16.812475204467773, "rewards/rejected": -17.175004959106445, "step": 2990 }, { "epoch": 0.7492039707810452, "grad_norm": 0.06494140625, "learning_rate": 8.983249789338941e-07, "logits/chosen": 0.6428495645523071, "logits/rejected": 1.7919700145721436, "logps/chosen": -82.71188354492188, "logps/rejected": -1678.5035400390625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.3992861807346344, "rewards/margins": 15.69371509552002, "rewards/rejected": -16.092998504638672, "step": 3000 }, { "epoch": 0.7492039707810452, "eval_logits/chosen": 0.656849205493927, "eval_logits/rejected": 1.5703133344650269, "eval_logps/chosen": -84.02084350585938, "eval_logps/rejected": -995.490234375, "eval_loss": 0.0028192740865051746, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.39664965867996216, "eval_rewards/margins": 8.947355270385742, "eval_rewards/rejected": -9.34400463104248, "eval_runtime": 0.621, "eval_samples_per_second": 8.052, "eval_steps_per_second": 8.052, "step": 3000 }, { "epoch": 0.7517013173503153, "grad_norm": 0.0205078125, "learning_rate": 8.816488699485593e-07, "logits/chosen": 0.634880006313324, "logits/rejected": 1.8458068370819092, "logps/chosen": -89.79926300048828, "logps/rejected": -1852.7974853515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4646337628364563, "rewards/margins": 17.353429794311523, "rewards/rejected": -17.81806182861328, "step": 3010 }, { "epoch": 0.7541986639195855, "grad_norm": 4.553794860839844e-05, "learning_rate": 8.650957995190784e-07, "logits/chosen": 0.5151122212409973, "logits/rejected": 1.7481235265731812, "logps/chosen": -79.8306884765625, "logps/rejected": -2110.71923828125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3657647669315338, "rewards/margins": 20.01117706298828, "rewards/rejected": -20.37693977355957, "step": 3020 }, { "epoch": 0.7566960104888556, "grad_norm": 0.045166015625, "learning_rate": 8.486670261275193e-07, "logits/chosen": 0.6202859878540039, "logits/rejected": 1.8134170770645142, "logps/chosen": -84.73997497558594, "logps/rejected": -1783.783203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.42085719108581543, "rewards/margins": 16.739408493041992, "rewards/rejected": -17.160266876220703, "step": 3030 }, { "epoch": 0.7591933570581257, "grad_norm": 0.0111083984375, "learning_rate": 8.32363798806011e-07, "logits/chosen": 0.5721080303192139, "logits/rejected": 1.739031195640564, "logps/chosen": -85.13414764404297, "logps/rejected": -1891.197509765625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.42727169394493103, "rewards/margins": 17.766677856445312, "rewards/rejected": -18.193950653076172, "step": 3040 }, { "epoch": 0.7616907036273959, "grad_norm": 0.0208740234375, "learning_rate": 8.161873570417742e-07, "logits/chosen": 0.5966504812240601, "logits/rejected": 1.8666013479232788, "logps/chosen": -89.76878356933594, "logps/rejected": -1943.291015625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.4609198570251465, "rewards/margins": 18.261484146118164, "rewards/rejected": -18.722402572631836, "step": 3050 }, { "epoch": 0.764188050196666, "grad_norm": 0.0098876953125, "learning_rate": 8.001389306828897e-07, "logits/chosen": 0.4526674151420593, "logits/rejected": 1.7495372295379639, "logps/chosen": -82.58873748779297, "logps/rejected": -2175.00244140625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694322407245636, "rewards/margins": 20.39395523071289, "rewards/rejected": -20.763385772705078, "step": 3060 }, { "epoch": 0.7666853967659362, "grad_norm": 1.7508864402770996e-06, "learning_rate": 7.842197398447993e-07, "logits/chosen": 0.5828143358230591, "logits/rejected": 1.8392832279205322, "logps/chosen": -79.72120666503906, "logps/rejected": -1959.4478759765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3589129149913788, "rewards/margins": 18.44549560546875, "rewards/rejected": -18.804407119750977, "step": 3070 }, { "epoch": 0.7691827433352063, "grad_norm": 0.00104522705078125, "learning_rate": 7.684309948175414e-07, "logits/chosen": 0.5747276544570923, "logits/rejected": 1.7614377737045288, "logps/chosen": -83.3620834350586, "logps/rejected": -1877.7086181640625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.4084179401397705, "rewards/margins": 17.644577026367188, "rewards/rejected": -18.052997589111328, "step": 3080 }, { "epoch": 0.7716800899044765, "grad_norm": 0.00160980224609375, "learning_rate": 7.527738959737371e-07, "logits/chosen": 0.536163330078125, "logits/rejected": 1.8368165493011475, "logps/chosen": -81.3559799194336, "logps/rejected": -1827.6383056640625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3685862720012665, "rewards/margins": 17.127788543701172, "rewards/rejected": -17.496374130249023, "step": 3090 }, { "epoch": 0.7741774364737466, "grad_norm": 0.061279296875, "learning_rate": 7.372496336773269e-07, "logits/chosen": 0.6259430050849915, "logits/rejected": 1.7605386972427368, "logps/chosen": -82.03521728515625, "logps/rejected": -1697.513916015625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.3883189260959625, "rewards/margins": 15.871994018554688, "rewards/rejected": -16.260311126708984, "step": 3100 }, { "epoch": 0.7766747830430168, "grad_norm": 0.0439453125, "learning_rate": 7.218593881930744e-07, "logits/chosen": 0.6127210259437561, "logits/rejected": 1.7632982730865479, "logps/chosen": -77.52657318115234, "logps/rejected": -1801.5595703125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3411308526992798, "rewards/margins": 16.98233985900879, "rewards/rejected": -17.323471069335938, "step": 3110 }, { "epoch": 0.779172129612287, "grad_norm": 0.017333984375, "learning_rate": 7.066043295968342e-07, "logits/chosen": 0.5858328938484192, "logits/rejected": 1.7214057445526123, "logps/chosen": -82.28968048095703, "logps/rejected": -1686.96875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.377600759267807, "rewards/margins": 15.575152397155762, "rewards/rejected": -15.952753067016602, "step": 3120 }, { "epoch": 0.7816694761815571, "grad_norm": 0.00028228759765625, "learning_rate": 6.914856176865891e-07, "logits/chosen": 0.5658802390098572, "logits/rejected": 1.7670371532440186, "logps/chosen": -78.00981140136719, "logps/rejected": -1716.184814453125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.34367048740386963, "rewards/margins": 16.066198348999023, "rewards/rejected": -16.409870147705078, "step": 3130 }, { "epoch": 0.7841668227508273, "grad_norm": 6.198883056640625e-05, "learning_rate": 6.765044018942804e-07, "logits/chosen": 0.6243360042572021, "logits/rejected": 1.8233163356781006, "logps/chosen": -77.52008819580078, "logps/rejected": -1712.4619140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3422839641571045, "rewards/margins": 16.079269409179688, "rewards/rejected": -16.421554565429688, "step": 3140 }, { "epoch": 0.7866641693200974, "grad_norm": 0.134765625, "learning_rate": 6.616618211984169e-07, "logits/chosen": 0.617003321647644, "logits/rejected": 1.855446219444275, "logps/chosen": -81.59101104736328, "logps/rejected": -1846.211669921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.37706810235977173, "rewards/margins": 17.38241958618164, "rewards/rejected": -17.75948715209961, "step": 3150 }, { "epoch": 0.7891615158893676, "grad_norm": 0.04833984375, "learning_rate": 6.469590040374799e-07, "logits/chosen": 0.5514385104179382, "logits/rejected": 1.7037725448608398, "logps/chosen": -92.5042724609375, "logps/rejected": -1909.711181640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4816574156284332, "rewards/margins": 17.822921752929688, "rewards/rejected": -18.304576873779297, "step": 3160 }, { "epoch": 0.7916588624586377, "grad_norm": 0.03173828125, "learning_rate": 6.32397068224136e-07, "logits/chosen": 0.528624415397644, "logits/rejected": 1.6811710596084595, "logps/chosen": -89.05570983886719, "logps/rejected": -1862.1185302734375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.44779324531555176, "rewards/margins": 17.432941436767578, "rewards/rejected": -17.880733489990234, "step": 3170 }, { "epoch": 0.7941562090279078, "grad_norm": 0.04052734375, "learning_rate": 6.17977120860249e-07, "logits/chosen": 0.5938631296157837, "logits/rejected": 1.7992160320281982, "logps/chosen": -80.42100524902344, "logps/rejected": -1828.5582275390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.37270471453666687, "rewards/margins": 17.214147567749023, "rewards/rejected": -17.586851119995117, "step": 3180 }, { "epoch": 0.796653555597178, "grad_norm": 0.07177734375, "learning_rate": 6.037002582527121e-07, "logits/chosen": 0.6156030893325806, "logits/rejected": 1.7690448760986328, "logps/chosen": -83.84468078613281, "logps/rejected": -1731.0732421875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4016353189945221, "rewards/margins": 16.079975128173828, "rewards/rejected": -16.481611251831055, "step": 3190 }, { "epoch": 0.7991509021664481, "grad_norm": 0.03759765625, "learning_rate": 5.895675658300981e-07, "logits/chosen": 0.6333300471305847, "logits/rejected": 1.8136202096939087, "logps/chosen": -79.72032165527344, "logps/rejected": -1559.860595703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.37166255712509155, "rewards/margins": 14.575735092163086, "rewards/rejected": -14.947400093078613, "step": 3200 }, { "epoch": 0.8016482487357183, "grad_norm": 0.032470703125, "learning_rate": 5.755801180601381e-07, "logits/chosen": 0.5778881907463074, "logits/rejected": 1.754577875137329, "logps/chosen": -85.21940612792969, "logps/rejected": -1787.017333984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.40619927644729614, "rewards/margins": 16.748323440551758, "rewards/rejected": -17.154521942138672, "step": 3210 }, { "epoch": 0.8041455953049884, "grad_norm": 0.050537109375, "learning_rate": 5.617389783680307e-07, "logits/chosen": 0.5147963762283325, "logits/rejected": 1.858233094215393, "logps/chosen": -85.53825378417969, "logps/rejected": -2189.0078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.41369113326072693, "rewards/margins": 20.725154876708984, "rewards/rejected": -21.13884925842285, "step": 3220 }, { "epoch": 0.8066429418742586, "grad_norm": 0.035888671875, "learning_rate": 5.48045199055596e-07, "logits/chosen": 0.6537925004959106, "logits/rejected": 1.8616943359375, "logps/chosen": -81.87962341308594, "logps/rejected": -1831.1129150390625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.3843618333339691, "rewards/margins": 17.22643280029297, "rewards/rejected": -17.610795974731445, "step": 3230 }, { "epoch": 0.8091402884435287, "grad_norm": 0.02197265625, "learning_rate": 5.344998212212704e-07, "logits/chosen": 0.5282970070838928, "logits/rejected": 1.810681939125061, "logps/chosen": -85.24520111083984, "logps/rejected": -2183.88037109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.4112313389778137, "rewards/margins": 20.611225128173828, "rewards/rejected": -21.022457122802734, "step": 3240 }, { "epoch": 0.811637635012799, "grad_norm": 0.0023956298828125, "learning_rate": 5.211038746809551e-07, "logits/chosen": 0.6539278030395508, "logits/rejected": 1.8353042602539062, "logps/chosen": -83.72335815429688, "logps/rejected": -1798.7239990234375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4129239618778229, "rewards/margins": 16.881694793701172, "rewards/rejected": -17.294618606567383, "step": 3250 }, { "epoch": 0.8141349815820691, "grad_norm": 0.0294189453125, "learning_rate": 5.078583778897216e-07, "logits/chosen": 0.6602455377578735, "logits/rejected": 1.7690246105194092, "logps/chosen": -93.29869079589844, "logps/rejected": -1809.647216796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.4940834641456604, "rewards/margins": 16.90252113342285, "rewards/rejected": -17.396602630615234, "step": 3260 }, { "epoch": 0.8166323281513392, "grad_norm": 0.0830078125, "learning_rate": 4.94764337864384e-07, "logits/chosen": 0.6102297902107239, "logits/rejected": 1.7652736902236938, "logps/chosen": -81.92073059082031, "logps/rejected": -1667.0634765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.38310879468917847, "rewards/margins": 15.526937484741211, "rewards/rejected": -15.910046577453613, "step": 3270 }, { "epoch": 0.8191296747206094, "grad_norm": 0.04150390625, "learning_rate": 4.818227501069328e-07, "logits/chosen": 0.5220754742622375, "logits/rejected": 1.9412825107574463, "logps/chosen": -81.39790344238281, "logps/rejected": -2294.091064453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3824032247066498, "rewards/margins": 21.823511123657227, "rewards/rejected": -22.205913543701172, "step": 3280 }, { "epoch": 0.8216270212898795, "grad_norm": 0.025146484375, "learning_rate": 4.690345985288572e-07, "logits/chosen": 0.5984300971031189, "logits/rejected": 1.787674903869629, "logps/chosen": -85.31007385253906, "logps/rejected": -1911.9349365234375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.41559547185897827, "rewards/margins": 17.892559051513672, "rewards/rejected": -18.308155059814453, "step": 3290 }, { "epoch": 0.8241243678591497, "grad_norm": 8.791685104370117e-07, "learning_rate": 4.5640085537633633e-07, "logits/chosen": 0.5342472791671753, "logits/rejected": 1.8117504119873047, "logps/chosen": -77.17405700683594, "logps/rejected": -2120.84912109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.33397001028060913, "rewards/margins": 20.135189056396484, "rewards/rejected": -20.469158172607422, "step": 3300 }, { "epoch": 0.8266217144284198, "grad_norm": 7.581710815429688e-05, "learning_rate": 4.439224811563211e-07, "logits/chosen": 0.5074091553688049, "logits/rejected": 1.69931960105896, "logps/chosen": -87.99649047851562, "logps/rejected": -1984.584228515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.442207008600235, "rewards/margins": 18.59341049194336, "rewards/rejected": -19.03561782836914, "step": 3310 }, { "epoch": 0.82911906099769, "grad_norm": 0.00012111663818359375, "learning_rate": 4.316004245635158e-07, "logits/chosen": 0.533842921257019, "logits/rejected": 1.7812063694000244, "logps/chosen": -89.41615295410156, "logps/rejected": -2132.177734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.4604712426662445, "rewards/margins": 20.115650177001953, "rewards/rejected": -20.576122283935547, "step": 3320 }, { "epoch": 0.8316164075669601, "grad_norm": 6.8247318267822266e-06, "learning_rate": 4.194356224082455e-07, "logits/chosen": 0.4998435378074646, "logits/rejected": 1.818884253501892, "logps/chosen": -89.97554779052734, "logps/rejected": -2246.93017578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.4607468247413635, "rewards/margins": 21.1903133392334, "rewards/rejected": -21.651060104370117, "step": 3330 }, { "epoch": 0.8341137541362302, "grad_norm": 0.0008697509765625, "learning_rate": 4.074289995452338e-07, "logits/chosen": 0.663809597492218, "logits/rejected": 1.8902143239974976, "logps/chosen": -79.80634307861328, "logps/rejected": -1863.5374755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3643765449523926, "rewards/margins": 17.57371711730957, "rewards/rejected": -17.938095092773438, "step": 3340 }, { "epoch": 0.8366111007055004, "grad_norm": 9.715557098388672e-06, "learning_rate": 3.9558146880329246e-07, "logits/chosen": 0.5806099772453308, "logits/rejected": 1.7180675268173218, "logps/chosen": -88.5436782836914, "logps/rejected": -1847.456298828125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.44766122102737427, "rewards/margins": 17.159259796142578, "rewards/rejected": -17.606922149658203, "step": 3350 }, { "epoch": 0.8391084472747705, "grad_norm": 0.011962890625, "learning_rate": 3.838939309159187e-07, "logits/chosen": 0.6112891435623169, "logits/rejected": 1.7461353540420532, "logps/chosen": -85.6560287475586, "logps/rejected": -1812.320068359375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.42908936738967896, "rewards/margins": 16.986312866210938, "rewards/rejected": -17.415403366088867, "step": 3360 }, { "epoch": 0.8416057938440407, "grad_norm": 0.00148773193359375, "learning_rate": 3.723672744528162e-07, "logits/chosen": 0.5621702671051025, "logits/rejected": 1.8088220357894897, "logps/chosen": -78.2458724975586, "logps/rejected": -1943.078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.34909874200820923, "rewards/margins": 18.354211807250977, "rewards/rejected": -18.703310012817383, "step": 3370 }, { "epoch": 0.8441031404133108, "grad_norm": 0.00182342529296875, "learning_rate": 3.6100237575233647e-07, "logits/chosen": 0.6288230419158936, "logits/rejected": 1.7829002141952515, "logps/chosen": -84.52963256835938, "logps/rejected": -1607.1246337890625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.41369467973709106, "rewards/margins": 15.001932144165039, "rewards/rejected": -15.415626525878906, "step": 3380 }, { "epoch": 0.8466004869825811, "grad_norm": 0.0247802734375, "learning_rate": 3.4980009885486054e-07, "logits/chosen": 0.6984633803367615, "logits/rejected": 1.8022708892822266, "logps/chosen": -77.57188415527344, "logps/rejected": -1505.264892578125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.3510386645793915, "rewards/margins": 14.059832572937012, "rewards/rejected": -14.410870552062988, "step": 3390 }, { "epoch": 0.8490978335518512, "grad_norm": 0.0277099609375, "learning_rate": 3.3876129543710197e-07, "logits/chosen": 0.5471528768539429, "logits/rejected": 1.768148422241211, "logps/chosen": -87.0343246459961, "logps/rejected": -2079.26708984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.44154462218284607, "rewards/margins": 19.6082706451416, "rewards/rejected": -20.049814224243164, "step": 3400 }, { "epoch": 0.8515951801211213, "grad_norm": 0.00250244140625, "learning_rate": 3.2788680474735687e-07, "logits/chosen": 0.5990682244300842, "logits/rejected": 1.8559329509735107, "logps/chosen": -80.894287109375, "logps/rejected": -1869.932373046875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3692544400691986, "rewards/margins": 17.629741668701172, "rewards/rejected": -17.99899673461914, "step": 3410 }, { "epoch": 0.8540925266903915, "grad_norm": 0.00060272216796875, "learning_rate": 3.1717745354170214e-07, "logits/chosen": 0.550452470779419, "logits/rejected": 1.9075467586517334, "logps/chosen": -88.12527465820312, "logps/rejected": -2150.641357421875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.4492851793766022, "rewards/margins": 20.334678649902344, "rewards/rejected": -20.78396224975586, "step": 3420 }, { "epoch": 0.8565898732596616, "grad_norm": 0.000537872314453125, "learning_rate": 3.0663405602113727e-07, "logits/chosen": 0.5784090757369995, "logits/rejected": 1.8440923690795898, "logps/chosen": -77.76791381835938, "logps/rejected": -1944.1751708984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.34689050912857056, "rewards/margins": 18.372732162475586, "rewards/rejected": -18.719623565673828, "step": 3430 }, { "epoch": 0.8590872198289318, "grad_norm": 0.01416015625, "learning_rate": 2.9625741376968107e-07, "logits/chosen": 0.5425665378570557, "logits/rejected": 1.7546192407608032, "logps/chosen": -84.84912109375, "logps/rejected": -2032.4622802734375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.40073734521865845, "rewards/margins": 18.93886375427246, "rewards/rejected": -19.339599609375, "step": 3440 }, { "epoch": 0.8615845663982019, "grad_norm": 0.0206298828125, "learning_rate": 2.8604831569343324e-07, "logits/chosen": 0.5840142965316772, "logits/rejected": 1.6774394512176514, "logps/chosen": -87.08283996582031, "logps/rejected": -1656.0628662109375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.4280741214752197, "rewards/margins": 15.362825393676758, "rewards/rejected": -15.790898323059082, "step": 3450 }, { "epoch": 0.864081912967472, "grad_norm": 0.00439453125, "learning_rate": 2.760075379605942e-07, "logits/chosen": 0.5762545466423035, "logits/rejected": 1.788022756576538, "logps/chosen": -84.1275634765625, "logps/rejected": -1882.4407958984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.40409308671951294, "rewards/margins": 17.698461532592773, "rewards/rejected": -18.102556228637695, "step": 3460 }, { "epoch": 0.8665792595367422, "grad_norm": 0.042724609375, "learning_rate": 2.661358439424552e-07, "logits/chosen": 0.6203972697257996, "logits/rejected": 1.7815377712249756, "logps/chosen": -79.66865539550781, "logps/rejected": -1772.3359375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.36624962091445923, "rewards/margins": 16.665634155273438, "rewards/rejected": -17.031885147094727, "step": 3470 }, { "epoch": 0.8690766061060123, "grad_norm": 0.0026397705078125, "learning_rate": 2.564339841553615e-07, "logits/chosen": 0.6519962549209595, "logits/rejected": 1.8045860528945923, "logps/chosen": -84.95247650146484, "logps/rejected": -1720.0745849609375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.4275601804256439, "rewards/margins": 16.091691970825195, "rewards/rejected": -16.51925277709961, "step": 3480 }, { "epoch": 0.8715739526752825, "grad_norm": 0.1845703125, "learning_rate": 2.469026962036539e-07, "logits/chosen": 0.5797117352485657, "logits/rejected": 1.6833524703979492, "logps/chosen": -88.14852905273438, "logps/rejected": -1696.3714599609375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4267314374446869, "rewards/margins": 15.646469116210938, "rewards/rejected": -16.07320213317871, "step": 3490 }, { "epoch": 0.8740712992445526, "grad_norm": 0.046142578125, "learning_rate": 2.3754270472358786e-07, "logits/chosen": 0.6232503652572632, "logits/rejected": 1.6990222930908203, "logps/chosen": -83.14488983154297, "logps/rejected": -1672.453857421875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.39773061871528625, "rewards/margins": 15.525070190429688, "rewards/rejected": -15.922798156738281, "step": 3500 }, { "epoch": 0.8765686458138228, "grad_norm": 0.07373046875, "learning_rate": 2.283547213282458e-07, "logits/chosen": 0.5654767155647278, "logits/rejected": 1.7425930500030518, "logps/chosen": -85.06395721435547, "logps/rejected": -1774.28125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4012150168418884, "rewards/margins": 16.552427291870117, "rewards/rejected": -16.953643798828125, "step": 3510 }, { "epoch": 0.8790659923830929, "grad_norm": 0.052734375, "learning_rate": 2.1933944455343166e-07, "logits/chosen": 0.5508383512496948, "logits/rejected": 1.7986376285552979, "logps/chosen": -81.19587707519531, "logps/rejected": -2142.4736328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.374092161655426, "rewards/margins": 20.307262420654297, "rewards/rejected": -20.681354522705078, "step": 3520 }, { "epoch": 0.8815633389523632, "grad_norm": 0.003265380859375, "learning_rate": 2.104975598045647e-07, "logits/chosen": 0.5937038660049438, "logits/rejected": 1.7039823532104492, "logps/chosen": -81.80549621582031, "logps/rejected": -1617.2646484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.38183221220970154, "rewards/margins": 15.109405517578125, "rewards/rejected": -15.491238594055176, "step": 3530 }, { "epoch": 0.8840606855216333, "grad_norm": 0.032470703125, "learning_rate": 2.018297393045701e-07, "logits/chosen": 0.6291056871414185, "logits/rejected": 1.7945873737335205, "logps/chosen": -81.0144271850586, "logps/rejected": -1751.724609375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3717043697834015, "rewards/margins": 16.441532135009766, "rewards/rejected": -16.813236236572266, "step": 3540 }, { "epoch": 0.8865580320909034, "grad_norm": 0.039306640625, "learning_rate": 1.9333664204277236e-07, "logits/chosen": 0.5141820907592773, "logits/rejected": 1.6924489736557007, "logps/chosen": -83.84037780761719, "logps/rejected": -2023.642822265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3969913423061371, "rewards/margins": 18.992799758911133, "rewards/rejected": -19.38979148864746, "step": 3550 }, { "epoch": 0.8890553786601736, "grad_norm": 2.4437904357910156e-06, "learning_rate": 1.8501891372479124e-07, "logits/chosen": 0.5262492895126343, "logits/rejected": 1.801138162612915, "logps/chosen": -82.75626373291016, "logps/rejected": -1913.7564697265625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.37326088547706604, "rewards/margins": 17.95614242553711, "rewards/rejected": -18.329402923583984, "step": 3560 }, { "epoch": 0.8915527252294437, "grad_norm": 0.0230712890625, "learning_rate": 1.7687718672345533e-07, "logits/chosen": 0.5257088541984558, "logits/rejected": 1.7338473796844482, "logps/chosen": -84.81585693359375, "logps/rejected": -1994.5963134765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.4195898473262787, "rewards/margins": 18.773387908935547, "rewards/rejected": -19.19297981262207, "step": 3570 }, { "epoch": 0.8940500717987139, "grad_norm": 0.032958984375, "learning_rate": 1.689120800307212e-07, "logits/chosen": 0.43529587984085083, "logits/rejected": 1.6352293491363525, "logps/chosen": -84.8902816772461, "logps/rejected": -2162.602294921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4021669030189514, "rewards/margins": 20.277816772460938, "rewards/rejected": -20.679983139038086, "step": 3580 }, { "epoch": 0.896547418367984, "grad_norm": 0.09033203125, "learning_rate": 1.6112419921061357e-07, "logits/chosen": 0.6369230151176453, "logits/rejected": 1.848402976989746, "logps/chosen": -89.04129791259766, "logps/rejected": -1788.65234375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.4632226526737213, "rewards/margins": 16.736385345458984, "rewards/rejected": -17.199607849121094, "step": 3590 }, { "epoch": 0.8990447649372542, "grad_norm": 0.039306640625, "learning_rate": 1.5351413635318807e-07, "logits/chosen": 0.5430204272270203, "logits/rejected": 1.6954717636108398, "logps/chosen": -80.37650299072266, "logps/rejected": -1739.5556640625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.36847007274627686, "rewards/margins": 16.253259658813477, "rewards/rejected": -16.621726989746094, "step": 3600 }, { "epoch": 0.9015421115065243, "grad_norm": 0.00982666015625, "learning_rate": 1.460824700295138e-07, "logits/chosen": 0.5976991653442383, "logits/rejected": 1.8408482074737549, "logps/chosen": -83.05894470214844, "logps/rejected": -1974.3404541015625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.40586838126182556, "rewards/margins": 18.630273818969727, "rewards/rejected": -19.036144256591797, "step": 3610 }, { "epoch": 0.9040394580757944, "grad_norm": 0.01531982421875, "learning_rate": 1.3882976524768694e-07, "logits/chosen": 0.6637327075004578, "logits/rejected": 1.75222909450531, "logps/chosen": -82.24571228027344, "logps/rejected": -1613.786865234375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.39647185802459717, "rewards/margins": 15.058723449707031, "rewards/rejected": -15.455195426940918, "step": 3620 }, { "epoch": 0.9065368046450646, "grad_norm": 0.00054931640625, "learning_rate": 1.3175657340987664e-07, "logits/chosen": 0.6287505030632019, "logits/rejected": 1.799709677696228, "logps/chosen": -84.08810424804688, "logps/rejected": -1803.7301025390625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4151592254638672, "rewards/margins": 16.90250015258789, "rewards/rejected": -17.317657470703125, "step": 3630 }, { "epoch": 0.9090341512143347, "grad_norm": 0.0010986328125, "learning_rate": 1.2486343227040122e-07, "logits/chosen": 0.5875022411346436, "logits/rejected": 1.7384836673736572, "logps/chosen": -87.7680435180664, "logps/rejected": -1834.406982421875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.4344062805175781, "rewards/margins": 17.189672470092773, "rewards/rejected": -17.62407875061035, "step": 3640 }, { "epoch": 0.9115314977836049, "grad_norm": 0.0419921875, "learning_rate": 1.181508658948452e-07, "logits/chosen": 0.6155994534492493, "logits/rejected": 1.7817541360855103, "logps/chosen": -80.59324645996094, "logps/rejected": -1767.1129150390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3722308278083801, "rewards/margins": 16.59187889099121, "rewards/rejected": -16.964111328125, "step": 3650 }, { "epoch": 0.9140288443528751, "grad_norm": 0.0458984375, "learning_rate": 1.1161938462021627e-07, "logits/chosen": 0.6269813776016235, "logits/rejected": 1.7340434789657593, "logps/chosen": -86.62757110595703, "logps/rejected": -1731.770751953125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.4310578405857086, "rewards/margins": 16.17133903503418, "rewards/rejected": -16.602397918701172, "step": 3660 }, { "epoch": 0.9165261909221453, "grad_norm": 0.1083984375, "learning_rate": 1.0526948501614536e-07, "logits/chosen": 0.5681526064872742, "logits/rejected": 1.8455768823623657, "logps/chosen": -88.42396545410156, "logps/rejected": -1962.5306396484375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.44573092460632324, "rewards/margins": 18.391765594482422, "rewards/rejected": -18.837499618530273, "step": 3670 }, { "epoch": 0.9190235374914154, "grad_norm": 0.0791015625, "learning_rate": 9.910164984713477e-08, "logits/chosen": 0.5716847777366638, "logits/rejected": 1.790804147720337, "logps/chosen": -88.99705505371094, "logps/rejected": -2003.9739990234375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.457981675863266, "rewards/margins": 18.8723087310791, "rewards/rejected": -19.330291748046875, "step": 3680 }, { "epoch": 0.9215208840606856, "grad_norm": 0.0830078125, "learning_rate": 9.311634803585323e-08, "logits/chosen": 0.5493127107620239, "logits/rejected": 1.8056682348251343, "logps/chosen": -80.04996490478516, "logps/rejected": -2063.024169921875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.3674994707107544, "rewards/margins": 19.52413558959961, "rewards/rejected": -19.891637802124023, "step": 3690 }, { "epoch": 0.9240182306299557, "grad_norm": 0.000400543212890625, "learning_rate": 8.7314034627487e-08, "logits/chosen": 0.5750405192375183, "logits/rejected": 1.8101009130477905, "logps/chosen": -78.46788024902344, "logps/rejected": -1988.633056640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.35628947615623474, "rewards/margins": 18.821575164794922, "rewards/rejected": -19.1778621673584, "step": 3700 }, { "epoch": 0.9265155771992258, "grad_norm": 0.03515625, "learning_rate": 8.16951507551439e-08, "logits/chosen": 0.6284887194633484, "logits/rejected": 1.7544790506362915, "logps/chosen": -78.0561752319336, "logps/rejected": -1719.0638427734375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3490171432495117, "rewards/margins": 16.02911949157715, "rewards/rejected": -16.378137588500977, "step": 3710 }, { "epoch": 0.929012923768496, "grad_norm": 6.4849853515625e-05, "learning_rate": 7.626012360631291e-08, "logits/chosen": 0.5767999887466431, "logits/rejected": 1.8027598857879639, "logps/chosen": -87.67066192626953, "logps/rejected": -1751.861328125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.44937849044799805, "rewards/margins": 16.375062942504883, "rewards/rejected": -16.82444190979004, "step": 3720 }, { "epoch": 0.9315102703377661, "grad_norm": 0.0006256103515625, "learning_rate": 7.100936639038936e-08, "logits/chosen": 0.5324774384498596, "logits/rejected": 1.9097219705581665, "logps/chosen": -89.81242370605469, "logps/rejected": -2373.812744140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.4651169776916504, "rewards/margins": 22.532739639282227, "rewards/rejected": -22.997854232788086, "step": 3730 }, { "epoch": 0.9340076169070363, "grad_norm": 1.0356307029724121e-06, "learning_rate": 6.594327830725916e-08, "logits/chosen": 0.5782414674758911, "logits/rejected": 1.906734824180603, "logps/chosen": -80.02381896972656, "logps/rejected": -1915.576416015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3699369728565216, "rewards/margins": 18.08548927307129, "rewards/rejected": -18.455425262451172, "step": 3740 }, { "epoch": 0.9365049634763064, "grad_norm": 0.039306640625, "learning_rate": 6.106224451694592e-08, "logits/chosen": 0.5905268788337708, "logits/rejected": 1.7933048009872437, "logps/chosen": -77.8465347290039, "logps/rejected": -1931.9468994140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3471309244632721, "rewards/margins": 18.231327056884766, "rewards/rejected": -18.57845687866211, "step": 3750 }, { "epoch": 0.9390023100455765, "grad_norm": 0.07861328125, "learning_rate": 5.636663611033266e-08, "logits/chosen": 0.6453654170036316, "logits/rejected": 1.9545791149139404, "logps/chosen": -88.19773864746094, "logps/rejected": -2014.8583984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4550137519836426, "rewards/margins": 18.988948822021484, "rewards/rejected": -19.443960189819336, "step": 3760 }, { "epoch": 0.9414996566148467, "grad_norm": 0.00152587890625, "learning_rate": 5.185681008094579e-08, "logits/chosen": 0.5483246445655823, "logits/rejected": 1.754540205001831, "logps/chosen": -81.78535461425781, "logps/rejected": -1835.6578369140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3718385696411133, "rewards/margins": 17.2070369720459, "rewards/rejected": -17.578876495361328, "step": 3770 }, { "epoch": 0.9439970031841168, "grad_norm": 0.00086212158203125, "learning_rate": 4.753310929781513e-08, "logits/chosen": 0.6061893701553345, "logits/rejected": 1.7712287902832031, "logps/chosen": -84.43482971191406, "logps/rejected": -1739.1148681640625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4173789620399475, "rewards/margins": 16.286354064941406, "rewards/rejected": -16.703731536865234, "step": 3780 }, { "epoch": 0.946494349753387, "grad_norm": 1.2874603271484375e-05, "learning_rate": 4.3395862479405914e-08, "logits/chosen": 0.5530301928520203, "logits/rejected": 1.790492296218872, "logps/chosen": -98.83070373535156, "logps/rejected": -1927.699462890625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.5457450747489929, "rewards/margins": 17.921890258789062, "rewards/rejected": -18.467636108398438, "step": 3790 }, { "epoch": 0.9489916963226572, "grad_norm": 0.0028228759765625, "learning_rate": 3.9445384168628474e-08, "logits/chosen": 0.5836749076843262, "logits/rejected": 1.8246829509735107, "logps/chosen": -80.40269470214844, "logps/rejected": -1708.9019775390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3665417730808258, "rewards/margins": 15.90452766418457, "rewards/rejected": -16.271068572998047, "step": 3800 }, { "epoch": 0.9514890428919274, "grad_norm": 0.017578125, "learning_rate": 3.5681974708923484e-08, "logits/chosen": 0.6176645159721375, "logits/rejected": 1.7562223672866821, "logps/chosen": -82.15937805175781, "logps/rejected": -1655.8538818359375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3880414068698883, "rewards/margins": 15.394001960754395, "rewards/rejected": -15.78204345703125, "step": 3810 }, { "epoch": 0.9539863894611975, "grad_norm": 0.003021240234375, "learning_rate": 3.210592022142717e-08, "logits/chosen": 0.6430649161338806, "logits/rejected": 1.7840299606323242, "logps/chosen": -88.11245727539062, "logps/rejected": -1835.171630859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.4654541611671448, "rewards/margins": 17.16687774658203, "rewards/rejected": -17.6323299407959, "step": 3820 }, { "epoch": 0.9564837360304677, "grad_norm": 0.0260009765625, "learning_rate": 2.8717492583220095e-08, "logits/chosen": 0.6011831164360046, "logits/rejected": 1.8058007955551147, "logps/chosen": -83.06114959716797, "logps/rejected": -1898.2939453125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.39993369579315186, "rewards/margins": 17.886281967163086, "rewards/rejected": -18.28621482849121, "step": 3830 }, { "epoch": 0.9589810825997378, "grad_norm": 0.109375, "learning_rate": 2.551694940665539e-08, "logits/chosen": 0.600081741809845, "logits/rejected": 1.7849693298339844, "logps/chosen": -82.53587341308594, "logps/rejected": -1787.7008056640625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.39744192361831665, "rewards/margins": 16.777345657348633, "rewards/rejected": -17.174787521362305, "step": 3840 }, { "epoch": 0.9614784291690079, "grad_norm": 5.4836273193359375e-05, "learning_rate": 2.2504534019774092e-08, "logits/chosen": 0.712979257106781, "logits/rejected": 1.8468831777572632, "logps/chosen": -80.92863464355469, "logps/rejected": -1629.561767578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3758103847503662, "rewards/margins": 15.236900329589844, "rewards/rejected": -15.612710952758789, "step": 3850 }, { "epoch": 0.9639757757382781, "grad_norm": 0.058349609375, "learning_rate": 1.9680475447805826e-08, "logits/chosen": 0.6279615759849548, "logits/rejected": 1.7799345254898071, "logps/chosen": -81.30252838134766, "logps/rejected": -1729.193115234375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.3813311457633972, "rewards/margins": 16.206878662109375, "rewards/rejected": -16.58820915222168, "step": 3860 }, { "epoch": 0.9664731223075482, "grad_norm": 0.000812530517578125, "learning_rate": 1.70449883957563e-08, "logits/chosen": 0.5945799350738525, "logits/rejected": 1.7449548244476318, "logps/chosen": -79.51522064208984, "logps/rejected": -1772.429931640625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3488074243068695, "rewards/margins": 16.572925567626953, "rewards/rejected": -16.921733856201172, "step": 3870 }, { "epoch": 0.9689704688768184, "grad_norm": 0.03662109375, "learning_rate": 1.4598273232083182e-08, "logits/chosen": 0.5940654873847961, "logits/rejected": 1.737255334854126, "logps/chosen": -89.48096466064453, "logps/rejected": -1763.216064453125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.4539431631565094, "rewards/margins": 16.4827880859375, "rewards/rejected": -16.93673324584961, "step": 3880 }, { "epoch": 0.9714678154460885, "grad_norm": 7.677078247070312e-05, "learning_rate": 1.2340515973464917e-08, "logits/chosen": 0.5371723771095276, "logits/rejected": 1.6967008113861084, "logps/chosen": -89.69468688964844, "logps/rejected": -1855.769287109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.4602131247520447, "rewards/margins": 17.378524780273438, "rewards/rejected": -17.8387393951416, "step": 3890 }, { "epoch": 0.9739651620153587, "grad_norm": 0.01055908203125, "learning_rate": 1.0271888270655118e-08, "logits/chosen": 0.5918472409248352, "logits/rejected": 1.6669203042984009, "logps/chosen": -84.76191711425781, "logps/rejected": -1762.7152099609375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.40269866585731506, "rewards/margins": 16.396747589111328, "rewards/rejected": -16.79944610595703, "step": 3900 }, { "epoch": 0.9764625085846288, "grad_norm": 0.005584716796875, "learning_rate": 8.392547395435769e-09, "logits/chosen": 0.6482867002487183, "logits/rejected": 1.7531925439834595, "logps/chosen": -84.86217498779297, "logps/rejected": -1594.6502685546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4176376461982727, "rewards/margins": 14.827906608581543, "rewards/rejected": -15.245546340942383, "step": 3910 }, { "epoch": 0.9789598551538989, "grad_norm": 1.0251998901367188e-05, "learning_rate": 6.702636228657911e-09, "logits/chosen": 0.6012560129165649, "logits/rejected": 1.7790956497192383, "logps/chosen": -85.35179138183594, "logps/rejected": -1768.960693359375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.41833052039146423, "rewards/margins": 16.56346321105957, "rewards/rejected": -16.981792449951172, "step": 3920 }, { "epoch": 0.9814572017231691, "grad_norm": 0.049560546875, "learning_rate": 5.2022832493800465e-09, "logits/chosen": 0.575610339641571, "logits/rejected": 1.6641845703125, "logps/chosen": -87.38862609863281, "logps/rejected": -1609.995849609375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.43747106194496155, "rewards/margins": 15.003010749816895, "rewards/rejected": -15.440483093261719, "step": 3930 }, { "epoch": 0.9839545482924393, "grad_norm": 0.00927734375, "learning_rate": 3.891602525100124e-09, "logits/chosen": 0.5365520119667053, "logits/rejected": 1.7841014862060547, "logps/chosen": -77.71090698242188, "logps/rejected": -1843.400634765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.33150431513786316, "rewards/margins": 17.311891555786133, "rewards/rejected": -17.643396377563477, "step": 3940 }, { "epoch": 0.9864518948617095, "grad_norm": 0.000896453857421875, "learning_rate": 2.7706937030827495e-09, "logits/chosen": 0.6269220113754272, "logits/rejected": 1.821447730064392, "logps/chosen": -79.47364807128906, "logps/rejected": -1583.856689453125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.35821038484573364, "rewards/margins": 14.733774185180664, "rewards/rejected": -15.091984748840332, "step": 3950 }, { "epoch": 0.9889492414309796, "grad_norm": 0.021728515625, "learning_rate": 1.839642002783859e-09, "logits/chosen": 0.7017726302146912, "logits/rejected": 1.8069097995758057, "logps/chosen": -79.8754653930664, "logps/rejected": -1646.0501708984375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3667237162590027, "rewards/margins": 15.337608337402344, "rewards/rejected": -15.704330444335938, "step": 3960 }, { "epoch": 0.9914465880002498, "grad_norm": 0.021728515625, "learning_rate": 1.0985182093714574e-09, "logits/chosen": 0.6416125297546387, "logits/rejected": 1.760498046875, "logps/chosen": -85.49261474609375, "logps/rejected": -1687.677978515625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.42397341132164, "rewards/margins": 15.762815475463867, "rewards/rejected": -16.18678855895996, "step": 3970 }, { "epoch": 0.9939439345695199, "grad_norm": 0.003143310546875, "learning_rate": 5.473786683440896e-10, "logits/chosen": 0.5962403416633606, "logits/rejected": 1.8247146606445312, "logps/chosen": -83.6251220703125, "logps/rejected": -2010.3323974609375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4074554443359375, "rewards/margins": 18.965452194213867, "rewards/rejected": -19.372909545898438, "step": 3980 }, { "epoch": 0.99644128113879, "grad_norm": 0.049560546875, "learning_rate": 1.862652812467669e-10, "logits/chosen": 0.5162760615348816, "logits/rejected": 1.6673294305801392, "logps/chosen": -83.96633911132812, "logps/rejected": -1847.5804443359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.39398157596588135, "rewards/margins": 17.087627410888672, "rewards/rejected": -17.481609344482422, "step": 3990 }, { "epoch": 0.9989386277080602, "grad_norm": 2.9802322387695312e-05, "learning_rate": 1.5205502486292932e-11, "logits/chosen": 0.582720935344696, "logits/rejected": 1.8548141717910767, "logps/chosen": -80.02593994140625, "logps/rejected": -1968.4027099609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.35573673248291016, "rewards/margins": 18.611202239990234, "rewards/rejected": -18.96693992614746, "step": 4000 }, { "epoch": 0.9989386277080602, "eval_logits/chosen": 0.6540641784667969, "eval_logits/rejected": 1.569779634475708, "eval_logps/chosen": -84.51407623291016, "eval_logps/rejected": -994.2071533203125, "eval_loss": 0.00282670627348125, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.40158194303512573, "eval_rewards/margins": 8.929591178894043, "eval_rewards/rejected": -9.33117389678955, "eval_runtime": 0.6151, "eval_samples_per_second": 8.128, "eval_steps_per_second": 8.128, "step": 4000 }, { "epoch": 0.9999375663357682, "step": 4004, "total_flos": 0.0, "train_loss": 0.05122942747121405, "train_runtime": 6577.5594, "train_samples_per_second": 2.435, "train_steps_per_second": 0.609 } ], "logging_steps": 10, "max_steps": 4004, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }