{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8941884695531375, "eval_steps": 10000.0, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 79.0550765991211, "learning_rate": 6.6666666666666675e-06, "logits/chosen": -13.33062744140625, "logits/rejected": -14.616785049438477, "logps/chosen": -543.2405395507812, "logps/rejected": -576.2285766601562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 7.948572635650635, "learning_rate": 6.666666666666667e-05, "logits/chosen": -15.19211196899414, "logits/rejected": -16.048095703125, "logps/chosen": -607.538330078125, "logps/rejected": -608.885498046875, "loss": 0.6196, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.2160143256187439, "rewards/margins": 0.2663302719593048, "rewards/rejected": -0.05031595006585121, "step": 10 }, { "epoch": 0.0, "grad_norm": 67.47010803222656, "learning_rate": 0.00013333333333333334, "logits/chosen": -10.848133087158203, "logits/rejected": -9.958175659179688, "logps/chosen": -1237.641845703125, "logps/rejected": -1082.7227783203125, "loss": 2.2446, "rewards/accuracies": 0.5, "rewards/chosen": 17.824787139892578, "rewards/margins": -0.07056122273206711, "rewards/rejected": 17.895343780517578, "step": 20 }, { "epoch": 0.0, "grad_norm": 9.948680877685547, "learning_rate": 0.0002, "logits/chosen": -11.886600494384766, "logits/rejected": -12.03013801574707, "logps/chosen": -362.31756591796875, "logps/rejected": -560.6836547851562, "loss": 4.1276, "rewards/accuracies": 0.5, "rewards/chosen": -0.9082545042037964, "rewards/margins": -3.204153537750244, "rewards/rejected": 2.295898914337158, "step": 30 }, { "epoch": 0.0, "grad_norm": 69.71764373779297, "learning_rate": 0.0002666666666666667, "logits/chosen": -13.246805191040039, "logits/rejected": -12.323087692260742, "logps/chosen": -1235.0091552734375, "logps/rejected": -1362.859619140625, "loss": 3.2061, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -27.80769920349121, "rewards/margins": 1.9822498559951782, "rewards/rejected": -29.78995132446289, "step": 40 }, { "epoch": 0.0, "grad_norm": 129.64056396484375, "learning_rate": 0.0003333333333333333, "logits/chosen": -12.884356498718262, "logits/rejected": -11.354658126831055, "logps/chosen": -589.2250366210938, "logps/rejected": -778.6436157226562, "loss": 5.58, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.49032974243164, "rewards/margins": 0.9361041784286499, "rewards/rejected": -18.426433563232422, "step": 50 }, { "epoch": 0.0, "grad_norm": 1705.173583984375, "learning_rate": 0.0004, "logits/chosen": -11.519826889038086, "logits/rejected": -12.165529251098633, "logps/chosen": -908.3253784179688, "logps/rejected": -682.0640869140625, "loss": 15.1568, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.985088348388672, "rewards/margins": -12.43117618560791, "rewards/rejected": -18.55391502380371, "step": 60 }, { "epoch": 0.0, "grad_norm": 1036.1717529296875, "learning_rate": 0.00046666666666666666, "logits/chosen": -13.166908264160156, "logits/rejected": -12.385476112365723, "logps/chosen": -1752.58984375, "logps/rejected": -1105.6995849609375, "loss": 13.1787, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.314640045166016, "rewards/margins": -10.721281051635742, "rewards/rejected": -15.593356132507324, "step": 70 }, { "epoch": 0.0, "grad_norm": 531.494140625, "learning_rate": 0.0005333333333333334, "logits/chosen": -9.830523490905762, "logits/rejected": -9.639546394348145, "logps/chosen": -1073.137451171875, "logps/rejected": -868.0662231445312, "loss": 25.473, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -78.60629272460938, "rewards/margins": -21.26487922668457, "rewards/rejected": -57.34141159057617, "step": 80 }, { "epoch": 0.01, "grad_norm": 237.78582763671875, "learning_rate": 0.0006, "logits/chosen": -8.712360382080078, "logits/rejected": -8.80846881866455, "logps/chosen": -1765.245361328125, "logps/rejected": -1670.3939208984375, "loss": 6.8626, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.38302993774414, "rewards/margins": 0.2198719084262848, "rewards/rejected": -28.6028995513916, "step": 90 }, { "epoch": 0.01, "grad_norm": 693.9869995117188, "learning_rate": 0.0006666666666666666, "logits/chosen": -12.346224784851074, "logits/rejected": -12.23181438446045, "logps/chosen": -2082.345947265625, "logps/rejected": -2151.71533203125, "loss": 16.004, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.08687591552734, "rewards/margins": -3.986098527908325, "rewards/rejected": -109.10076904296875, "step": 100 }, { "epoch": 0.01, "grad_norm": 3.486322016667265e-13, "learning_rate": 0.0007333333333333333, "logits/chosen": -10.503616333007812, "logits/rejected": -10.510167121887207, "logps/chosen": -2363.585693359375, "logps/rejected": -2123.504638671875, "loss": 27.2918, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.36336517333984, "rewards/margins": -18.149518966674805, "rewards/rejected": -105.2138442993164, "step": 110 }, { "epoch": 0.01, "grad_norm": 133.98788452148438, "learning_rate": 0.0008, "logits/chosen": -9.226245880126953, "logits/rejected": -9.098515510559082, "logps/chosen": -1551.9586181640625, "logps/rejected": -1357.475830078125, "loss": 19.4354, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -47.712547302246094, "rewards/margins": -11.50971508026123, "rewards/rejected": -36.20282745361328, "step": 120 }, { "epoch": 0.01, "grad_norm": 600.650146484375, "learning_rate": 0.0008666666666666667, "logits/chosen": -7.6896772384643555, "logits/rejected": -7.815736293792725, "logps/chosen": -1498.174072265625, "logps/rejected": -1496.6353759765625, "loss": 13.3421, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -77.28450775146484, "rewards/margins": -3.5759434700012207, "rewards/rejected": -73.70856475830078, "step": 130 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 0.0009333333333333333, "logits/chosen": -8.860821723937988, "logits/rejected": -8.719849586486816, "logps/chosen": -4525.97802734375, "logps/rejected": -5622.8916015625, "loss": 17.1995, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -348.63189697265625, "rewards/margins": 88.90797424316406, "rewards/rejected": -437.5398864746094, "step": 140 }, { "epoch": 0.01, "grad_norm": 3058.078857421875, "learning_rate": 0.001, "logits/chosen": -9.124612808227539, "logits/rejected": -9.051043510437012, "logps/chosen": -7139.5244140625, "logps/rejected": -5119.35595703125, "loss": 172.8901, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -578.91259765625, "rewards/margins": -171.18446350097656, "rewards/rejected": -407.72821044921875, "step": 150 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 0.0009998064940593676, "logits/chosen": -9.406896591186523, "logits/rejected": -9.174515724182129, "logps/chosen": -2471.397705078125, "logps/rejected": -2583.948974609375, "loss": 29.1737, "rewards/accuracies": 0.5, "rewards/chosen": -129.94354248046875, "rewards/margins": 0.1854328215122223, "rewards/rejected": -130.1289520263672, "step": 160 }, { "epoch": 0.01, "grad_norm": 170.42623901367188, "learning_rate": 0.0009996129881187353, "logits/chosen": -9.074646949768066, "logits/rejected": -9.092073440551758, "logps/chosen": -3532.22265625, "logps/rejected": -3179.40576171875, "loss": 28.7349, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -245.3097381591797, "rewards/margins": -20.016616821289062, "rewards/rejected": -225.2931365966797, "step": 170 }, { "epoch": 0.01, "grad_norm": 0.0029475451447069645, "learning_rate": 0.0009994194821781029, "logits/chosen": -6.456587791442871, "logits/rejected": -6.4560723304748535, "logps/chosen": -2494.274169921875, "logps/rejected": -1886.9368896484375, "loss": 58.8563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -119.5272445678711, "rewards/margins": -56.49876022338867, "rewards/rejected": -63.028480529785156, "step": 180 }, { "epoch": 0.01, "grad_norm": 0.12492014467716217, "learning_rate": 0.0009992259762374705, "logits/chosen": -7.145413875579834, "logits/rejected": -7.1469879150390625, "logps/chosen": -2288.42919921875, "logps/rejected": -2143.359619140625, "loss": 41.0566, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -83.50308990478516, "rewards/margins": -17.01555061340332, "rewards/rejected": -66.48753356933594, "step": 190 }, { "epoch": 0.01, "grad_norm": 97.82428741455078, "learning_rate": 0.000999032470296838, "logits/chosen": -6.8696088790893555, "logits/rejected": -6.869769096374512, "logps/chosen": -1822.53125, "logps/rejected": -1029.7470703125, "loss": 68.6133, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -146.5786590576172, "rewards/margins": -65.56668090820312, "rewards/rejected": -81.01197052001953, "step": 200 }, { "epoch": 0.01, "grad_norm": 14.525142669677734, "learning_rate": 0.0009988389643562057, "logits/chosen": -4.0387396812438965, "logits/rejected": -4.038853645324707, "logps/chosen": -2399.68896484375, "logps/rejected": -1892.8056640625, "loss": 66.7877, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -182.8202362060547, "rewards/margins": -59.76533126831055, "rewards/rejected": -123.054931640625, "step": 210 }, { "epoch": 0.01, "grad_norm": 0.0007763044559396803, "learning_rate": 0.0009986454584155733, "logits/chosen": -6.105194091796875, "logits/rejected": -6.1039323806762695, "logps/chosen": -2187.7021484375, "logps/rejected": -1637.1861572265625, "loss": 43.7206, "rewards/accuracies": 0.5, "rewards/chosen": -87.96297454833984, "rewards/margins": -26.038898468017578, "rewards/rejected": -61.92407989501953, "step": 220 }, { "epoch": 0.01, "grad_norm": 91.55896759033203, "learning_rate": 0.000998451952474941, "logits/chosen": -7.600955963134766, "logits/rejected": -7.599919319152832, "logps/chosen": -1831.0084228515625, "logps/rejected": -2311.152587890625, "loss": 35.8971, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -34.68043518066406, "rewards/margins": 7.497575283050537, "rewards/rejected": -42.178009033203125, "step": 230 }, { "epoch": 0.01, "grad_norm": 1.7780219252472307e-07, "learning_rate": 0.0009982584465343086, "logits/chosen": -7.835398197174072, "logits/rejected": -7.834392547607422, "logps/chosen": -2121.80859375, "logps/rejected": -1565.8697509765625, "loss": 14.3198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.83469772338867, "rewards/margins": -1.2381786108016968, "rewards/rejected": -32.596519470214844, "step": 240 }, { "epoch": 0.01, "grad_norm": 0.0009164345683529973, "learning_rate": 0.0009980649405936762, "logits/chosen": -6.777050971984863, "logits/rejected": -6.778311252593994, "logps/chosen": -2713.24951171875, "logps/rejected": -1647.801513671875, "loss": 47.6516, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -124.9349594116211, "rewards/margins": -38.98810577392578, "rewards/rejected": -85.94685363769531, "step": 250 }, { "epoch": 0.02, "grad_norm": 79.9888687133789, "learning_rate": 0.0009978714346530438, "logits/chosen": -6.422275543212891, "logits/rejected": -6.421200752258301, "logps/chosen": -2463.85400390625, "logps/rejected": -2120.465576171875, "loss": 40.6159, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -168.24913024902344, "rewards/margins": -28.290863037109375, "rewards/rejected": -139.95828247070312, "step": 260 }, { "epoch": 0.02, "grad_norm": 42.487205505371094, "learning_rate": 0.0009976779287124114, "logits/chosen": -7.872190952301025, "logits/rejected": -7.872282981872559, "logps/chosen": -1957.318115234375, "logps/rejected": -2109.769287109375, "loss": 8.7086, "rewards/accuracies": 0.5, "rewards/chosen": -46.243873596191406, "rewards/margins": 20.03973960876465, "rewards/rejected": -66.28361511230469, "step": 270 }, { "epoch": 0.02, "grad_norm": 35.03407669067383, "learning_rate": 0.000997484422771779, "logits/chosen": -6.6078691482543945, "logits/rejected": -6.607802391052246, "logps/chosen": -2212.938232421875, "logps/rejected": -1856.466796875, "loss": 29.6942, "rewards/accuracies": 0.5, "rewards/chosen": -76.65740203857422, "rewards/margins": -14.331930160522461, "rewards/rejected": -62.325469970703125, "step": 280 }, { "epoch": 0.02, "grad_norm": 46.82324981689453, "learning_rate": 0.0009972909168311467, "logits/chosen": -6.0425543785095215, "logits/rejected": -6.0446600914001465, "logps/chosen": -1235.8304443359375, "logps/rejected": -791.0191650390625, "loss": 49.61, "rewards/accuracies": 0.5, "rewards/chosen": -70.1013412475586, "rewards/margins": -8.875106811523438, "rewards/rejected": -61.22623825073242, "step": 290 }, { "epoch": 0.02, "grad_norm": 17.417787551879883, "learning_rate": 0.0009970974108905143, "logits/chosen": -6.7392425537109375, "logits/rejected": -6.737781524658203, "logps/chosen": -2058.590087890625, "logps/rejected": -1616.5101318359375, "loss": 9.9194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.24297332763672, "rewards/margins": 15.514071464538574, "rewards/rejected": -51.757049560546875, "step": 300 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 0.0009969039049498821, "logits/chosen": -6.1298418045043945, "logits/rejected": -6.130659580230713, "logps/chosen": -1896.272216796875, "logps/rejected": -1627.0184326171875, "loss": 38.6518, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -97.53885650634766, "rewards/margins": -16.450382232666016, "rewards/rejected": -81.08847045898438, "step": 310 }, { "epoch": 0.02, "grad_norm": 5.983738895001811e-15, "learning_rate": 0.0009967103990092497, "logits/chosen": -7.275118827819824, "logits/rejected": -7.274750709533691, "logps/chosen": -1858.8955078125, "logps/rejected": -1355.5494384765625, "loss": 40.6228, "rewards/accuracies": 0.5, "rewards/chosen": -110.50968933105469, "rewards/margins": -30.50033187866211, "rewards/rejected": -80.00936889648438, "step": 320 }, { "epoch": 0.02, "grad_norm": 55.3319091796875, "learning_rate": 0.0009965168930686173, "logits/chosen": -6.929184913635254, "logits/rejected": -6.927158355712891, "logps/chosen": -1939.719482421875, "logps/rejected": -1466.4986572265625, "loss": 25.1373, "rewards/accuracies": 0.5, "rewards/chosen": -79.09292602539062, "rewards/margins": -11.824658393859863, "rewards/rejected": -67.26826477050781, "step": 330 }, { "epoch": 0.02, "grad_norm": 8.277417509816587e-05, "learning_rate": 0.000996323387127985, "logits/chosen": -7.233368873596191, "logits/rejected": -7.231075286865234, "logps/chosen": -1739.8916015625, "logps/rejected": -1302.759033203125, "loss": 60.5064, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -102.33155822753906, "rewards/margins": -34.90766906738281, "rewards/rejected": -67.42388916015625, "step": 340 }, { "epoch": 0.02, "grad_norm": 30.069000244140625, "learning_rate": 0.0009961298811873526, "logits/chosen": -6.4559454917907715, "logits/rejected": -6.447089195251465, "logps/chosen": -1613.5511474609375, "logps/rejected": -1370.2017822265625, "loss": 29.5282, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -125.69710540771484, "rewards/margins": -21.5404109954834, "rewards/rejected": -104.15670013427734, "step": 350 }, { "epoch": 0.02, "grad_norm": 1.2652662917389534e-05, "learning_rate": 0.0009959363752467202, "logits/chosen": -6.617071628570557, "logits/rejected": -6.625367641448975, "logps/chosen": -2038.4144287109375, "logps/rejected": -1954.8209228515625, "loss": 21.0876, "rewards/accuracies": 0.5, "rewards/chosen": -48.580726623535156, "rewards/margins": 0.3278007507324219, "rewards/rejected": -48.90852737426758, "step": 360 }, { "epoch": 0.02, "grad_norm": 85.46837615966797, "learning_rate": 0.0009957428693060876, "logits/chosen": -5.823023319244385, "logits/rejected": -5.813015937805176, "logps/chosen": -2236.2802734375, "logps/rejected": -1743.0999755859375, "loss": 30.3614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -55.43383026123047, "rewards/margins": -24.121402740478516, "rewards/rejected": -31.312427520751953, "step": 370 }, { "epoch": 0.02, "grad_norm": 190.8472442626953, "learning_rate": 0.0009955493633654552, "logits/chosen": -5.652869701385498, "logits/rejected": -5.649645805358887, "logps/chosen": -2216.612548828125, "logps/rejected": -1622.517822265625, "loss": 50.3392, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -109.64753723144531, "rewards/margins": -47.692256927490234, "rewards/rejected": -61.95528030395508, "step": 380 }, { "epoch": 0.02, "grad_norm": 100.28608703613281, "learning_rate": 0.0009953558574248228, "logits/chosen": -7.488648414611816, "logits/rejected": -7.486850738525391, "logps/chosen": -2037.3365478515625, "logps/rejected": -1640.5283203125, "loss": 36.6937, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -93.3482666015625, "rewards/margins": -16.538909912109375, "rewards/rejected": -76.80935668945312, "step": 390 }, { "epoch": 0.02, "grad_norm": 4.202957630157471, "learning_rate": 0.0009951623514841905, "logits/chosen": -8.420580863952637, "logits/rejected": -8.441757202148438, "logps/chosen": -2226.40673828125, "logps/rejected": -2016.0570068359375, "loss": 18.8034, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -108.03814697265625, "rewards/margins": -13.83152961730957, "rewards/rejected": -94.20661926269531, "step": 400 }, { "epoch": 0.02, "grad_norm": 13.611844062805176, "learning_rate": 0.000994968845543558, "logits/chosen": -4.566296577453613, "logits/rejected": -4.562756538391113, "logps/chosen": -2511.823486328125, "logps/rejected": -2156.27294921875, "loss": 2.8049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -49.801551818847656, "rewards/margins": 13.184181213378906, "rewards/rejected": -62.9857292175293, "step": 410 }, { "epoch": 0.02, "grad_norm": 331.4441223144531, "learning_rate": 0.000994775339602926, "logits/chosen": -4.448211669921875, "logits/rejected": -4.398253440856934, "logps/chosen": -2920.37158203125, "logps/rejected": -2699.747314453125, "loss": 37.0523, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.16453552246094, "rewards/margins": -7.370457649230957, "rewards/rejected": -128.79408264160156, "step": 420 }, { "epoch": 0.02, "grad_norm": 147.90342712402344, "learning_rate": 0.0009945818336622935, "logits/chosen": -6.026165962219238, "logits/rejected": -5.995509147644043, "logps/chosen": -1619.7042236328125, "logps/rejected": -1066.2396240234375, "loss": 48.4837, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -93.85808563232422, "rewards/margins": -43.22713088989258, "rewards/rejected": -50.630958557128906, "step": 430 }, { "epoch": 0.03, "grad_norm": 12.26639175415039, "learning_rate": 0.0009943883277216611, "logits/chosen": -8.44426441192627, "logits/rejected": -8.460783004760742, "logps/chosen": -1789.95703125, "logps/rejected": -1610.291748046875, "loss": 17.7103, "rewards/accuracies": 0.5, "rewards/chosen": -46.815208435058594, "rewards/margins": -0.5483169555664062, "rewards/rejected": -46.266883850097656, "step": 440 }, { "epoch": 0.03, "grad_norm": 3.555406502418941e-16, "learning_rate": 0.0009941948217810288, "logits/chosen": -7.77847957611084, "logits/rejected": -7.783029079437256, "logps/chosen": -1707.8134765625, "logps/rejected": -1375.230712890625, "loss": 36.1372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -67.83514404296875, "rewards/margins": -15.93360424041748, "rewards/rejected": -51.90153121948242, "step": 450 }, { "epoch": 0.03, "grad_norm": 1.847238832827536e-16, "learning_rate": 0.0009940013158403964, "logits/chosen": -6.331117153167725, "logits/rejected": -6.306968688964844, "logps/chosen": -1664.856201171875, "logps/rejected": -1407.0579833984375, "loss": 36.79, "rewards/accuracies": 0.5, "rewards/chosen": -96.25984954833984, "rewards/margins": -17.700550079345703, "rewards/rejected": -78.55929565429688, "step": 460 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 0.000993807809899764, "logits/chosen": -7.969399929046631, "logits/rejected": -7.935347080230713, "logps/chosen": -2172.584228515625, "logps/rejected": -1759.8275146484375, "loss": 25.2709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.3980712890625, "rewards/margins": -5.081993103027344, "rewards/rejected": -29.316076278686523, "step": 470 }, { "epoch": 0.03, "grad_norm": 2.6890463808906654e-17, "learning_rate": 0.0009936143039591316, "logits/chosen": -7.759824275970459, "logits/rejected": -7.762341499328613, "logps/chosen": -1762.490966796875, "logps/rejected": -1777.1982421875, "loss": 9.7837, "rewards/accuracies": 0.5, "rewards/chosen": -100.68254852294922, "rewards/margins": 1.9100021123886108, "rewards/rejected": -102.5925521850586, "step": 480 }, { "epoch": 0.03, "grad_norm": 41.718223571777344, "learning_rate": 0.0009934207980184992, "logits/chosen": -6.830227851867676, "logits/rejected": -6.8066301345825195, "logps/chosen": -1634.262451171875, "logps/rejected": -1135.278076171875, "loss": 38.4581, "rewards/accuracies": 0.5, "rewards/chosen": -74.12540435791016, "rewards/margins": -24.716808319091797, "rewards/rejected": -49.40859603881836, "step": 490 }, { "epoch": 0.03, "grad_norm": 1.2868081331253052, "learning_rate": 0.0009932272920778668, "logits/chosen": -8.204156875610352, "logits/rejected": -8.182531356811523, "logps/chosen": -1690.419921875, "logps/rejected": -1684.708984375, "loss": 13.7028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -98.2767333984375, "rewards/margins": 0.4097534120082855, "rewards/rejected": -98.68647766113281, "step": 500 }, { "epoch": 0.03, "grad_norm": 81.09761047363281, "learning_rate": 0.0009930337861372345, "logits/chosen": -7.1385650634765625, "logits/rejected": -7.135669708251953, "logps/chosen": -2110.69580078125, "logps/rejected": -1985.994873046875, "loss": 13.2901, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 33.468162536621094, "rewards/margins": 3.5200119018554688, "rewards/rejected": 29.948156356811523, "step": 510 }, { "epoch": 0.03, "grad_norm": 90.39929962158203, "learning_rate": 0.000992840280196602, "logits/chosen": -6.992276668548584, "logits/rejected": -6.980306148529053, "logps/chosen": -2307.57080078125, "logps/rejected": -1310.16015625, "loss": 65.1338, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -78.27120208740234, "rewards/margins": -45.69062042236328, "rewards/rejected": -32.5805778503418, "step": 520 }, { "epoch": 0.03, "grad_norm": 57.91484069824219, "learning_rate": 0.0009926467742559697, "logits/chosen": -6.971288204193115, "logits/rejected": -6.9686784744262695, "logps/chosen": -1995.400634765625, "logps/rejected": -1571.699462890625, "loss": 23.7459, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.133018493652344, "rewards/margins": -3.5609397888183594, "rewards/rejected": 7.6939568519592285, "step": 530 }, { "epoch": 0.03, "grad_norm": 86.58811950683594, "learning_rate": 0.0009924532683153373, "logits/chosen": -7.894536018371582, "logits/rejected": -7.8725104331970215, "logps/chosen": -2207.85009765625, "logps/rejected": -1694.267578125, "loss": 44.9642, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -140.19595336914062, "rewards/margins": -41.18789291381836, "rewards/rejected": -99.008056640625, "step": 540 }, { "epoch": 0.03, "grad_norm": 5.662807941436768, "learning_rate": 0.000992259762374705, "logits/chosen": -7.573005676269531, "logits/rejected": -7.568861961364746, "logps/chosen": -1783.052734375, "logps/rejected": -1313.08935546875, "loss": 23.6, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -74.45988464355469, "rewards/margins": -16.292369842529297, "rewards/rejected": -58.16752243041992, "step": 550 }, { "epoch": 0.03, "grad_norm": 91.20228576660156, "learning_rate": 0.0009920662564340725, "logits/chosen": -8.139141082763672, "logits/rejected": -8.127764701843262, "logps/chosen": -1948.017822265625, "logps/rejected": -1735.8955078125, "loss": 30.6022, "rewards/accuracies": 0.5, "rewards/chosen": -92.84540557861328, "rewards/margins": -24.18777847290039, "rewards/rejected": -68.65763092041016, "step": 560 }, { "epoch": 0.03, "grad_norm": 37.36322021484375, "learning_rate": 0.0009918727504934402, "logits/chosen": -6.426170349121094, "logits/rejected": -6.414398193359375, "logps/chosen": -1900.140869140625, "logps/rejected": -1476.0205078125, "loss": 23.7704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -88.2837142944336, "rewards/margins": -2.8758177757263184, "rewards/rejected": -85.40789794921875, "step": 570 }, { "epoch": 0.03, "grad_norm": 51.519630432128906, "learning_rate": 0.0009916792445528078, "logits/chosen": -7.693927764892578, "logits/rejected": -7.691755771636963, "logps/chosen": -1643.5306396484375, "logps/rejected": -1194.6959228515625, "loss": 30.4764, "rewards/accuracies": 0.5, "rewards/chosen": -90.55430603027344, "rewards/margins": -5.6063432693481445, "rewards/rejected": -84.94795989990234, "step": 580 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 0.0009914857386121754, "logits/chosen": -8.19835090637207, "logits/rejected": -8.176474571228027, "logps/chosen": -1547.0028076171875, "logps/rejected": -1281.407470703125, "loss": 41.2852, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.08209228515625, "rewards/margins": -24.27876853942871, "rewards/rejected": -98.80331420898438, "step": 590 }, { "epoch": 0.03, "grad_norm": 3.2091870139083767e-07, "learning_rate": 0.000991292232671543, "logits/chosen": -7.381357669830322, "logits/rejected": -7.3774871826171875, "logps/chosen": -1644.5234375, "logps/rejected": -1495.98486328125, "loss": 16.3691, "rewards/accuracies": 0.5, "rewards/chosen": -96.88980102539062, "rewards/margins": -9.926788330078125, "rewards/rejected": -86.96299743652344, "step": 600 }, { "epoch": 0.04, "grad_norm": 31.075862884521484, "learning_rate": 0.0009910987267309106, "logits/chosen": -7.205088138580322, "logits/rejected": -7.234914302825928, "logps/chosen": -1876.4873046875, "logps/rejected": -1723.348388671875, "loss": 14.9459, "rewards/accuracies": 0.5, "rewards/chosen": -88.54505920410156, "rewards/margins": 7.451369285583496, "rewards/rejected": -95.99642944335938, "step": 610 }, { "epoch": 0.04, "grad_norm": 40.83525085449219, "learning_rate": 0.0009909052207902782, "logits/chosen": -7.004599571228027, "logits/rejected": -6.97506046295166, "logps/chosen": -2039.6810302734375, "logps/rejected": -1358.8668212890625, "loss": 28.5663, "rewards/accuracies": 0.5, "rewards/chosen": -73.08271789550781, "rewards/margins": -18.245763778686523, "rewards/rejected": -54.83695602416992, "step": 620 }, { "epoch": 0.04, "grad_norm": 1.222853657366051e-16, "learning_rate": 0.0009907117148496459, "logits/chosen": -7.760983467102051, "logits/rejected": -7.748518943786621, "logps/chosen": -1285.5289306640625, "logps/rejected": -1194.8626708984375, "loss": 15.9791, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -68.13813018798828, "rewards/margins": -6.503204345703125, "rewards/rejected": -61.634925842285156, "step": 630 }, { "epoch": 0.04, "grad_norm": 41.13446044921875, "learning_rate": 0.0009905182089090135, "logits/chosen": -6.013266563415527, "logits/rejected": -5.9806365966796875, "logps/chosen": -1976.6470947265625, "logps/rejected": -1309.6595458984375, "loss": 26.8871, "rewards/accuracies": 0.5, "rewards/chosen": -19.510499954223633, "rewards/margins": -12.427497863769531, "rewards/rejected": -7.083000183105469, "step": 640 }, { "epoch": 0.04, "grad_norm": 37.23004913330078, "learning_rate": 0.000990324702968381, "logits/chosen": -6.275165557861328, "logits/rejected": -6.2306647300720215, "logps/chosen": -1559.93505859375, "logps/rejected": -1011.8458862304688, "loss": 28.6028, "rewards/accuracies": 0.5, "rewards/chosen": -48.93232345581055, "rewards/margins": 1.7890281677246094, "rewards/rejected": -50.721351623535156, "step": 650 }, { "epoch": 0.04, "grad_norm": 5.770204580812788e-08, "learning_rate": 0.0009901311970277487, "logits/chosen": -9.365476608276367, "logits/rejected": -9.369478225708008, "logps/chosen": -1414.155517578125, "logps/rejected": -1435.2095947265625, "loss": 6.0635, "rewards/accuracies": 0.5, "rewards/chosen": -88.15409088134766, "rewards/margins": 3.634843111038208, "rewards/rejected": -91.7889404296875, "step": 660 }, { "epoch": 0.04, "grad_norm": 51.66230392456055, "learning_rate": 0.0009899376910871163, "logits/chosen": -7.515635013580322, "logits/rejected": -7.5245256423950195, "logps/chosen": -1939.2611083984375, "logps/rejected": -2025.1881103515625, "loss": 33.1741, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -108.00618743896484, "rewards/margins": -14.68476676940918, "rewards/rejected": -93.3214111328125, "step": 670 }, { "epoch": 0.04, "grad_norm": 0.46016398072242737, "learning_rate": 0.000989744185146484, "logits/chosen": -5.116194248199463, "logits/rejected": -5.129124641418457, "logps/chosen": -1484.629150390625, "logps/rejected": -1779.163330078125, "loss": 6.4277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -39.893463134765625, "rewards/margins": 17.024633407592773, "rewards/rejected": -56.9180908203125, "step": 680 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 0.0009895506792058516, "logits/chosen": -6.122480869293213, "logits/rejected": -6.1236252784729, "logps/chosen": -2510.695068359375, "logps/rejected": -2396.68896484375, "loss": 8.5141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -75.48404693603516, "rewards/margins": 4.666834354400635, "rewards/rejected": -80.15088653564453, "step": 690 }, { "epoch": 0.04, "grad_norm": 5.408333778381348, "learning_rate": 0.0009893571732652192, "logits/chosen": -8.676216125488281, "logits/rejected": -8.578221321105957, "logps/chosen": -2412.552978515625, "logps/rejected": -1958.117919921875, "loss": 19.3499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 28.8022518157959, "rewards/margins": -5.165684700012207, "rewards/rejected": 33.96793746948242, "step": 700 }, { "epoch": 0.04, "grad_norm": 20.156848907470703, "learning_rate": 0.0009891636673245868, "logits/chosen": -9.68482780456543, "logits/rejected": -9.648934364318848, "logps/chosen": -2727.833984375, "logps/rejected": -2880.28076171875, "loss": 12.5675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -57.87054443359375, "rewards/margins": 0.3588424623012543, "rewards/rejected": -58.22938919067383, "step": 710 }, { "epoch": 0.04, "grad_norm": 28.16060447692871, "learning_rate": 0.0009889701613839544, "logits/chosen": -9.937095642089844, "logits/rejected": -9.906726837158203, "logps/chosen": -2007.8912353515625, "logps/rejected": -1634.097900390625, "loss": 30.1331, "rewards/accuracies": 0.5, "rewards/chosen": -100.7011489868164, "rewards/margins": -19.781085968017578, "rewards/rejected": -80.9200668334961, "step": 720 }, { "epoch": 0.04, "grad_norm": 0.0066229370422661304, "learning_rate": 0.0009887766554433223, "logits/chosen": -7.114504814147949, "logits/rejected": -7.101144313812256, "logps/chosen": -1518.2005615234375, "logps/rejected": -1294.391845703125, "loss": 17.6549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -48.4238166809082, "rewards/margins": 0.9287613034248352, "rewards/rejected": -49.35258102416992, "step": 730 }, { "epoch": 0.04, "grad_norm": 106.27301788330078, "learning_rate": 0.0009885831495026899, "logits/chosen": -6.532594203948975, "logits/rejected": -6.5394697189331055, "logps/chosen": -2377.072265625, "logps/rejected": -2154.76123046875, "loss": 21.9901, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -35.338287353515625, "rewards/margins": -14.821279525756836, "rewards/rejected": -20.51700782775879, "step": 740 }, { "epoch": 0.04, "grad_norm": 164.3705291748047, "learning_rate": 0.0009883896435620575, "logits/chosen": -8.81663703918457, "logits/rejected": -8.813243865966797, "logps/chosen": -1773.4482421875, "logps/rejected": -1665.185302734375, "loss": 46.8302, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -106.53714752197266, "rewards/margins": -46.6954345703125, "rewards/rejected": -59.841712951660156, "step": 750 }, { "epoch": 0.04, "grad_norm": 60.73805236816406, "learning_rate": 0.000988196137621425, "logits/chosen": -9.71679401397705, "logits/rejected": -9.846713066101074, "logps/chosen": -2439.1630859375, "logps/rejected": -1588.63134765625, "loss": 57.6243, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -144.12254333496094, "rewards/margins": -55.06391143798828, "rewards/rejected": -89.05863952636719, "step": 760 }, { "epoch": 0.04, "grad_norm": 4.437787055969238, "learning_rate": 0.0009880026316807927, "logits/chosen": -7.173369407653809, "logits/rejected": -7.189019203186035, "logps/chosen": -1760.888671875, "logps/rejected": -1808.2392578125, "loss": 11.3131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.28813171386719, "rewards/margins": 5.980035305023193, "rewards/rejected": -115.26817321777344, "step": 770 }, { "epoch": 0.05, "grad_norm": 14.652424812316895, "learning_rate": 0.0009878091257401603, "logits/chosen": -7.83901834487915, "logits/rejected": -7.843647003173828, "logps/chosen": -1478.75146484375, "logps/rejected": -1336.068115234375, "loss": 21.1633, "rewards/accuracies": 0.5, "rewards/chosen": -53.1229248046875, "rewards/margins": -5.578841209411621, "rewards/rejected": -47.54408264160156, "step": 780 }, { "epoch": 0.05, "grad_norm": 6.077730655670166, "learning_rate": 0.000987615619799528, "logits/chosen": -6.984256744384766, "logits/rejected": -6.987234592437744, "logps/chosen": -1531.658935546875, "logps/rejected": -1443.79541015625, "loss": 21.1572, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -56.77827072143555, "rewards/margins": 0.22221145033836365, "rewards/rejected": -57.00048065185547, "step": 790 }, { "epoch": 0.05, "grad_norm": 60.97695541381836, "learning_rate": 0.0009874221138588956, "logits/chosen": -9.144891738891602, "logits/rejected": -9.139143943786621, "logps/chosen": -1944.2689208984375, "logps/rejected": -1815.9033203125, "loss": 8.4182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -37.51701736450195, "rewards/margins": 24.46981430053711, "rewards/rejected": -61.9868278503418, "step": 800 }, { "epoch": 0.05, "grad_norm": 4.85383651674387e-12, "learning_rate": 0.000987228607918263, "logits/chosen": -9.460871696472168, "logits/rejected": -9.450078010559082, "logps/chosen": -2167.621337890625, "logps/rejected": -1477.966552734375, "loss": 38.0798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -107.71726989746094, "rewards/margins": -13.28193187713623, "rewards/rejected": -94.43534851074219, "step": 810 }, { "epoch": 0.05, "grad_norm": 107.6178207397461, "learning_rate": 0.0009870351019776306, "logits/chosen": -8.556009292602539, "logits/rejected": -8.529903411865234, "logps/chosen": -1966.944580078125, "logps/rejected": -1742.2620849609375, "loss": 29.4582, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -4.649816989898682, "rewards/margins": -24.525999069213867, "rewards/rejected": 19.87618064880371, "step": 820 }, { "epoch": 0.05, "grad_norm": 0.0006477691349573433, "learning_rate": 0.0009868415960369982, "logits/chosen": -7.82785177230835, "logits/rejected": -7.817160129547119, "logps/chosen": -1930.4517822265625, "logps/rejected": -1799.6431884765625, "loss": 38.0015, "rewards/accuracies": 0.5, "rewards/chosen": -71.70576477050781, "rewards/margins": -8.29159164428711, "rewards/rejected": -63.4141731262207, "step": 830 }, { "epoch": 0.05, "grad_norm": 9.042349802257377e-07, "learning_rate": 0.000986648090096366, "logits/chosen": -9.225390434265137, "logits/rejected": -9.2084321975708, "logps/chosen": -1607.1015625, "logps/rejected": -1331.4697265625, "loss": 26.1539, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -87.90084075927734, "rewards/margins": -20.856107711791992, "rewards/rejected": -67.04473114013672, "step": 840 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 0.0009864545841557337, "logits/chosen": -10.246938705444336, "logits/rejected": -10.253719329833984, "logps/chosen": -2039.2154541015625, "logps/rejected": -2181.553955078125, "loss": 16.825, "rewards/accuracies": 0.5, "rewards/chosen": -61.34428787231445, "rewards/margins": 11.57815933227539, "rewards/rejected": -72.92245483398438, "step": 850 }, { "epoch": 0.05, "grad_norm": 18.48613739013672, "learning_rate": 0.0009862610782151013, "logits/chosen": -8.62929630279541, "logits/rejected": -8.535104751586914, "logps/chosen": -1573.906005859375, "logps/rejected": -1296.0841064453125, "loss": 37.8729, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -62.70831298828125, "rewards/margins": -17.29385757446289, "rewards/rejected": -45.414451599121094, "step": 860 }, { "epoch": 0.05, "grad_norm": 81.5608139038086, "learning_rate": 0.000986067572274469, "logits/chosen": -6.565032958984375, "logits/rejected": -6.5432939529418945, "logps/chosen": -2209.15966796875, "logps/rejected": -1547.0538330078125, "loss": 45.2313, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -109.89766693115234, "rewards/margins": -41.61104965209961, "rewards/rejected": -68.28662872314453, "step": 870 }, { "epoch": 0.05, "grad_norm": 99.08183288574219, "learning_rate": 0.0009858740663338365, "logits/chosen": -8.062918663024902, "logits/rejected": -8.075799942016602, "logps/chosen": -1326.723876953125, "logps/rejected": -1421.094970703125, "loss": 30.42, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.407840728759766, "rewards/margins": -21.12555503845215, "rewards/rejected": 0.7177138328552246, "step": 880 }, { "epoch": 0.05, "grad_norm": 60.86687469482422, "learning_rate": 0.0009856805603932041, "logits/chosen": -8.145303726196289, "logits/rejected": -8.229137420654297, "logps/chosen": -2148.944580078125, "logps/rejected": -1615.262451171875, "loss": 53.1943, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.842607498168945, "rewards/margins": -48.51213836669922, "rewards/rejected": 23.669530868530273, "step": 890 }, { "epoch": 0.05, "grad_norm": 0.00016999452782329172, "learning_rate": 0.0009854870544525717, "logits/chosen": -8.222623825073242, "logits/rejected": -8.17730712890625, "logps/chosen": -1926.1029052734375, "logps/rejected": -1022.8897705078125, "loss": 43.3151, "rewards/accuracies": 0.5, "rewards/chosen": -64.5214614868164, "rewards/margins": -28.152441024780273, "rewards/rejected": -36.3690185546875, "step": 900 }, { "epoch": 0.05, "grad_norm": 0.0003266576968599111, "learning_rate": 0.0009852935485119394, "logits/chosen": -7.4357428550720215, "logits/rejected": -7.418862819671631, "logps/chosen": -1633.501220703125, "logps/rejected": -1177.906005859375, "loss": 33.6961, "rewards/accuracies": 0.5, "rewards/chosen": -89.40547180175781, "rewards/margins": -19.479095458984375, "rewards/rejected": -69.92638397216797, "step": 910 }, { "epoch": 0.05, "grad_norm": 25.1517276763916, "learning_rate": 0.000985100042571307, "logits/chosen": -7.962770938873291, "logits/rejected": -7.968632698059082, "logps/chosen": -1882.302978515625, "logps/rejected": -1514.551025390625, "loss": 42.6509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -83.09789276123047, "rewards/margins": -12.202730178833008, "rewards/rejected": -70.89517211914062, "step": 920 }, { "epoch": 0.05, "grad_norm": 0.23357710242271423, "learning_rate": 0.0009849065366306746, "logits/chosen": -7.124972343444824, "logits/rejected": -7.115461826324463, "logps/chosen": -1946.891357421875, "logps/rejected": -1695.2578125, "loss": 37.0614, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -97.14744567871094, "rewards/margins": -36.36576461791992, "rewards/rejected": -60.78167724609375, "step": 930 }, { "epoch": 0.05, "grad_norm": 1.6709031829958908e-18, "learning_rate": 0.0009847130306900422, "logits/chosen": -8.934821128845215, "logits/rejected": -8.925009727478027, "logps/chosen": -1824.630126953125, "logps/rejected": -1418.1376953125, "loss": 22.1848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -107.84066009521484, "rewards/margins": -11.215415954589844, "rewards/rejected": -96.625244140625, "step": 940 }, { "epoch": 0.05, "grad_norm": 55.970306396484375, "learning_rate": 0.0009845195247494098, "logits/chosen": -8.31924057006836, "logits/rejected": -8.261636734008789, "logps/chosen": -2219.925048828125, "logps/rejected": -1592.0205078125, "loss": 63.5308, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.76979064941406, "rewards/margins": -54.70283126831055, "rewards/rejected": -63.06696701049805, "step": 950 }, { "epoch": 0.06, "grad_norm": 553.7432861328125, "learning_rate": 0.0009843260188087774, "logits/chosen": -8.996217727661133, "logits/rejected": -8.984220504760742, "logps/chosen": -2591.60205078125, "logps/rejected": -2006.3385009765625, "loss": 27.0744, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -116.5793228149414, "rewards/margins": -20.242034912109375, "rewards/rejected": -96.33729553222656, "step": 960 }, { "epoch": 0.06, "grad_norm": 5.756559986632108e-18, "learning_rate": 0.000984132512868145, "logits/chosen": -7.644999027252197, "logits/rejected": -7.629981994628906, "logps/chosen": -1687.9954833984375, "logps/rejected": -1261.870849609375, "loss": 28.2406, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -47.81318664550781, "rewards/margins": -9.336814880371094, "rewards/rejected": -38.47636795043945, "step": 970 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.0009839390069275127, "logits/chosen": -7.443305015563965, "logits/rejected": -7.3800458908081055, "logps/chosen": -1910.897216796875, "logps/rejected": -2167.142333984375, "loss": 30.773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -11.306743621826172, "rewards/margins": -12.11706256866455, "rewards/rejected": 0.8103206753730774, "step": 980 }, { "epoch": 0.06, "grad_norm": 39.48200607299805, "learning_rate": 0.0009837455009868803, "logits/chosen": -9.185789108276367, "logits/rejected": -9.16905689239502, "logps/chosen": -2041.3544921875, "logps/rejected": -1527.89306640625, "loss": 30.9622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.087112426757812, "rewards/margins": -16.073848724365234, "rewards/rejected": -9.013261795043945, "step": 990 }, { "epoch": 0.06, "grad_norm": 3.5318603515625, "learning_rate": 0.000983551995046248, "logits/chosen": -8.630331039428711, "logits/rejected": -8.626287460327148, "logps/chosen": -1319.155517578125, "logps/rejected": -1158.7845458984375, "loss": 12.7353, "rewards/accuracies": 0.5, "rewards/chosen": -41.1593132019043, "rewards/margins": 3.984602451324463, "rewards/rejected": -45.1439208984375, "step": 1000 }, { "epoch": 0.06, "grad_norm": 0.010018604807555676, "learning_rate": 0.0009833584891056155, "logits/chosen": -7.762141227722168, "logits/rejected": -7.731589317321777, "logps/chosen": -1534.88134765625, "logps/rejected": -1177.298583984375, "loss": 29.6225, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -117.82887268066406, "rewards/margins": -27.96295738220215, "rewards/rejected": -89.86591339111328, "step": 1010 }, { "epoch": 0.06, "grad_norm": 0.032573554664850235, "learning_rate": 0.0009831649831649831, "logits/chosen": -7.472870826721191, "logits/rejected": -7.4410529136657715, "logps/chosen": -1070.348876953125, "logps/rejected": -698.7373657226562, "loss": 25.485, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -59.149452209472656, "rewards/margins": -24.477880477905273, "rewards/rejected": -34.671573638916016, "step": 1020 }, { "epoch": 0.06, "grad_norm": 2.197950834670337e-07, "learning_rate": 0.0009829714772243508, "logits/chosen": -7.123406410217285, "logits/rejected": -7.115633487701416, "logps/chosen": -2166.6396484375, "logps/rejected": -1777.6695556640625, "loss": 15.576, "rewards/accuracies": 0.5, "rewards/chosen": -37.82978820800781, "rewards/margins": -4.478743076324463, "rewards/rejected": -33.35104751586914, "step": 1030 }, { "epoch": 0.06, "grad_norm": 50.17607116699219, "learning_rate": 0.0009827779712837184, "logits/chosen": -8.942715644836426, "logits/rejected": -8.933740615844727, "logps/chosen": -1885.5299072265625, "logps/rejected": -1579.527099609375, "loss": 24.6437, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -73.38020324707031, "rewards/margins": -17.982608795166016, "rewards/rejected": -55.3975944519043, "step": 1040 }, { "epoch": 0.06, "grad_norm": 37.867897033691406, "learning_rate": 0.000982584465343086, "logits/chosen": -7.511878967285156, "logits/rejected": -7.52625036239624, "logps/chosen": -1821.907470703125, "logps/rejected": -1849.6656494140625, "loss": 16.8082, "rewards/accuracies": 0.5, "rewards/chosen": -7.0937957763671875, "rewards/margins": 12.769678115844727, "rewards/rejected": -19.863473892211914, "step": 1050 }, { "epoch": 0.06, "grad_norm": 138.6531524658203, "learning_rate": 0.0009823909594024536, "logits/chosen": -8.491132736206055, "logits/rejected": -8.47044563293457, "logps/chosen": -2137.873046875, "logps/rejected": -1448.8956298828125, "loss": 49.8443, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -75.29298400878906, "rewards/margins": -35.3481559753418, "rewards/rejected": -39.94481658935547, "step": 1060 }, { "epoch": 0.06, "grad_norm": 126.18804931640625, "learning_rate": 0.0009821974534618212, "logits/chosen": -8.07949447631836, "logits/rejected": -8.067035675048828, "logps/chosen": -2228.529296875, "logps/rejected": -2319.3828125, "loss": 40.4671, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -73.169921875, "rewards/margins": -16.355478286743164, "rewards/rejected": -56.81444549560547, "step": 1070 }, { "epoch": 0.06, "grad_norm": 72.37720489501953, "learning_rate": 0.0009820039475211889, "logits/chosen": -7.872495174407959, "logits/rejected": -7.8348283767700195, "logps/chosen": -2124.46044921875, "logps/rejected": -1899.4652099609375, "loss": 25.0376, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -136.53567504882812, "rewards/margins": -21.886159896850586, "rewards/rejected": -114.64952087402344, "step": 1080 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.0009818104415805565, "logits/chosen": -8.554733276367188, "logits/rejected": -8.53542709350586, "logps/chosen": -2134.31298828125, "logps/rejected": -1996.0872802734375, "loss": 19.6765, "rewards/accuracies": 0.5, "rewards/chosen": -67.76606750488281, "rewards/margins": 2.5649025440216064, "rewards/rejected": -70.33097839355469, "step": 1090 }, { "epoch": 0.06, "grad_norm": 4.068888187408447, "learning_rate": 0.000981616935639924, "logits/chosen": -8.374502182006836, "logits/rejected": -8.343378067016602, "logps/chosen": -1543.2813720703125, "logps/rejected": -1290.000244140625, "loss": 43.666, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.60623931884766, "rewards/margins": -33.76421356201172, "rewards/rejected": -53.84202194213867, "step": 1100 }, { "epoch": 0.06, "grad_norm": 27.7368221282959, "learning_rate": 0.0009814234296992917, "logits/chosen": -9.944923400878906, "logits/rejected": -9.924177169799805, "logps/chosen": -1865.685791015625, "logps/rejected": -1188.806640625, "loss": 44.9919, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -82.32486724853516, "rewards/margins": -33.791873931884766, "rewards/rejected": -48.532997131347656, "step": 1110 }, { "epoch": 0.06, "grad_norm": 33.65122985839844, "learning_rate": 0.0009812299237586593, "logits/chosen": -6.93060302734375, "logits/rejected": -6.894410133361816, "logps/chosen": -1784.1021728515625, "logps/rejected": -1485.838134765625, "loss": 18.3123, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -78.77552032470703, "rewards/margins": -14.643689155578613, "rewards/rejected": -64.1318359375, "step": 1120 }, { "epoch": 0.07, "grad_norm": 1.129745399452986e-07, "learning_rate": 0.000981036417818027, "logits/chosen": -7.96735143661499, "logits/rejected": -7.9814934730529785, "logps/chosen": -1564.064453125, "logps/rejected": -1641.7113037109375, "loss": 20.1079, "rewards/accuracies": 0.5, "rewards/chosen": -94.07022094726562, "rewards/margins": 2.4283111095428467, "rewards/rejected": -96.49854278564453, "step": 1130 }, { "epoch": 0.07, "grad_norm": 50.030845642089844, "learning_rate": 0.0009808429118773946, "logits/chosen": -7.94497537612915, "logits/rejected": -7.880331993103027, "logps/chosen": -1962.975341796875, "logps/rejected": -1210.947021484375, "loss": 71.8331, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -130.71401977539062, "rewards/margins": -68.34611511230469, "rewards/rejected": -62.3679084777832, "step": 1140 }, { "epoch": 0.07, "grad_norm": 20.39025115966797, "learning_rate": 0.0009806494059367624, "logits/chosen": -7.3807878494262695, "logits/rejected": -7.319799900054932, "logps/chosen": -2247.948486328125, "logps/rejected": -2142.289306640625, "loss": 27.1884, "rewards/accuracies": 0.5, "rewards/chosen": -23.952199935913086, "rewards/margins": 0.8334900140762329, "rewards/rejected": -24.785686492919922, "step": 1150 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00098045589999613, "logits/chosen": -7.437757968902588, "logits/rejected": -7.4623003005981445, "logps/chosen": -1477.5362548828125, "logps/rejected": -1165.3492431640625, "loss": 4.8317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.931201934814453, "rewards/margins": 13.328805923461914, "rewards/rejected": -35.260005950927734, "step": 1160 }, { "epoch": 0.07, "grad_norm": 0.00014720263425260782, "learning_rate": 0.0009802623940554976, "logits/chosen": -7.551999092102051, "logits/rejected": -7.520773887634277, "logps/chosen": -2072.157470703125, "logps/rejected": -1621.1376953125, "loss": 25.7624, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -97.25016784667969, "rewards/margins": -12.680813789367676, "rewards/rejected": -84.5693588256836, "step": 1170 }, { "epoch": 0.07, "grad_norm": 0.016591086983680725, "learning_rate": 0.0009800688881148652, "logits/chosen": -6.15906286239624, "logits/rejected": -6.147068977355957, "logps/chosen": -1857.4156494140625, "logps/rejected": -2204.791748046875, "loss": 22.6409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -71.24910736083984, "rewards/margins": 24.42459487915039, "rewards/rejected": -95.6737060546875, "step": 1180 }, { "epoch": 0.07, "grad_norm": 68.04852294921875, "learning_rate": 0.0009798753821742329, "logits/chosen": -6.548835754394531, "logits/rejected": -6.499093532562256, "logps/chosen": -2390.12939453125, "logps/rejected": -1990.0377197265625, "loss": 44.8353, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -47.3138313293457, "rewards/margins": -34.48733139038086, "rewards/rejected": -12.826502799987793, "step": 1190 }, { "epoch": 0.07, "grad_norm": 15.727879524230957, "learning_rate": 0.0009796818762336005, "logits/chosen": -8.154226303100586, "logits/rejected": -8.085336685180664, "logps/chosen": -2016.595458984375, "logps/rejected": -1434.7130126953125, "loss": 51.6394, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -94.71541595458984, "rewards/margins": -45.62445068359375, "rewards/rejected": -49.090965270996094, "step": 1200 }, { "epoch": 0.07, "grad_norm": 5.880522530543836e-16, "learning_rate": 0.000979488370292968, "logits/chosen": -6.300468444824219, "logits/rejected": -6.2445526123046875, "logps/chosen": -2234.2353515625, "logps/rejected": -2161.817626953125, "loss": 28.6062, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -67.51194763183594, "rewards/margins": -23.11751937866211, "rewards/rejected": -44.3944206237793, "step": 1210 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.0009792948643523357, "logits/chosen": -5.3565473556518555, "logits/rejected": -5.324740409851074, "logps/chosen": -1734.7613525390625, "logps/rejected": -1558.154296875, "loss": 26.7842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.54832458496094, "rewards/margins": -13.02184772491455, "rewards/rejected": -56.52648162841797, "step": 1220 }, { "epoch": 0.07, "grad_norm": 101.35572814941406, "learning_rate": 0.0009791013584117033, "logits/chosen": -8.729275703430176, "logits/rejected": -8.719625473022461, "logps/chosen": -1618.886474609375, "logps/rejected": -1247.587890625, "loss": 48.5103, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -110.7624282836914, "rewards/margins": -42.865989685058594, "rewards/rejected": -67.89643096923828, "step": 1230 }, { "epoch": 0.07, "grad_norm": 1.2160513401031494, "learning_rate": 0.000978907852471071, "logits/chosen": -7.8919548988342285, "logits/rejected": -7.846997261047363, "logps/chosen": -1990.579833984375, "logps/rejected": -1971.1558837890625, "loss": 39.4688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.21746826171875, "rewards/margins": -21.507339477539062, "rewards/rejected": -78.71012878417969, "step": 1240 }, { "epoch": 0.07, "grad_norm": 38.581817626953125, "learning_rate": 0.0009787143465304386, "logits/chosen": -9.29665470123291, "logits/rejected": -9.241111755371094, "logps/chosen": -2508.069091796875, "logps/rejected": -2176.145751953125, "loss": 30.3692, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -97.83628845214844, "rewards/margins": -25.54680633544922, "rewards/rejected": -72.28949737548828, "step": 1250 }, { "epoch": 0.07, "grad_norm": 305.4879150390625, "learning_rate": 0.0009785208405898062, "logits/chosen": -8.300305366516113, "logits/rejected": -8.229948997497559, "logps/chosen": -2177.04150390625, "logps/rejected": -1670.7955322265625, "loss": 43.5451, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -117.7491226196289, "rewards/margins": -36.815155029296875, "rewards/rejected": -80.9339599609375, "step": 1260 }, { "epoch": 0.07, "grad_norm": 10.970734596252441, "learning_rate": 0.0009783273346491738, "logits/chosen": -7.2775559425354, "logits/rejected": -7.267753601074219, "logps/chosen": -1650.760498046875, "logps/rejected": -1253.8935546875, "loss": 41.8104, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -120.31656646728516, "rewards/margins": -35.6051139831543, "rewards/rejected": -84.71144104003906, "step": 1270 }, { "epoch": 0.07, "grad_norm": 96.39222717285156, "learning_rate": 0.0009781338287085414, "logits/chosen": -8.168400764465332, "logits/rejected": -8.147981643676758, "logps/chosen": -2439.00341796875, "logps/rejected": -2076.854248046875, "loss": 29.9179, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -56.641090393066406, "rewards/margins": -13.626214981079102, "rewards/rejected": -43.01487350463867, "step": 1280 }, { "epoch": 0.07, "grad_norm": 80.65170288085938, "learning_rate": 0.000977940322767909, "logits/chosen": -8.840522766113281, "logits/rejected": -8.829629898071289, "logps/chosen": -1535.2867431640625, "logps/rejected": -1215.056884765625, "loss": 44.3106, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -122.74845886230469, "rewards/margins": -28.422607421875, "rewards/rejected": -94.32585144042969, "step": 1290 }, { "epoch": 0.08, "grad_norm": 223.2726593017578, "learning_rate": 0.0009777468168272766, "logits/chosen": -7.42596960067749, "logits/rejected": -7.409411430358887, "logps/chosen": -2268.366455078125, "logps/rejected": -1646.814208984375, "loss": 37.7311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -112.98590087890625, "rewards/margins": -27.077260971069336, "rewards/rejected": -85.90865325927734, "step": 1300 }, { "epoch": 0.08, "grad_norm": 45.78799057006836, "learning_rate": 0.0009775533108866443, "logits/chosen": -7.765960693359375, "logits/rejected": -7.746672630310059, "logps/chosen": -2241.61083984375, "logps/rejected": -1618.8319091796875, "loss": 28.7014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -65.53253936767578, "rewards/margins": -14.986604690551758, "rewards/rejected": -50.545936584472656, "step": 1310 }, { "epoch": 0.08, "grad_norm": 36.79329299926758, "learning_rate": 0.0009773598049460119, "logits/chosen": -8.958813667297363, "logits/rejected": -8.924650192260742, "logps/chosen": -1629.7197265625, "logps/rejected": -1292.6861572265625, "loss": 47.5791, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -97.43775177001953, "rewards/margins": -27.831262588500977, "rewards/rejected": -69.60647583007812, "step": 1320 }, { "epoch": 0.08, "grad_norm": 65.09713745117188, "learning_rate": 0.0009771662990053795, "logits/chosen": -7.565373420715332, "logits/rejected": -7.536768913269043, "logps/chosen": -1872.3179931640625, "logps/rejected": -1345.3482666015625, "loss": 48.4523, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -63.701698303222656, "rewards/margins": -40.68822479248047, "rewards/rejected": -23.01346778869629, "step": 1330 }, { "epoch": 0.08, "grad_norm": 38.54478073120117, "learning_rate": 0.0009769727930647471, "logits/chosen": -8.484418869018555, "logits/rejected": -8.478303909301758, "logps/chosen": -2259.818603515625, "logps/rejected": -2078.18798828125, "loss": 14.1811, "rewards/accuracies": 0.5, "rewards/chosen": -116.06858825683594, "rewards/margins": 2.456510066986084, "rewards/rejected": -118.52510833740234, "step": 1340 }, { "epoch": 0.08, "grad_norm": 65.99559020996094, "learning_rate": 0.0009767792871241147, "logits/chosen": -8.539983749389648, "logits/rejected": -8.506093978881836, "logps/chosen": -1731.155029296875, "logps/rejected": -1419.294189453125, "loss": 17.7551, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -37.07837677001953, "rewards/margins": -12.608707427978516, "rewards/rejected": -24.469667434692383, "step": 1350 }, { "epoch": 0.08, "grad_norm": 1.7693498315672965e-20, "learning_rate": 0.0009765857811834824, "logits/chosen": -8.46470832824707, "logits/rejected": -8.424703598022461, "logps/chosen": -1781.1849365234375, "logps/rejected": -1550.3284912109375, "loss": 52.481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.343017578125, "rewards/margins": -20.258869171142578, "rewards/rejected": -121.08416748046875, "step": 1360 }, { "epoch": 0.08, "grad_norm": 17.695297241210938, "learning_rate": 0.00097639227524285, "logits/chosen": -7.720228672027588, "logits/rejected": -7.519351959228516, "logps/chosen": -2022.064697265625, "logps/rejected": -1093.033447265625, "loss": 59.6707, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -98.13117980957031, "rewards/margins": -50.15633010864258, "rewards/rejected": -47.97486114501953, "step": 1370 }, { "epoch": 0.08, "grad_norm": 64.73109436035156, "learning_rate": 0.0009761987693022176, "logits/chosen": -6.065072059631348, "logits/rejected": -5.986494064331055, "logps/chosen": -1874.7425537109375, "logps/rejected": -1484.5001220703125, "loss": 31.4266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -56.8227424621582, "rewards/margins": -15.803247451782227, "rewards/rejected": -41.019493103027344, "step": 1380 }, { "epoch": 0.08, "grad_norm": 74.85025787353516, "learning_rate": 0.0009760052633615852, "logits/chosen": -8.458157539367676, "logits/rejected": -8.409811019897461, "logps/chosen": -1826.4771728515625, "logps/rejected": -1739.2740478515625, "loss": 37.2226, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -60.80189895629883, "rewards/margins": -25.014881134033203, "rewards/rejected": -35.78702163696289, "step": 1390 }, { "epoch": 0.08, "grad_norm": 2.9446961357711137e-21, "learning_rate": 0.0009758117574209528, "logits/chosen": -8.373635292053223, "logits/rejected": -8.31163501739502, "logps/chosen": -2051.67041015625, "logps/rejected": -1903.814208984375, "loss": 20.6729, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -98.35816955566406, "rewards/margins": -10.436720848083496, "rewards/rejected": -87.92144775390625, "step": 1400 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.0009756182514803204, "logits/chosen": -6.7837324142456055, "logits/rejected": -6.8227949142456055, "logps/chosen": -1666.1669921875, "logps/rejected": -1838.012939453125, "loss": 17.773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.348132133483887, "rewards/margins": -4.142360687255859, "rewards/rejected": -9.20577335357666, "step": 1410 }, { "epoch": 0.08, "grad_norm": 38.694400787353516, "learning_rate": 0.000975424745539688, "logits/chosen": -6.294315338134766, "logits/rejected": -6.345691680908203, "logps/chosen": -2077.484619140625, "logps/rejected": -2171.823486328125, "loss": 9.6643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -56.66588592529297, "rewards/margins": 10.624542236328125, "rewards/rejected": -67.2904281616211, "step": 1420 }, { "epoch": 0.08, "grad_norm": 36.135772705078125, "learning_rate": 0.0009752312395990557, "logits/chosen": -8.144807815551758, "logits/rejected": -8.01035213470459, "logps/chosen": -2144.325439453125, "logps/rejected": -1400.167724609375, "loss": 45.3611, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -100.65633392333984, "rewards/margins": -32.58366394042969, "rewards/rejected": -68.07267761230469, "step": 1430 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.0009750377336584233, "logits/chosen": -7.512941837310791, "logits/rejected": -7.497988700866699, "logps/chosen": -2067.428466796875, "logps/rejected": -1876.552001953125, "loss": 24.8064, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -35.319122314453125, "rewards/margins": 3.2814605236053467, "rewards/rejected": -38.600582122802734, "step": 1440 }, { "epoch": 0.08, "grad_norm": 4.0517034879455286e-16, "learning_rate": 0.0009748442277177909, "logits/chosen": -7.357534885406494, "logits/rejected": -7.319121360778809, "logps/chosen": -2388.42529296875, "logps/rejected": -2273.78515625, "loss": 21.6579, "rewards/accuracies": 0.5, "rewards/chosen": -6.9617767333984375, "rewards/margins": -10.770238876342773, "rewards/rejected": 3.808462619781494, "step": 1450 }, { "epoch": 0.08, "grad_norm": 60.52048873901367, "learning_rate": 0.0009746507217771586, "logits/chosen": -7.9142351150512695, "logits/rejected": -7.895684719085693, "logps/chosen": -2100.256591796875, "logps/rejected": -1988.1448974609375, "loss": 23.5193, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -44.888702392578125, "rewards/margins": -12.999128341674805, "rewards/rejected": -31.889572143554688, "step": 1460 }, { "epoch": 0.09, "grad_norm": 32.81068801879883, "learning_rate": 0.0009744572158365262, "logits/chosen": -9.211642265319824, "logits/rejected": -9.089694023132324, "logps/chosen": -2209.76904296875, "logps/rejected": -1718.8363037109375, "loss": 28.9958, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.43879318237305, "rewards/margins": -9.916418075561523, "rewards/rejected": -39.522369384765625, "step": 1470 }, { "epoch": 0.09, "grad_norm": 60.211570739746094, "learning_rate": 0.0009742637098958939, "logits/chosen": -9.086076736450195, "logits/rejected": -8.950191497802734, "logps/chosen": -1939.0224609375, "logps/rejected": -1200.2783203125, "loss": 36.6983, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -50.12044143676758, "rewards/margins": -20.944385528564453, "rewards/rejected": -29.176061630249023, "step": 1480 }, { "epoch": 0.09, "grad_norm": 50.15373992919922, "learning_rate": 0.0009740702039552615, "logits/chosen": -7.842090606689453, "logits/rejected": -7.779275417327881, "logps/chosen": -2259.2939453125, "logps/rejected": -1885.5062255859375, "loss": 23.6776, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -73.8861312866211, "rewards/margins": -3.941044330596924, "rewards/rejected": -69.94508361816406, "step": 1490 }, { "epoch": 0.09, "grad_norm": 51.63887023925781, "learning_rate": 0.0009738766980146291, "logits/chosen": -8.509066581726074, "logits/rejected": -8.306478500366211, "logps/chosen": -1921.1968994140625, "logps/rejected": -1202.583740234375, "loss": 31.4947, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -50.01887893676758, "rewards/margins": -19.60201644897461, "rewards/rejected": -30.416860580444336, "step": 1500 }, { "epoch": 0.09, "grad_norm": 121.82027435302734, "learning_rate": 0.0009736831920739967, "logits/chosen": -7.6200456619262695, "logits/rejected": -7.551143646240234, "logps/chosen": -1986.149169921875, "logps/rejected": -1362.1629638671875, "loss": 53.1372, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -52.91957473754883, "rewards/margins": -48.738037109375, "rewards/rejected": -4.181537628173828, "step": 1510 }, { "epoch": 0.09, "grad_norm": 0.0001525065308669582, "learning_rate": 0.0009734896861333643, "logits/chosen": -7.9340338706970215, "logits/rejected": -7.9247565269470215, "logps/chosen": -1169.1142578125, "logps/rejected": -927.7692260742188, "loss": 14.1187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -70.24722290039062, "rewards/margins": -6.221745014190674, "rewards/rejected": -64.02547454833984, "step": 1520 }, { "epoch": 0.09, "grad_norm": 19.88353157043457, "learning_rate": 0.000973296180192732, "logits/chosen": -8.148111343383789, "logits/rejected": -8.130358695983887, "logps/chosen": -1880.976806640625, "logps/rejected": -1391.1353759765625, "loss": 27.1571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.030031204223633, "rewards/margins": 0.8921524286270142, "rewards/rejected": -11.922184944152832, "step": 1530 }, { "epoch": 0.09, "grad_norm": 8.933474619776671e-12, "learning_rate": 0.0009731026742520996, "logits/chosen": -7.646836757659912, "logits/rejected": -7.652210235595703, "logps/chosen": -1611.8486328125, "logps/rejected": -1429.852783203125, "loss": 16.5023, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -94.43465423583984, "rewards/margins": 7.101515293121338, "rewards/rejected": -101.53617095947266, "step": 1540 }, { "epoch": 0.09, "grad_norm": 68.48359680175781, "learning_rate": 0.0009729091683114672, "logits/chosen": -7.266582489013672, "logits/rejected": -7.245744228363037, "logps/chosen": -2326.060546875, "logps/rejected": -1730.793212890625, "loss": 60.2229, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -102.38539123535156, "rewards/margins": -50.55241012573242, "rewards/rejected": -51.832977294921875, "step": 1550 }, { "epoch": 0.09, "grad_norm": 109.8088607788086, "learning_rate": 0.0009727156623708348, "logits/chosen": -8.618000030517578, "logits/rejected": -8.543818473815918, "logps/chosen": -1370.26953125, "logps/rejected": -1024.7392578125, "loss": 40.5962, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -48.98949432373047, "rewards/margins": -34.686851501464844, "rewards/rejected": -14.302650451660156, "step": 1560 }, { "epoch": 0.09, "grad_norm": 26.455554962158203, "learning_rate": 0.0009725221564302025, "logits/chosen": -8.610920906066895, "logits/rejected": -8.596210479736328, "logps/chosen": -2235.490234375, "logps/rejected": -2140.4306640625, "loss": 23.5524, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -76.42903137207031, "rewards/margins": -17.24564552307129, "rewards/rejected": -59.183387756347656, "step": 1570 }, { "epoch": 0.09, "grad_norm": 86.40593719482422, "learning_rate": 0.00097232865048957, "logits/chosen": -7.23653507232666, "logits/rejected": -7.1738080978393555, "logps/chosen": -1850.064208984375, "logps/rejected": -1360.1488037109375, "loss": 37.4126, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -119.00187683105469, "rewards/margins": -30.030261993408203, "rewards/rejected": -88.97161102294922, "step": 1580 }, { "epoch": 0.09, "grad_norm": 52.52156448364258, "learning_rate": 0.0009721351445489377, "logits/chosen": -8.171455383300781, "logits/rejected": -8.110367774963379, "logps/chosen": -1826.4703369140625, "logps/rejected": -1344.263916015625, "loss": 48.4254, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -85.76456451416016, "rewards/margins": -38.99028778076172, "rewards/rejected": -46.77428436279297, "step": 1590 }, { "epoch": 0.09, "grad_norm": 1.5079461945788353e-07, "learning_rate": 0.0009719416386083053, "logits/chosen": -9.815098762512207, "logits/rejected": -9.852595329284668, "logps/chosen": -2007.177490234375, "logps/rejected": -2021.552734375, "loss": 11.6868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -75.67243957519531, "rewards/margins": 6.976134300231934, "rewards/rejected": -82.64857482910156, "step": 1600 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.0009717481326676729, "logits/chosen": -8.27138900756836, "logits/rejected": -8.239542007446289, "logps/chosen": -1609.1988525390625, "logps/rejected": -1423.694091796875, "loss": 24.2818, "rewards/accuracies": 0.5, "rewards/chosen": -93.28081512451172, "rewards/margins": -2.745077133178711, "rewards/rejected": -90.5357437133789, "step": 1610 }, { "epoch": 0.09, "grad_norm": 35.54193878173828, "learning_rate": 0.0009715546267270405, "logits/chosen": -7.317690849304199, "logits/rejected": -7.320178985595703, "logps/chosen": -1960.015625, "logps/rejected": -1755.7578125, "loss": 27.4055, "rewards/accuracies": 0.5, "rewards/chosen": -117.14512634277344, "rewards/margins": 0.9884254336357117, "rewards/rejected": -118.13356018066406, "step": 1620 }, { "epoch": 0.09, "grad_norm": 34.51939392089844, "learning_rate": 0.0009713611207864081, "logits/chosen": -8.04374885559082, "logits/rejected": -8.045202255249023, "logps/chosen": -1599.79248046875, "logps/rejected": -1318.917236328125, "loss": 20.5356, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -105.35140228271484, "rewards/margins": -20.384166717529297, "rewards/rejected": -84.96724700927734, "step": 1630 }, { "epoch": 0.09, "grad_norm": 28.46761703491211, "learning_rate": 0.0009711676148457757, "logits/chosen": -7.411416053771973, "logits/rejected": -7.407250881195068, "logps/chosen": -1704.020263671875, "logps/rejected": -1506.0062255859375, "loss": 38.671, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -72.16316223144531, "rewards/margins": -28.36284828186035, "rewards/rejected": -43.800315856933594, "step": 1640 }, { "epoch": 0.1, "grad_norm": 11.291749954223633, "learning_rate": 0.0009709741089051434, "logits/chosen": -7.476842403411865, "logits/rejected": -7.473348140716553, "logps/chosen": -2265.518798828125, "logps/rejected": -2109.982421875, "loss": 16.2937, "rewards/accuracies": 0.5, "rewards/chosen": 25.53750991821289, "rewards/margins": -2.2488465309143066, "rewards/rejected": 27.78635597229004, "step": 1650 }, { "epoch": 0.1, "grad_norm": 24.417165756225586, "learning_rate": 0.000970780602964511, "logits/chosen": -9.742887496948242, "logits/rejected": -9.731163024902344, "logps/chosen": -2117.62744140625, "logps/rejected": -1745.2874755859375, "loss": 30.8161, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -67.93427276611328, "rewards/margins": -23.04446792602539, "rewards/rejected": -44.889801025390625, "step": 1660 }, { "epoch": 0.1, "grad_norm": 89.15853118896484, "learning_rate": 0.0009705870970238787, "logits/chosen": -8.699087142944336, "logits/rejected": -8.682539939880371, "logps/chosen": -2143.13671875, "logps/rejected": -1711.0531005859375, "loss": 43.738, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -109.75856018066406, "rewards/margins": -34.504676818847656, "rewards/rejected": -75.25386810302734, "step": 1670 }, { "epoch": 0.1, "grad_norm": 42.19912338256836, "learning_rate": 0.0009703935910832463, "logits/chosen": -7.779648780822754, "logits/rejected": -7.785738945007324, "logps/chosen": -1852.4921875, "logps/rejected": -1970.970458984375, "loss": 11.1899, "rewards/accuracies": 0.5, "rewards/chosen": -44.34136199951172, "rewards/margins": 16.925334930419922, "rewards/rejected": -61.266700744628906, "step": 1680 }, { "epoch": 0.1, "grad_norm": 86.60478210449219, "learning_rate": 0.0009702000851426139, "logits/chosen": -8.579758644104004, "logits/rejected": -8.545761108398438, "logps/chosen": -1709.069091796875, "logps/rejected": -1113.7578125, "loss": 49.206, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -102.47623443603516, "rewards/margins": -44.39268112182617, "rewards/rejected": -58.08354949951172, "step": 1690 }, { "epoch": 0.1, "grad_norm": 111.91002655029297, "learning_rate": 0.0009700065792019816, "logits/chosen": -7.817850589752197, "logits/rejected": -7.784424781799316, "logps/chosen": -2244.33642578125, "logps/rejected": -1963.518798828125, "loss": 20.5947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -11.7116060256958, "rewards/margins": -9.29747200012207, "rewards/rejected": -2.414128065109253, "step": 1700 }, { "epoch": 0.1, "grad_norm": 40.758811950683594, "learning_rate": 0.0009698130732613492, "logits/chosen": -7.11596155166626, "logits/rejected": -7.0788421630859375, "logps/chosen": -1950.0023193359375, "logps/rejected": -1602.921142578125, "loss": 18.777, "rewards/accuracies": 0.5, "rewards/chosen": -15.271013259887695, "rewards/margins": 3.720280408859253, "rewards/rejected": -18.991289138793945, "step": 1710 }, { "epoch": 0.1, "grad_norm": 4.2967311275193335e-17, "learning_rate": 0.0009696195673207168, "logits/chosen": -8.940455436706543, "logits/rejected": -8.910650253295898, "logps/chosen": -1873.5185546875, "logps/rejected": -1587.0504150390625, "loss": 24.8953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.72665023803711, "rewards/margins": -1.255764365196228, "rewards/rejected": -16.470888137817383, "step": 1720 }, { "epoch": 0.1, "grad_norm": 0.0037739200051873922, "learning_rate": 0.0009694260613800844, "logits/chosen": -8.053491592407227, "logits/rejected": -7.981854438781738, "logps/chosen": -1639.5390625, "logps/rejected": -1279.221923828125, "loss": 33.3993, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -94.17420959472656, "rewards/margins": -31.986623764038086, "rewards/rejected": -62.187599182128906, "step": 1730 }, { "epoch": 0.1, "grad_norm": 71.81832122802734, "learning_rate": 0.000969232555439452, "logits/chosen": -7.818819522857666, "logits/rejected": -7.768178462982178, "logps/chosen": -1748.427978515625, "logps/rejected": -1367.477294921875, "loss": 36.234, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -102.74237060546875, "rewards/margins": -27.84381103515625, "rewards/rejected": -74.8985595703125, "step": 1740 }, { "epoch": 0.1, "grad_norm": 9.685249824542552e-05, "learning_rate": 0.0009690390494988196, "logits/chosen": -9.106744766235352, "logits/rejected": -9.079771041870117, "logps/chosen": -2085.8759765625, "logps/rejected": -1620.363525390625, "loss": 20.2653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -55.10689163208008, "rewards/margins": 1.6501598358154297, "rewards/rejected": -56.75707244873047, "step": 1750 }, { "epoch": 0.1, "grad_norm": 1.2833444464727898e-14, "learning_rate": 0.0009688455435581873, "logits/chosen": -9.021505355834961, "logits/rejected": -8.998078346252441, "logps/chosen": -2101.72998046875, "logps/rejected": -1999.4287109375, "loss": 25.2719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -64.54523468017578, "rewards/margins": 7.8180060386657715, "rewards/rejected": -72.36323547363281, "step": 1760 }, { "epoch": 0.1, "grad_norm": 1.977535197434268e-20, "learning_rate": 0.0009686520376175549, "logits/chosen": -7.387155055999756, "logits/rejected": -7.377167701721191, "logps/chosen": -1701.6204833984375, "logps/rejected": -1536.345458984375, "loss": 29.7655, "rewards/accuracies": 0.5, "rewards/chosen": -134.30599975585938, "rewards/margins": -15.959123611450195, "rewards/rejected": -118.34688568115234, "step": 1770 }, { "epoch": 0.1, "grad_norm": 5.217110042045857e-14, "learning_rate": 0.0009684585316769226, "logits/chosen": -8.552217483520508, "logits/rejected": -8.532529830932617, "logps/chosen": -1985.015625, "logps/rejected": -1696.8167724609375, "loss": 38.0962, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -114.17203521728516, "rewards/margins": -34.65470504760742, "rewards/rejected": -79.51734161376953, "step": 1780 }, { "epoch": 0.1, "grad_norm": 103.48637390136719, "learning_rate": 0.0009682650257362902, "logits/chosen": -8.813270568847656, "logits/rejected": -8.793573379516602, "logps/chosen": -1915.3870849609375, "logps/rejected": -1496.147705078125, "loss": 32.7193, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -43.315162658691406, "rewards/margins": -17.670047760009766, "rewards/rejected": -25.64510726928711, "step": 1790 }, { "epoch": 0.1, "grad_norm": 34.343990325927734, "learning_rate": 0.0009680715197956577, "logits/chosen": -7.866710662841797, "logits/rejected": -7.849493980407715, "logps/chosen": -1767.1392822265625, "logps/rejected": -1761.2861328125, "loss": 19.4187, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.38376539945602417, "rewards/margins": -14.2789306640625, "rewards/rejected": 14.662699699401855, "step": 1800 }, { "epoch": 0.1, "grad_norm": 236.49234008789062, "learning_rate": 0.0009678780138550253, "logits/chosen": -5.5820112228393555, "logits/rejected": -5.7058234214782715, "logps/chosen": -1965.166748046875, "logps/rejected": -2097.878173828125, "loss": 32.9472, "rewards/accuracies": 0.5, "rewards/chosen": -109.33534240722656, "rewards/margins": -10.60175895690918, "rewards/rejected": -98.73358154296875, "step": 1810 }, { "epoch": 0.11, "grad_norm": 37.16081619262695, "learning_rate": 0.000967684507914393, "logits/chosen": -5.733501434326172, "logits/rejected": -5.730005264282227, "logps/chosen": -1850.989013671875, "logps/rejected": -1562.873779296875, "loss": 31.051, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -82.93606567382812, "rewards/margins": -24.54292869567871, "rewards/rejected": -58.39313507080078, "step": 1820 }, { "epoch": 0.11, "grad_norm": 50.9183349609375, "learning_rate": 0.0009674910019737606, "logits/chosen": -10.308603286743164, "logits/rejected": -10.292196273803711, "logps/chosen": -2125.44384765625, "logps/rejected": -1807.393798828125, "loss": 33.5651, "rewards/accuracies": 0.5, "rewards/chosen": -31.5970458984375, "rewards/margins": -20.07107162475586, "rewards/rejected": -11.525980949401855, "step": 1830 }, { "epoch": 0.11, "grad_norm": 0.007354178931564093, "learning_rate": 0.0009672974960331282, "logits/chosen": -9.83216667175293, "logits/rejected": -9.810057640075684, "logps/chosen": -1305.5341796875, "logps/rejected": -1346.635498046875, "loss": 13.8547, "rewards/accuracies": 0.5, "rewards/chosen": -44.713775634765625, "rewards/margins": 12.01432991027832, "rewards/rejected": -56.72810745239258, "step": 1840 }, { "epoch": 0.11, "grad_norm": 51.162315368652344, "learning_rate": 0.0009671039900924958, "logits/chosen": -7.621107578277588, "logits/rejected": -7.585137844085693, "logps/chosen": -2344.931884765625, "logps/rejected": -1925.6187744140625, "loss": 36.7663, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -92.09931182861328, "rewards/margins": -22.468210220336914, "rewards/rejected": -69.631103515625, "step": 1850 }, { "epoch": 0.11, "grad_norm": 14.465654373168945, "learning_rate": 0.0009669104841518634, "logits/chosen": -7.381169319152832, "logits/rejected": -7.3219122886657715, "logps/chosen": -1549.057861328125, "logps/rejected": -1214.347900390625, "loss": 20.4412, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 5.80072021484375, "rewards/margins": -13.207870483398438, "rewards/rejected": 19.008594512939453, "step": 1860 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.000966716978211231, "logits/chosen": -8.524194717407227, "logits/rejected": -8.499007225036621, "logps/chosen": -2074.05126953125, "logps/rejected": -1896.6654052734375, "loss": 28.259, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 6.624856472015381, "rewards/margins": -8.115541458129883, "rewards/rejected": 14.740402221679688, "step": 1870 }, { "epoch": 0.11, "grad_norm": 9.32387322905015e-08, "learning_rate": 0.0009665234722705988, "logits/chosen": -8.362147331237793, "logits/rejected": -8.347192764282227, "logps/chosen": -1771.491943359375, "logps/rejected": -1709.877685546875, "loss": 30.1676, "rewards/accuracies": 0.5, "rewards/chosen": -49.90721893310547, "rewards/margins": -17.65475082397461, "rewards/rejected": -32.25246810913086, "step": 1880 }, { "epoch": 0.11, "grad_norm": 84.05795288085938, "learning_rate": 0.0009663299663299664, "logits/chosen": -8.820466995239258, "logits/rejected": -8.802046775817871, "logps/chosen": -1833.425537109375, "logps/rejected": -1525.3734130859375, "loss": 28.278, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -47.765968322753906, "rewards/margins": -14.503488540649414, "rewards/rejected": -33.262481689453125, "step": 1890 }, { "epoch": 0.11, "grad_norm": 42.9995231628418, "learning_rate": 0.000966136460389334, "logits/chosen": -9.46498966217041, "logits/rejected": -9.461679458618164, "logps/chosen": -2074.521240234375, "logps/rejected": -1569.4427490234375, "loss": 23.5112, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -65.62633514404297, "rewards/margins": -15.295870780944824, "rewards/rejected": -50.330474853515625, "step": 1900 }, { "epoch": 0.11, "grad_norm": 40.439674377441406, "learning_rate": 0.0009659429544487016, "logits/chosen": -8.281512260437012, "logits/rejected": -8.274741172790527, "logps/chosen": -1563.7633056640625, "logps/rejected": -1311.569091796875, "loss": 38.2862, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -39.674583435058594, "rewards/margins": -30.61505699157715, "rewards/rejected": -9.059521675109863, "step": 1910 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.0009657494485080692, "logits/chosen": -8.332167625427246, "logits/rejected": -8.319662094116211, "logps/chosen": -1730.8695068359375, "logps/rejected": -1981.612060546875, "loss": 19.2332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 11.196869850158691, "rewards/margins": 12.311779975891113, "rewards/rejected": -1.1149094104766846, "step": 1920 }, { "epoch": 0.11, "grad_norm": 0.19797678291797638, "learning_rate": 0.0009655559425674369, "logits/chosen": -8.460433959960938, "logits/rejected": -8.471529960632324, "logps/chosen": -2009.3466796875, "logps/rejected": -1781.789794921875, "loss": 29.3605, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.55167388916016, "rewards/margins": -16.968143463134766, "rewards/rejected": -82.58353424072266, "step": 1930 }, { "epoch": 0.11, "grad_norm": 23.552568435668945, "learning_rate": 0.0009653624366268045, "logits/chosen": -6.394064903259277, "logits/rejected": -6.404267311096191, "logps/chosen": -2495.350341796875, "logps/rejected": -2328.75048828125, "loss": 35.1544, "rewards/accuracies": 0.5, "rewards/chosen": -40.15266799926758, "rewards/margins": -20.799196243286133, "rewards/rejected": -19.353473663330078, "step": 1940 }, { "epoch": 0.11, "grad_norm": 36.28828811645508, "learning_rate": 0.0009651689306861721, "logits/chosen": -5.670906066894531, "logits/rejected": -5.677260875701904, "logps/chosen": -1714.5826416015625, "logps/rejected": -1681.7587890625, "loss": 5.0423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -50.42284393310547, "rewards/margins": 17.267845153808594, "rewards/rejected": -67.69068908691406, "step": 1950 }, { "epoch": 0.11, "grad_norm": 2.5229310196550614e-08, "learning_rate": 0.0009649754247455397, "logits/chosen": -6.8084540367126465, "logits/rejected": -6.798170566558838, "logps/chosen": -1855.761474609375, "logps/rejected": -1686.716064453125, "loss": 23.8611, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -127.67049407958984, "rewards/margins": -12.541595458984375, "rewards/rejected": -115.12889099121094, "step": 1960 }, { "epoch": 0.11, "grad_norm": 49.67913818359375, "learning_rate": 0.0009647819188049073, "logits/chosen": -8.725693702697754, "logits/rejected": -8.676850318908691, "logps/chosen": -1760.1859130859375, "logps/rejected": -1286.2650146484375, "loss": 29.8684, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -68.31782531738281, "rewards/margins": -9.914042472839355, "rewards/rejected": -58.403785705566406, "step": 1970 }, { "epoch": 0.11, "grad_norm": 3.3024191856384277, "learning_rate": 0.0009645884128642749, "logits/chosen": -7.8712568283081055, "logits/rejected": -7.823086738586426, "logps/chosen": -1462.7293701171875, "logps/rejected": -1107.876708984375, "loss": 35.1368, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -81.5904312133789, "rewards/margins": -28.143627166748047, "rewards/rejected": -53.446800231933594, "step": 1980 }, { "epoch": 0.12, "grad_norm": 25.94232749938965, "learning_rate": 0.0009643949069236427, "logits/chosen": -9.332759857177734, "logits/rejected": -9.343572616577148, "logps/chosen": -1803.928466796875, "logps/rejected": -1921.4326171875, "loss": 17.7393, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -64.08126068115234, "rewards/margins": 7.721364498138428, "rewards/rejected": -71.80262756347656, "step": 1990 }, { "epoch": 0.12, "grad_norm": 17.206417083740234, "learning_rate": 0.0009642014009830103, "logits/chosen": -9.41939926147461, "logits/rejected": -9.411661148071289, "logps/chosen": -1834.918701171875, "logps/rejected": -1781.1572265625, "loss": 12.3864, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -38.08293533325195, "rewards/margins": -3.3317723274230957, "rewards/rejected": -34.75116729736328, "step": 2000 }, { "epoch": 0.12, "grad_norm": 67.61500549316406, "learning_rate": 0.0009640078950423778, "logits/chosen": -8.027521133422852, "logits/rejected": -7.904717445373535, "logps/chosen": -1898.8675537109375, "logps/rejected": -1478.3271484375, "loss": 63.2659, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -63.73240280151367, "rewards/margins": -55.55619430541992, "rewards/rejected": -8.17620849609375, "step": 2010 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.0009638143891017454, "logits/chosen": -8.499006271362305, "logits/rejected": -8.296464920043945, "logps/chosen": -1631.405029296875, "logps/rejected": -688.9022216796875, "loss": 38.7936, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -68.35701751708984, "rewards/margins": -21.068559646606445, "rewards/rejected": -47.28845977783203, "step": 2020 }, { "epoch": 0.12, "grad_norm": 80.14086151123047, "learning_rate": 0.000963620883161113, "logits/chosen": -8.082088470458984, "logits/rejected": -7.983000755310059, "logps/chosen": -1886.3883056640625, "logps/rejected": -1631.2357177734375, "loss": 35.8587, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.073802947998047, "rewards/margins": -28.654077529907227, "rewards/rejected": 7.580275058746338, "step": 2030 }, { "epoch": 0.12, "grad_norm": 6.299870491027832, "learning_rate": 0.0009634273772204806, "logits/chosen": -8.19982624053955, "logits/rejected": -7.970854759216309, "logps/chosen": -1788.4925537109375, "logps/rejected": -1032.66845703125, "loss": 43.2933, "rewards/accuracies": 0.5, "rewards/chosen": -11.495601654052734, "rewards/margins": -23.96548080444336, "rewards/rejected": 12.469879150390625, "step": 2040 }, { "epoch": 0.12, "grad_norm": 52.46317672729492, "learning_rate": 0.0009632338712798483, "logits/chosen": -9.441232681274414, "logits/rejected": -9.396039009094238, "logps/chosen": -2150.636962890625, "logps/rejected": -2006.5560302734375, "loss": 16.8719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.567626953125, "rewards/margins": -0.3174575865268707, "rewards/rejected": -99.25016784667969, "step": 2050 }, { "epoch": 0.12, "grad_norm": 38.7554931640625, "learning_rate": 0.0009630403653392159, "logits/chosen": -8.043155670166016, "logits/rejected": -8.020406723022461, "logps/chosen": -1889.230224609375, "logps/rejected": -1658.5855712890625, "loss": 29.8567, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -47.05185317993164, "rewards/margins": -12.740206718444824, "rewards/rejected": -34.31163787841797, "step": 2060 }, { "epoch": 0.12, "grad_norm": 19.28898811340332, "learning_rate": 0.0009628468593985835, "logits/chosen": -7.5805158615112305, "logits/rejected": -7.498082160949707, "logps/chosen": -2016.1568603515625, "logps/rejected": -1355.259765625, "loss": 39.4079, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -57.88335037231445, "rewards/margins": -23.95041275024414, "rewards/rejected": -33.932945251464844, "step": 2070 }, { "epoch": 0.12, "grad_norm": 39.965328216552734, "learning_rate": 0.0009626533534579511, "logits/chosen": -8.21487045288086, "logits/rejected": -8.146539688110352, "logps/chosen": -1937.1907958984375, "logps/rejected": -1553.6510009765625, "loss": 16.9677, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 5.4885759353637695, "rewards/margins": 1.8022964000701904, "rewards/rejected": 3.6862785816192627, "step": 2080 }, { "epoch": 0.12, "grad_norm": 1.5425318992851412e-21, "learning_rate": 0.0009624598475173188, "logits/chosen": -8.585542678833008, "logits/rejected": -8.545797348022461, "logps/chosen": -1177.827392578125, "logps/rejected": -1078.474853515625, "loss": 21.7884, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -55.502723693847656, "rewards/margins": -7.468135833740234, "rewards/rejected": -48.03458786010742, "step": 2090 }, { "epoch": 0.12, "grad_norm": 1.1866856317597005e-17, "learning_rate": 0.0009622663415766865, "logits/chosen": -8.597369194030762, "logits/rejected": -8.596662521362305, "logps/chosen": -1660.373779296875, "logps/rejected": -1447.9365234375, "loss": 17.8932, "rewards/accuracies": 0.5, "rewards/chosen": -61.36429977416992, "rewards/margins": -8.79279899597168, "rewards/rejected": -52.571502685546875, "step": 2100 }, { "epoch": 0.12, "grad_norm": 70.58448028564453, "learning_rate": 0.0009620728356360541, "logits/chosen": -8.855504989624023, "logits/rejected": -8.863889694213867, "logps/chosen": -1831.5924072265625, "logps/rejected": -1414.8953857421875, "loss": 28.3616, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -37.118003845214844, "rewards/margins": -18.97319221496582, "rewards/rejected": -18.144813537597656, "step": 2110 }, { "epoch": 0.12, "grad_norm": 81.30391693115234, "learning_rate": 0.0009618793296954217, "logits/chosen": -10.325510025024414, "logits/rejected": -10.325668334960938, "logps/chosen": -1801.5947265625, "logps/rejected": -1583.81640625, "loss": 34.635, "rewards/accuracies": 0.5, "rewards/chosen": -20.431861877441406, "rewards/margins": -25.22069549560547, "rewards/rejected": 4.7888288497924805, "step": 2120 }, { "epoch": 0.12, "grad_norm": 0.246703639626503, "learning_rate": 0.0009616858237547893, "logits/chosen": -7.646176338195801, "logits/rejected": -7.633831977844238, "logps/chosen": -1006.6336059570312, "logps/rejected": -1106.060546875, "loss": 27.7839, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -35.247371673583984, "rewards/margins": -20.61563491821289, "rewards/rejected": -14.631738662719727, "step": 2130 }, { "epoch": 0.12, "grad_norm": 78.3028793334961, "learning_rate": 0.0009614923178141569, "logits/chosen": -6.512984275817871, "logits/rejected": -6.518296241760254, "logps/chosen": -2000.348388671875, "logps/rejected": -1890.1025390625, "loss": 31.029, "rewards/accuracies": 0.5, "rewards/chosen": -100.41018676757812, "rewards/margins": -17.137935638427734, "rewards/rejected": -83.27224731445312, "step": 2140 }, { "epoch": 0.12, "grad_norm": 11.368998527526855, "learning_rate": 0.0009612988118735245, "logits/chosen": -6.664881229400635, "logits/rejected": -6.631122589111328, "logps/chosen": -1606.363037109375, "logps/rejected": -1268.3399658203125, "loss": 29.917, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.699636459350586, "rewards/margins": -21.700368881225586, "rewards/rejected": 3.0007309913635254, "step": 2150 }, { "epoch": 0.13, "grad_norm": 1.0103350730314898e-19, "learning_rate": 0.0009611053059328922, "logits/chosen": -8.917917251586914, "logits/rejected": -8.90528678894043, "logps/chosen": -1771.266357421875, "logps/rejected": -1667.772216796875, "loss": 23.9278, "rewards/accuracies": 0.5, "rewards/chosen": -49.72394561767578, "rewards/margins": -2.005159854888916, "rewards/rejected": -47.71878433227539, "step": 2160 }, { "epoch": 0.13, "grad_norm": 97.66191101074219, "learning_rate": 0.0009609117999922598, "logits/chosen": -9.951173782348633, "logits/rejected": -9.913244247436523, "logps/chosen": -1868.1292724609375, "logps/rejected": -1484.8233642578125, "loss": 37.7922, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -50.120094299316406, "rewards/margins": -32.23230743408203, "rewards/rejected": -17.887792587280273, "step": 2170 }, { "epoch": 0.13, "grad_norm": 56.26930618286133, "learning_rate": 0.0009607182940516274, "logits/chosen": -9.787927627563477, "logits/rejected": -9.764455795288086, "logps/chosen": -1492.154541015625, "logps/rejected": -1468.1212158203125, "loss": 12.6451, "rewards/accuracies": 0.5, "rewards/chosen": -11.38939380645752, "rewards/margins": 2.4018073081970215, "rewards/rejected": -13.791200637817383, "step": 2180 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.000960524788110995, "logits/chosen": -8.67394733428955, "logits/rejected": -8.659610748291016, "logps/chosen": -1667.779052734375, "logps/rejected": -1398.1116943359375, "loss": 53.9787, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.18948364257812, "rewards/margins": -29.350147247314453, "rewards/rejected": -70.8393325805664, "step": 2190 }, { "epoch": 0.13, "grad_norm": 49.640411376953125, "learning_rate": 0.0009603312821703627, "logits/chosen": -7.646070957183838, "logits/rejected": -7.635677337646484, "logps/chosen": -1533.352294921875, "logps/rejected": -1453.8572998046875, "loss": 10.6505, "rewards/accuracies": 0.5, "rewards/chosen": -47.3289680480957, "rewards/margins": 2.6027228832244873, "rewards/rejected": -49.93169021606445, "step": 2200 }, { "epoch": 0.13, "grad_norm": 35.73087692260742, "learning_rate": 0.0009601377762297304, "logits/chosen": -8.35194206237793, "logits/rejected": -8.355508804321289, "logps/chosen": -2118.07177734375, "logps/rejected": -1962.0325927734375, "loss": 31.1677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.821937561035156, "rewards/margins": 7.361414909362793, "rewards/rejected": -30.183353424072266, "step": 2210 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.000959944270289098, "logits/chosen": -8.269164085388184, "logits/rejected": -8.204458236694336, "logps/chosen": -1682.141357421875, "logps/rejected": -1278.6937255859375, "loss": 36.7824, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -54.5468635559082, "rewards/margins": -21.273515701293945, "rewards/rejected": -33.273345947265625, "step": 2220 }, { "epoch": 0.13, "grad_norm": 75.04695129394531, "learning_rate": 0.0009597507643484655, "logits/chosen": -7.943234920501709, "logits/rejected": -7.868814945220947, "logps/chosen": -2172.23193359375, "logps/rejected": -1459.1005859375, "loss": 42.6266, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -30.911306381225586, "rewards/margins": -31.759876251220703, "rewards/rejected": 0.84857177734375, "step": 2230 }, { "epoch": 0.13, "grad_norm": 32.59028625488281, "learning_rate": 0.0009595572584078331, "logits/chosen": -8.484926223754883, "logits/rejected": -8.457697868347168, "logps/chosen": -1537.080322265625, "logps/rejected": -1174.2108154296875, "loss": 31.8935, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -82.85438537597656, "rewards/margins": -30.123973846435547, "rewards/rejected": -52.73041915893555, "step": 2240 }, { "epoch": 0.13, "grad_norm": 23.617382049560547, "learning_rate": 0.0009593637524672007, "logits/chosen": -8.488075256347656, "logits/rejected": -8.51951789855957, "logps/chosen": -1777.4564208984375, "logps/rejected": -1757.4185791015625, "loss": 20.1253, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -74.72306823730469, "rewards/margins": -6.404117584228516, "rewards/rejected": -68.3189468383789, "step": 2250 }, { "epoch": 0.13, "grad_norm": 5.178889751434326, "learning_rate": 0.0009591702465265683, "logits/chosen": -7.894136905670166, "logits/rejected": -7.8544416427612305, "logps/chosen": -1571.0826416015625, "logps/rejected": -941.2874145507812, "loss": 22.8739, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.387736320495605, "rewards/margins": 6.97629451751709, "rewards/rejected": -22.364028930664062, "step": 2260 }, { "epoch": 0.13, "grad_norm": 82.6334457397461, "learning_rate": 0.000958976740585936, "logits/chosen": -8.570417404174805, "logits/rejected": -8.565065383911133, "logps/chosen": -1794.998779296875, "logps/rejected": -2133.75732421875, "loss": 26.6877, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.82350540161133, "rewards/margins": -12.99291706085205, "rewards/rejected": -23.830585479736328, "step": 2270 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.0009587832346453036, "logits/chosen": -8.653336524963379, "logits/rejected": -8.64594554901123, "logps/chosen": -1919.416748046875, "logps/rejected": -1699.3089599609375, "loss": 40.1333, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -79.43597412109375, "rewards/margins": -20.189697265625, "rewards/rejected": -59.246253967285156, "step": 2280 }, { "epoch": 0.13, "grad_norm": 4.011725237488932e-14, "learning_rate": 0.0009585897287046712, "logits/chosen": -8.419755935668945, "logits/rejected": -8.29222583770752, "logps/chosen": -1642.234375, "logps/rejected": -1104.164794921875, "loss": 30.8626, "rewards/accuracies": 0.5, "rewards/chosen": -91.1643295288086, "rewards/margins": -13.331117630004883, "rewards/rejected": -77.83321380615234, "step": 2290 }, { "epoch": 0.13, "grad_norm": 74.81555938720703, "learning_rate": 0.0009583962227640389, "logits/chosen": -10.006963729858398, "logits/rejected": -10.02695369720459, "logps/chosen": -1383.350341796875, "logps/rejected": -1546.316650390625, "loss": 26.0726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -108.3229751586914, "rewards/margins": 14.854257583618164, "rewards/rejected": -123.17724609375, "step": 2300 }, { "epoch": 0.13, "grad_norm": 1.9729747772216797, "learning_rate": 0.0009582027168234065, "logits/chosen": -8.392050743103027, "logits/rejected": -8.245035171508789, "logps/chosen": -2043.956787109375, "logps/rejected": -1347.7955322265625, "loss": 30.6901, "rewards/accuracies": 0.5, "rewards/chosen": -24.0092830657959, "rewards/margins": -11.1117582321167, "rewards/rejected": -12.8975248336792, "step": 2310 }, { "epoch": 0.13, "grad_norm": 0.30907246470451355, "learning_rate": 0.0009580092108827741, "logits/chosen": -7.899774074554443, "logits/rejected": -7.885196685791016, "logps/chosen": -1938.9713134765625, "logps/rejected": -1765.6790771484375, "loss": 21.6796, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -119.86491394042969, "rewards/margins": -14.898317337036133, "rewards/rejected": -104.96659851074219, "step": 2320 }, { "epoch": 0.13, "grad_norm": 59.73882293701172, "learning_rate": 0.0009578157049421418, "logits/chosen": -9.286079406738281, "logits/rejected": -9.262800216674805, "logps/chosen": -2136.11865234375, "logps/rejected": -1763.690185546875, "loss": 30.4163, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.27587890625, "rewards/margins": -23.875703811645508, "rewards/rejected": -113.4001693725586, "step": 2330 }, { "epoch": 0.14, "grad_norm": 69.77515411376953, "learning_rate": 0.0009576221990015094, "logits/chosen": -7.767483711242676, "logits/rejected": -7.743335723876953, "logps/chosen": -2106.091552734375, "logps/rejected": -2102.513671875, "loss": 12.6372, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -39.76734924316406, "rewards/margins": -3.8550636768341064, "rewards/rejected": -35.91228485107422, "step": 2340 }, { "epoch": 0.14, "grad_norm": 38.90751647949219, "learning_rate": 0.000957428693060877, "logits/chosen": -8.053519248962402, "logits/rejected": -7.994143009185791, "logps/chosen": -1733.8238525390625, "logps/rejected": -1246.834228515625, "loss": 56.6286, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -99.17434692382812, "rewards/margins": -39.289329528808594, "rewards/rejected": -59.885009765625, "step": 2350 }, { "epoch": 0.14, "grad_norm": 46.564231872558594, "learning_rate": 0.0009572351871202446, "logits/chosen": -8.52148723602295, "logits/rejected": -8.433483123779297, "logps/chosen": -2386.479248046875, "logps/rejected": -1889.744873046875, "loss": 40.4249, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -115.15828704833984, "rewards/margins": -40.012088775634766, "rewards/rejected": -75.14619445800781, "step": 2360 }, { "epoch": 0.14, "grad_norm": 43.59225845336914, "learning_rate": 0.0009570416811796122, "logits/chosen": -7.593846321105957, "logits/rejected": -7.569089412689209, "logps/chosen": -1957.432861328125, "logps/rejected": -1365.9359130859375, "loss": 33.166, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -71.03755187988281, "rewards/margins": -11.335607528686523, "rewards/rejected": -59.701942443847656, "step": 2370 }, { "epoch": 0.14, "grad_norm": 85.73473358154297, "learning_rate": 0.0009568481752389798, "logits/chosen": -9.201221466064453, "logits/rejected": -9.168712615966797, "logps/chosen": -1673.1103515625, "logps/rejected": -1432.661865234375, "loss": 26.8709, "rewards/accuracies": 0.5, "rewards/chosen": -62.205406188964844, "rewards/margins": -0.18963012099266052, "rewards/rejected": -62.015785217285156, "step": 2380 }, { "epoch": 0.14, "grad_norm": 36.622615814208984, "learning_rate": 0.0009566546692983475, "logits/chosen": -9.251771926879883, "logits/rejected": -9.209451675415039, "logps/chosen": -1665.7750244140625, "logps/rejected": -1126.154296875, "loss": 45.0383, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -98.20699310302734, "rewards/margins": -41.30684280395508, "rewards/rejected": -56.9001579284668, "step": 2390 }, { "epoch": 0.14, "grad_norm": 20.58744239807129, "learning_rate": 0.0009564611633577151, "logits/chosen": -8.86546516418457, "logits/rejected": -8.865130424499512, "logps/chosen": -1978.4537353515625, "logps/rejected": -1347.8375244140625, "loss": 15.8235, "rewards/accuracies": 0.5, "rewards/chosen": -20.896347045898438, "rewards/margins": 16.401729583740234, "rewards/rejected": -37.29807662963867, "step": 2400 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.0009562676574170828, "logits/chosen": -9.031923294067383, "logits/rejected": -9.013383865356445, "logps/chosen": -2137.69384765625, "logps/rejected": -1975.8369140625, "loss": 10.5163, "rewards/accuracies": 0.5, "rewards/chosen": -11.42900276184082, "rewards/margins": 6.2987775802612305, "rewards/rejected": -17.727779388427734, "step": 2410 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.0009560741514764504, "logits/chosen": -8.524648666381836, "logits/rejected": -8.519102096557617, "logps/chosen": -1278.4949951171875, "logps/rejected": -1083.8199462890625, "loss": 20.3882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.27375793457031, "rewards/margins": 6.656881809234619, "rewards/rejected": -75.9306411743164, "step": 2420 }, { "epoch": 0.14, "grad_norm": 70.4651870727539, "learning_rate": 0.000955880645535818, "logits/chosen": -7.365322113037109, "logits/rejected": -7.323695182800293, "logps/chosen": -1779.586181640625, "logps/rejected": -1740.251708984375, "loss": 12.4302, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.472023010253906, "rewards/margins": -3.2132484912872314, "rewards/rejected": -33.25878143310547, "step": 2430 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.0009556871395951857, "logits/chosen": -8.027313232421875, "logits/rejected": -7.971531867980957, "logps/chosen": -1744.704345703125, "logps/rejected": -1501.099853515625, "loss": 10.8731, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -35.930335998535156, "rewards/margins": -0.6950340270996094, "rewards/rejected": -35.23529815673828, "step": 2440 }, { "epoch": 0.14, "grad_norm": 36.62560272216797, "learning_rate": 0.0009554936336545532, "logits/chosen": -8.960521697998047, "logits/rejected": -8.77195930480957, "logps/chosen": -1910.2689208984375, "logps/rejected": -1352.68115234375, "loss": 31.8708, "rewards/accuracies": 0.5, "rewards/chosen": -1.208081841468811, "rewards/margins": -14.809473037719727, "rewards/rejected": 13.601391792297363, "step": 2450 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.0009553001277139208, "logits/chosen": -8.802416801452637, "logits/rejected": -8.5809907913208, "logps/chosen": -2064.247802734375, "logps/rejected": -1490.5078125, "loss": 29.6959, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.31484603881836, "rewards/margins": -12.708564758300781, "rewards/rejected": -12.606282234191895, "step": 2460 }, { "epoch": 0.14, "grad_norm": 73.07392883300781, "learning_rate": 0.0009551066217732884, "logits/chosen": -8.618124961853027, "logits/rejected": -8.541589736938477, "logps/chosen": -1669.269775390625, "logps/rejected": -1588.1112060546875, "loss": 16.2824, "rewards/accuracies": 0.5, "rewards/chosen": 41.7687873840332, "rewards/margins": 12.031316757202148, "rewards/rejected": 29.737472534179688, "step": 2470 }, { "epoch": 0.14, "grad_norm": 25.046228408813477, "learning_rate": 0.000954913115832656, "logits/chosen": -9.366161346435547, "logits/rejected": -9.114850997924805, "logps/chosen": -1576.9505615234375, "logps/rejected": -1018.6384887695312, "loss": 41.0012, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -87.08724975585938, "rewards/margins": -29.43536949157715, "rewards/rejected": -57.651885986328125, "step": 2480 }, { "epoch": 0.14, "grad_norm": 4.975533962249756, "learning_rate": 0.0009547196098920236, "logits/chosen": -8.014814376831055, "logits/rejected": -7.950819492340088, "logps/chosen": -1426.673583984375, "logps/rejected": -1067.341796875, "loss": 25.6877, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -89.20896911621094, "rewards/margins": -11.665274620056152, "rewards/rejected": -77.54369354248047, "step": 2490 }, { "epoch": 0.14, "grad_norm": 104.6783218383789, "learning_rate": 0.0009545261039513913, "logits/chosen": -8.603545188903809, "logits/rejected": -8.506192207336426, "logps/chosen": -1455.7867431640625, "logps/rejected": -1296.89794921875, "loss": 27.0721, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -37.55474853515625, "rewards/margins": -23.062379837036133, "rewards/rejected": -14.49237060546875, "step": 2500 }, { "epoch": 0.15, "grad_norm": 116.59404754638672, "learning_rate": 0.000954332598010759, "logits/chosen": -9.614295959472656, "logits/rejected": -9.442873001098633, "logps/chosen": -2066.77587890625, "logps/rejected": -1804.507080078125, "loss": 30.1897, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -79.23681640625, "rewards/margins": -23.509822845458984, "rewards/rejected": -55.72698974609375, "step": 2510 }, { "epoch": 0.15, "grad_norm": 80.36162567138672, "learning_rate": 0.0009541390920701266, "logits/chosen": -8.773384094238281, "logits/rejected": -8.526317596435547, "logps/chosen": -1645.013671875, "logps/rejected": -1009.7564697265625, "loss": 62.8162, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -129.4744415283203, "rewards/margins": -50.64090347290039, "rewards/rejected": -78.83354187011719, "step": 2520 }, { "epoch": 0.15, "grad_norm": 61.56290054321289, "learning_rate": 0.0009539455861294942, "logits/chosen": -7.889615535736084, "logits/rejected": -7.532788276672363, "logps/chosen": -1807.037353515625, "logps/rejected": -1127.5274658203125, "loss": 58.2033, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -65.68363189697266, "rewards/margins": -49.19841766357422, "rewards/rejected": -16.485210418701172, "step": 2530 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0009537520801888618, "logits/chosen": -7.57016134262085, "logits/rejected": -7.375388145446777, "logps/chosen": -1901.3665771484375, "logps/rejected": -1490.226318359375, "loss": 34.3605, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -82.78834533691406, "rewards/margins": -7.998466491699219, "rewards/rejected": -74.78987884521484, "step": 2540 }, { "epoch": 0.15, "grad_norm": 47.652339935302734, "learning_rate": 0.0009535585742482294, "logits/chosen": -8.199433326721191, "logits/rejected": -8.102953910827637, "logps/chosen": -1162.2001953125, "logps/rejected": -1056.0950927734375, "loss": 25.9883, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -92.80009460449219, "rewards/margins": -8.939802169799805, "rewards/rejected": -83.86030578613281, "step": 2550 }, { "epoch": 0.15, "grad_norm": 61.87654113769531, "learning_rate": 0.0009533650683075971, "logits/chosen": -7.1704254150390625, "logits/rejected": -7.107861518859863, "logps/chosen": -1952.640625, "logps/rejected": -1636.672607421875, "loss": 8.4217, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -50.60524368286133, "rewards/margins": 10.172021865844727, "rewards/rejected": -60.77726364135742, "step": 2560 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0009531715623669647, "logits/chosen": -7.678842067718506, "logits/rejected": -7.6409430503845215, "logps/chosen": -2031.035888671875, "logps/rejected": -1703.3251953125, "loss": 15.653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -49.8093376159668, "rewards/margins": 0.7388870120048523, "rewards/rejected": -50.548221588134766, "step": 2570 }, { "epoch": 0.15, "grad_norm": 1.540708532976609e-11, "learning_rate": 0.0009529780564263323, "logits/chosen": -8.033846855163574, "logits/rejected": -7.992568016052246, "logps/chosen": -1982.4296875, "logps/rejected": -1673.847900390625, "loss": 25.9655, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -58.965126037597656, "rewards/margins": -22.34149742126465, "rewards/rejected": -36.623619079589844, "step": 2580 }, { "epoch": 0.15, "grad_norm": 85.14410400390625, "learning_rate": 0.0009527845504856999, "logits/chosen": -8.955196380615234, "logits/rejected": -8.975152969360352, "logps/chosen": -1675.2181396484375, "logps/rejected": -1743.0810546875, "loss": 9.7934, "rewards/accuracies": 0.5, "rewards/chosen": -33.21003341674805, "rewards/margins": 0.3946201205253601, "rewards/rejected": -33.60465621948242, "step": 2590 }, { "epoch": 0.15, "grad_norm": 5.544691248360323e-06, "learning_rate": 0.0009525910445450675, "logits/chosen": -9.3262300491333, "logits/rejected": -9.341909408569336, "logps/chosen": -1714.206787109375, "logps/rejected": -1571.27978515625, "loss": 27.1725, "rewards/accuracies": 0.5, "rewards/chosen": -104.7972640991211, "rewards/margins": -17.786800384521484, "rewards/rejected": -87.01044464111328, "step": 2600 }, { "epoch": 0.15, "grad_norm": 2.8218672061181267e-20, "learning_rate": 0.0009523975386044351, "logits/chosen": -8.030218124389648, "logits/rejected": -8.01695442199707, "logps/chosen": -1291.7537841796875, "logps/rejected": -1078.449951171875, "loss": 20.119, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.210840225219727, "rewards/margins": -7.69257116317749, "rewards/rejected": -23.518266677856445, "step": 2610 }, { "epoch": 0.15, "grad_norm": 90.51036834716797, "learning_rate": 0.0009522040326638029, "logits/chosen": -8.389528274536133, "logits/rejected": -8.362550735473633, "logps/chosen": -1881.0863037109375, "logps/rejected": -1496.21728515625, "loss": 21.636, "rewards/accuracies": 0.5, "rewards/chosen": -30.479345321655273, "rewards/margins": 1.9197616577148438, "rewards/rejected": -32.399112701416016, "step": 2620 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0009520105267231705, "logits/chosen": -8.082670211791992, "logits/rejected": -8.057915687561035, "logps/chosen": -1775.7073974609375, "logps/rejected": -1473.243896484375, "loss": 20.1535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -67.60094451904297, "rewards/margins": -4.761845588684082, "rewards/rejected": -62.8390998840332, "step": 2630 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0009518170207825381, "logits/chosen": -8.64805793762207, "logits/rejected": -8.62385082244873, "logps/chosen": -2031.5843505859375, "logps/rejected": -1564.435302734375, "loss": 51.5394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -67.17060089111328, "rewards/margins": -30.888172149658203, "rewards/rejected": -36.282432556152344, "step": 2640 }, { "epoch": 0.15, "grad_norm": 2.6358485706590162e-12, "learning_rate": 0.0009516235148419057, "logits/chosen": -7.881623268127441, "logits/rejected": -7.876047611236572, "logps/chosen": -1729.458984375, "logps/rejected": -1549.14892578125, "loss": 27.1109, "rewards/accuracies": 0.5, "rewards/chosen": -80.88059997558594, "rewards/margins": -15.90558910369873, "rewards/rejected": -64.97501373291016, "step": 2650 }, { "epoch": 0.15, "grad_norm": 59.32986068725586, "learning_rate": 0.0009514300089012733, "logits/chosen": -8.070916175842285, "logits/rejected": -8.056241035461426, "logps/chosen": -1447.0455322265625, "logps/rejected": -1410.7982177734375, "loss": 16.5454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -45.635475158691406, "rewards/margins": 1.1339095830917358, "rewards/rejected": -46.769386291503906, "step": 2660 }, { "epoch": 0.15, "grad_norm": 2.830563457641112e-15, "learning_rate": 0.0009512365029606409, "logits/chosen": -8.063379287719727, "logits/rejected": -8.056730270385742, "logps/chosen": -1822.707763671875, "logps/rejected": -1740.5384521484375, "loss": 20.2305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.661903381347656, "rewards/margins": 4.1883544921875, "rewards/rejected": -13.850253105163574, "step": 2670 }, { "epoch": 0.16, "grad_norm": 50.56951904296875, "learning_rate": 0.0009510429970200085, "logits/chosen": -8.20128059387207, "logits/rejected": -8.177210807800293, "logps/chosen": -1971.839111328125, "logps/rejected": -1895.7962646484375, "loss": 13.2691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.15129089355469, "rewards/margins": 1.251373291015625, "rewards/rejected": -119.40267181396484, "step": 2680 }, { "epoch": 0.16, "grad_norm": 26.26957130432129, "learning_rate": 0.0009508494910793761, "logits/chosen": -9.246278762817383, "logits/rejected": -9.143880844116211, "logps/chosen": -1667.1763916015625, "logps/rejected": -1739.029296875, "loss": 23.8294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.946630477905273, "rewards/margins": 13.499870300292969, "rewards/rejected": -40.446502685546875, "step": 2690 }, { "epoch": 0.16, "grad_norm": 38.60537338256836, "learning_rate": 0.0009506559851387437, "logits/chosen": -9.354273796081543, "logits/rejected": -9.301504135131836, "logps/chosen": -1600.0245361328125, "logps/rejected": -1267.498779296875, "loss": 26.2814, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -91.77301788330078, "rewards/margins": -18.044330596923828, "rewards/rejected": -73.72869110107422, "step": 2700 }, { "epoch": 0.16, "grad_norm": 45.220558166503906, "learning_rate": 0.0009504624791981113, "logits/chosen": -8.796186447143555, "logits/rejected": -8.683786392211914, "logps/chosen": -1848.125732421875, "logps/rejected": -1106.113037109375, "loss": 42.7196, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -70.95956420898438, "rewards/margins": -27.61184310913086, "rewards/rejected": -43.347721099853516, "step": 2710 }, { "epoch": 0.16, "grad_norm": 0.018556037917733192, "learning_rate": 0.000950268973257479, "logits/chosen": -8.418424606323242, "logits/rejected": -8.320832252502441, "logps/chosen": -1583.2742919921875, "logps/rejected": -1390.049072265625, "loss": 9.0517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 16.181285858154297, "rewards/margins": 30.28023910522461, "rewards/rejected": -14.098953247070312, "step": 2720 }, { "epoch": 0.16, "grad_norm": 49.996742248535156, "learning_rate": 0.0009500754673168467, "logits/chosen": -7.991931915283203, "logits/rejected": -7.912200927734375, "logps/chosen": -2028.737060546875, "logps/rejected": -1771.1888427734375, "loss": 11.0955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -62.00762176513672, "rewards/margins": 13.416735649108887, "rewards/rejected": -75.42436218261719, "step": 2730 }, { "epoch": 0.16, "grad_norm": 9.799238398500165e-08, "learning_rate": 0.0009498819613762143, "logits/chosen": -8.921686172485352, "logits/rejected": -8.93134880065918, "logps/chosen": -1801.4615478515625, "logps/rejected": -1700.841552734375, "loss": 12.9533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -97.50072479248047, "rewards/margins": -4.528462886810303, "rewards/rejected": -92.97225952148438, "step": 2740 }, { "epoch": 0.16, "grad_norm": 27.514930725097656, "learning_rate": 0.0009496884554355819, "logits/chosen": -8.319608688354492, "logits/rejected": -8.178912162780762, "logps/chosen": -1830.0142822265625, "logps/rejected": -1169.6126708984375, "loss": 38.5968, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -109.9552230834961, "rewards/margins": -23.823511123657227, "rewards/rejected": -86.13169860839844, "step": 2750 }, { "epoch": 0.16, "grad_norm": 26.2659854888916, "learning_rate": 0.0009494949494949495, "logits/chosen": -7.424629211425781, "logits/rejected": -7.209588527679443, "logps/chosen": -1708.3294677734375, "logps/rejected": -1505.0179443359375, "loss": 52.6086, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -89.92652130126953, "rewards/margins": -36.53548812866211, "rewards/rejected": -53.39104080200195, "step": 2760 }, { "epoch": 0.16, "grad_norm": 59.79667282104492, "learning_rate": 0.0009493014435543171, "logits/chosen": -9.200172424316406, "logits/rejected": -9.225387573242188, "logps/chosen": -1688.4632568359375, "logps/rejected": -1881.7620849609375, "loss": 22.8826, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -80.133544921875, "rewards/margins": -6.107524871826172, "rewards/rejected": -74.02601623535156, "step": 2770 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.0009491079376136848, "logits/chosen": -7.611553192138672, "logits/rejected": -7.451489448547363, "logps/chosen": -2033.071533203125, "logps/rejected": -1521.1815185546875, "loss": 37.4426, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -57.56422805786133, "rewards/margins": -12.47274112701416, "rewards/rejected": -45.091487884521484, "step": 2780 }, { "epoch": 0.16, "grad_norm": 2.8613733604743175e-10, "learning_rate": 0.0009489144316730524, "logits/chosen": -7.469753265380859, "logits/rejected": -7.354804992675781, "logps/chosen": -1885.624267578125, "logps/rejected": -1519.5147705078125, "loss": 24.4972, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -81.50711059570312, "rewards/margins": -9.597112655639648, "rewards/rejected": -71.90998840332031, "step": 2790 }, { "epoch": 0.16, "grad_norm": 55.19593048095703, "learning_rate": 0.00094872092573242, "logits/chosen": -9.902843475341797, "logits/rejected": -9.908632278442383, "logps/chosen": -1895.419189453125, "logps/rejected": -1485.3592529296875, "loss": 47.8168, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -80.66827392578125, "rewards/margins": -30.358190536499023, "rewards/rejected": -50.310081481933594, "step": 2800 }, { "epoch": 0.16, "grad_norm": 1.5384600260404113e-07, "learning_rate": 0.0009485274197917876, "logits/chosen": -8.182465553283691, "logits/rejected": -8.144519805908203, "logps/chosen": -1502.842041015625, "logps/rejected": -1528.011962890625, "loss": 12.203, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -84.3280029296875, "rewards/margins": 2.0573391914367676, "rewards/rejected": -86.3853530883789, "step": 2810 }, { "epoch": 0.16, "grad_norm": 21.20224380493164, "learning_rate": 0.0009483339138511553, "logits/chosen": -7.477170467376709, "logits/rejected": -7.461258888244629, "logps/chosen": -2088.64599609375, "logps/rejected": -1624.2044677734375, "loss": 44.2741, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -43.84979248046875, "rewards/margins": -34.98548126220703, "rewards/rejected": -8.864311218261719, "step": 2820 }, { "epoch": 0.16, "grad_norm": 62.08571243286133, "learning_rate": 0.0009481404079105229, "logits/chosen": -8.943631172180176, "logits/rejected": -8.82641315460205, "logps/chosen": -1698.3726806640625, "logps/rejected": -1300.765869140625, "loss": 33.5878, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -134.24673461914062, "rewards/margins": -32.8784065246582, "rewards/rejected": -101.36833953857422, "step": 2830 }, { "epoch": 0.16, "grad_norm": 37.41350555419922, "learning_rate": 0.0009479469019698906, "logits/chosen": -7.703639030456543, "logits/rejected": -7.694466590881348, "logps/chosen": -1782.5657958984375, "logps/rejected": -1484.1053466796875, "loss": 43.4594, "rewards/accuracies": 0.5, "rewards/chosen": -48.82711410522461, "rewards/margins": -35.28961944580078, "rewards/rejected": -13.537503242492676, "step": 2840 }, { "epoch": 0.16, "grad_norm": 52.169517517089844, "learning_rate": 0.0009477533960292582, "logits/chosen": -8.964095115661621, "logits/rejected": -8.932100296020508, "logps/chosen": -1444.97900390625, "logps/rejected": -1224.0574951171875, "loss": 17.6835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.2043514251709, "rewards/margins": -7.231348991394043, "rewards/rejected": -19.97300148010254, "step": 2850 }, { "epoch": 0.17, "grad_norm": 100.22389221191406, "learning_rate": 0.0009475598900886258, "logits/chosen": -8.00173568725586, "logits/rejected": -7.999772548675537, "logps/chosen": -2175.27099609375, "logps/rejected": -1253.23828125, "loss": 41.3499, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -59.393882751464844, "rewards/margins": -10.319540023803711, "rewards/rejected": -49.07435607910156, "step": 2860 }, { "epoch": 0.17, "grad_norm": 47.952701568603516, "learning_rate": 0.0009473663841479934, "logits/chosen": -7.232243537902832, "logits/rejected": -7.244473934173584, "logps/chosen": -1855.568359375, "logps/rejected": -1663.5689697265625, "loss": 22.3714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 11.41168212890625, "rewards/margins": -19.098241806030273, "rewards/rejected": 30.50992202758789, "step": 2870 }, { "epoch": 0.17, "grad_norm": 1.2567484830583453e-08, "learning_rate": 0.000947172878207361, "logits/chosen": -9.058934211730957, "logits/rejected": -9.054736137390137, "logps/chosen": -2037.8238525390625, "logps/rejected": -1512.76513671875, "loss": 44.5908, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -144.59347534179688, "rewards/margins": -41.91835021972656, "rewards/rejected": -102.67513275146484, "step": 2880 }, { "epoch": 0.17, "grad_norm": 20.161535263061523, "learning_rate": 0.0009469793722667285, "logits/chosen": -8.377878189086914, "logits/rejected": -8.388252258300781, "logps/chosen": -1741.373046875, "logps/rejected": -1477.035400390625, "loss": 20.9903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -54.6690673828125, "rewards/margins": -3.95695424079895, "rewards/rejected": -50.71211624145508, "step": 2890 }, { "epoch": 0.17, "grad_norm": 16.341520309448242, "learning_rate": 0.0009467858663260962, "logits/chosen": -8.636987686157227, "logits/rejected": -8.650459289550781, "logps/chosen": -2047.110595703125, "logps/rejected": -1817.267822265625, "loss": 10.1831, "rewards/accuracies": 0.5, "rewards/chosen": -16.88031005859375, "rewards/margins": 6.683633327484131, "rewards/rejected": -23.56394386291504, "step": 2900 }, { "epoch": 0.17, "grad_norm": 54.843997955322266, "learning_rate": 0.0009465923603854638, "logits/chosen": -9.341910362243652, "logits/rejected": -9.349281311035156, "logps/chosen": -1856.342041015625, "logps/rejected": -1801.5498046875, "loss": 15.898, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -80.80390930175781, "rewards/margins": -0.35181769728660583, "rewards/rejected": -80.45207977294922, "step": 2910 }, { "epoch": 0.17, "grad_norm": 29.90599250793457, "learning_rate": 0.0009463988544448314, "logits/chosen": -8.800680160522461, "logits/rejected": -8.841449737548828, "logps/chosen": -1600.341552734375, "logps/rejected": -1371.384521484375, "loss": 26.194, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.857484817504883, "rewards/margins": -20.912982940673828, "rewards/rejected": 0.0554962158203125, "step": 2920 }, { "epoch": 0.17, "grad_norm": 24.537702560424805, "learning_rate": 0.0009462053485041991, "logits/chosen": -8.120457649230957, "logits/rejected": -8.109103202819824, "logps/chosen": -1501.355712890625, "logps/rejected": -1297.4967041015625, "loss": 21.4, "rewards/accuracies": 0.5, "rewards/chosen": 2.6735222339630127, "rewards/margins": -12.030122756958008, "rewards/rejected": 14.703645706176758, "step": 2930 }, { "epoch": 0.17, "grad_norm": 52.64665603637695, "learning_rate": 0.0009460118425635667, "logits/chosen": -8.252087593078613, "logits/rejected": -8.228696823120117, "logps/chosen": -1649.754638671875, "logps/rejected": -1443.249755859375, "loss": 28.6136, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -128.8332977294922, "rewards/margins": -18.584646224975586, "rewards/rejected": -110.24867248535156, "step": 2940 }, { "epoch": 0.17, "grad_norm": 45.2380256652832, "learning_rate": 0.0009458183366229344, "logits/chosen": -9.11462116241455, "logits/rejected": -9.077970504760742, "logps/chosen": -1163.755859375, "logps/rejected": -1158.75537109375, "loss": 13.3064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.018287658691406, "rewards/margins": 10.81491470336914, "rewards/rejected": -64.83319854736328, "step": 2950 }, { "epoch": 0.17, "grad_norm": 8.871385944075882e-06, "learning_rate": 0.000945624830682302, "logits/chosen": -9.204428672790527, "logits/rejected": -8.974420547485352, "logps/chosen": -1906.378662109375, "logps/rejected": -1182.7108154296875, "loss": 34.3846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -44.714134216308594, "rewards/margins": -9.558793067932129, "rewards/rejected": -35.155338287353516, "step": 2960 }, { "epoch": 0.17, "grad_norm": 74.43576049804688, "learning_rate": 0.0009454313247416696, "logits/chosen": -7.345269680023193, "logits/rejected": -7.322915077209473, "logps/chosen": -1797.0072021484375, "logps/rejected": -1880.421142578125, "loss": 28.1928, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -96.09190368652344, "rewards/margins": -8.928838729858398, "rewards/rejected": -87.16307067871094, "step": 2970 }, { "epoch": 0.17, "grad_norm": 84.72682189941406, "learning_rate": 0.0009452378188010372, "logits/chosen": -8.211282730102539, "logits/rejected": -7.976790428161621, "logps/chosen": -1905.3548583984375, "logps/rejected": -1440.2342529296875, "loss": 44.4096, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -44.70731735229492, "rewards/margins": -41.60854721069336, "rewards/rejected": -3.0987753868103027, "step": 2980 }, { "epoch": 0.17, "grad_norm": 60.91679382324219, "learning_rate": 0.0009450443128604048, "logits/chosen": -9.169424057006836, "logits/rejected": -9.052993774414062, "logps/chosen": -2419.627685546875, "logps/rejected": -2085.341796875, "loss": 20.9184, "rewards/accuracies": 0.5, "rewards/chosen": 10.370521545410156, "rewards/margins": 0.181355282664299, "rewards/rejected": 10.189165115356445, "step": 2990 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.0009448508069197724, "logits/chosen": -9.824614524841309, "logits/rejected": -9.747113227844238, "logps/chosen": -1986.109375, "logps/rejected": -1707.404052734375, "loss": 32.3656, "rewards/accuracies": 0.5, "rewards/chosen": -43.29988098144531, "rewards/margins": -4.524162292480469, "rewards/rejected": -38.77571487426758, "step": 3000 }, { "epoch": 0.17, "grad_norm": 3.5070399917458417e-07, "learning_rate": 0.00094465730097914, "logits/chosen": -10.342714309692383, "logits/rejected": -10.325729370117188, "logps/chosen": -1588.7701416015625, "logps/rejected": -1534.366455078125, "loss": 17.7123, "rewards/accuracies": 0.5, "rewards/chosen": -59.31488800048828, "rewards/margins": -6.184584140777588, "rewards/rejected": -53.13031005859375, "step": 3010 }, { "epoch": 0.17, "grad_norm": 98.99474334716797, "learning_rate": 0.0009444637950385077, "logits/chosen": -9.627092361450195, "logits/rejected": -9.58442497253418, "logps/chosen": -2146.07666015625, "logps/rejected": -1938.9371337890625, "loss": 47.1243, "rewards/accuracies": 0.5, "rewards/chosen": -87.63168334960938, "rewards/margins": -28.06146812438965, "rewards/rejected": -59.570228576660156, "step": 3020 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.0009442702890978754, "logits/chosen": -7.763548851013184, "logits/rejected": -7.824507713317871, "logps/chosen": -1761.2857666015625, "logps/rejected": -1887.998046875, "loss": 20.6326, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.287753105163574, "rewards/margins": 10.342926025390625, "rewards/rejected": -23.630680084228516, "step": 3030 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.000944076783157243, "logits/chosen": -7.470152378082275, "logits/rejected": -7.337943077087402, "logps/chosen": -1633.31689453125, "logps/rejected": -1436.556640625, "loss": 35.5556, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -84.30223083496094, "rewards/margins": -20.19291114807129, "rewards/rejected": -64.10932159423828, "step": 3040 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.0009438832772166106, "logits/chosen": -7.293765068054199, "logits/rejected": -7.341050624847412, "logps/chosen": -2651.558837890625, "logps/rejected": -2458.374267578125, "loss": 20.6721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -54.521484375, "rewards/margins": 5.517293930053711, "rewards/rejected": -60.03876876831055, "step": 3050 }, { "epoch": 0.18, "grad_norm": 45.5135612487793, "learning_rate": 0.0009436897712759782, "logits/chosen": -7.246234893798828, "logits/rejected": -7.284828186035156, "logps/chosen": -2084.340576171875, "logps/rejected": -1611.0262451171875, "loss": 50.9405, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -128.50344848632812, "rewards/margins": -36.41931915283203, "rewards/rejected": -92.0841293334961, "step": 3060 }, { "epoch": 0.18, "grad_norm": 4.957384853262073e-20, "learning_rate": 0.0009434962653353459, "logits/chosen": -8.08543586730957, "logits/rejected": -8.128222465515137, "logps/chosen": -2039.8804931640625, "logps/rejected": -1336.1590576171875, "loss": 19.2981, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.28295135498047, "rewards/margins": -10.399771690368652, "rewards/rejected": -58.8831787109375, "step": 3070 }, { "epoch": 0.18, "grad_norm": 41.48952102661133, "learning_rate": 0.0009433027593947135, "logits/chosen": -9.095118522644043, "logits/rejected": -9.179288864135742, "logps/chosen": -1582.718505859375, "logps/rejected": -1306.345458984375, "loss": 25.5751, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.8664093017578125, "rewards/margins": -14.787759780883789, "rewards/rejected": 19.65416717529297, "step": 3080 }, { "epoch": 0.18, "grad_norm": 16.11919403076172, "learning_rate": 0.0009431092534540811, "logits/chosen": -9.143038749694824, "logits/rejected": -9.2750825881958, "logps/chosen": -1690.5550537109375, "logps/rejected": -993.7479248046875, "loss": 57.129, "rewards/accuracies": 0.0, "rewards/chosen": -110.12324523925781, "rewards/margins": -57.1290283203125, "rewards/rejected": -52.99422073364258, "step": 3090 }, { "epoch": 0.18, "grad_norm": 0.00015472297673113644, "learning_rate": 0.0009429157475134487, "logits/chosen": -9.146415710449219, "logits/rejected": -9.17653751373291, "logps/chosen": -1695.322265625, "logps/rejected": -1040.815673828125, "loss": 59.0516, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -135.1615447998047, "rewards/margins": -55.175376892089844, "rewards/rejected": -79.98616790771484, "step": 3100 }, { "epoch": 0.18, "grad_norm": 54.78703689575195, "learning_rate": 0.0009427222415728162, "logits/chosen": -7.762447357177734, "logits/rejected": -7.712035179138184, "logps/chosen": -1875.2041015625, "logps/rejected": -1336.5445556640625, "loss": 40.06, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -115.40220642089844, "rewards/margins": -22.766874313354492, "rewards/rejected": -92.63533020019531, "step": 3110 }, { "epoch": 0.18, "grad_norm": 122.95219421386719, "learning_rate": 0.0009425287356321838, "logits/chosen": -7.020888328552246, "logits/rejected": -7.031190395355225, "logps/chosen": -2208.7509765625, "logps/rejected": -1733.9173583984375, "loss": 24.993, "rewards/accuracies": 0.5, "rewards/chosen": -62.5024299621582, "rewards/margins": -5.817954063415527, "rewards/rejected": -56.684486389160156, "step": 3120 }, { "epoch": 0.18, "grad_norm": 22.36829948425293, "learning_rate": 0.0009423352296915515, "logits/chosen": -8.246630668640137, "logits/rejected": -8.246831893920898, "logps/chosen": -1424.98583984375, "logps/rejected": -1307.8731689453125, "loss": 23.0802, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -105.49180603027344, "rewards/margins": -7.852456569671631, "rewards/rejected": -97.63935089111328, "step": 3130 }, { "epoch": 0.18, "grad_norm": 63.98225021362305, "learning_rate": 0.0009421417237509192, "logits/chosen": -7.672812461853027, "logits/rejected": -7.660381317138672, "logps/chosen": -1617.5902099609375, "logps/rejected": -1647.2900390625, "loss": 19.6522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.669034957885742, "rewards/margins": -5.903903484344482, "rewards/rejected": -2.765132427215576, "step": 3140 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.0009419482178102868, "logits/chosen": -9.261918067932129, "logits/rejected": -9.253633499145508, "logps/chosen": -1450.260009765625, "logps/rejected": -1315.0137939453125, "loss": 33.9465, "rewards/accuracies": 0.5, "rewards/chosen": -105.713134765625, "rewards/margins": -7.394974708557129, "rewards/rejected": -98.31816101074219, "step": 3150 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.0009417547118696544, "logits/chosen": -9.516426086425781, "logits/rejected": -9.520073890686035, "logps/chosen": -1931.9654541015625, "logps/rejected": -1991.5009765625, "loss": 18.4026, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -60.502418518066406, "rewards/margins": -0.7114017605781555, "rewards/rejected": -59.79100799560547, "step": 3160 }, { "epoch": 0.18, "grad_norm": 42.975929260253906, "learning_rate": 0.000941561205929022, "logits/chosen": -8.115017890930176, "logits/rejected": -8.108919143676758, "logps/chosen": -1658.740966796875, "logps/rejected": -1877.986572265625, "loss": 19.4704, "rewards/accuracies": 0.5, "rewards/chosen": -109.8653564453125, "rewards/margins": -8.147645950317383, "rewards/rejected": -101.71771240234375, "step": 3170 }, { "epoch": 0.18, "grad_norm": 22.78135871887207, "learning_rate": 0.0009413676999883897, "logits/chosen": -8.86577320098877, "logits/rejected": -8.759061813354492, "logps/chosen": -1994.8658447265625, "logps/rejected": -1203.143798828125, "loss": 18.7622, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.202049255371094, "rewards/margins": 0.3568374514579773, "rewards/rejected": -20.55888557434082, "step": 3180 }, { "epoch": 0.18, "grad_norm": 0.12810315191745758, "learning_rate": 0.0009411741940477573, "logits/chosen": -8.563993453979492, "logits/rejected": -8.49451732635498, "logps/chosen": -1962.2222900390625, "logps/rejected": -1659.9449462890625, "loss": 25.9756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.177130222320557, "rewards/margins": -2.532893419265747, "rewards/rejected": -1.6442378759384155, "step": 3190 }, { "epoch": 0.19, "grad_norm": 137.1099395751953, "learning_rate": 0.0009409806881071249, "logits/chosen": -7.929892063140869, "logits/rejected": -7.855855464935303, "logps/chosen": -1930.891845703125, "logps/rejected": -1571.3343505859375, "loss": 26.3919, "rewards/accuracies": 0.5, "rewards/chosen": -85.64048767089844, "rewards/margins": -13.462579727172852, "rewards/rejected": -72.17790985107422, "step": 3200 }, { "epoch": 0.19, "grad_norm": 48.23454284667969, "learning_rate": 0.0009407871821664925, "logits/chosen": -8.54582691192627, "logits/rejected": -8.52124309539795, "logps/chosen": -1879.377685546875, "logps/rejected": -1241.0185546875, "loss": 53.1241, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -114.0196762084961, "rewards/margins": -48.6369743347168, "rewards/rejected": -65.38270568847656, "step": 3210 }, { "epoch": 0.19, "grad_norm": 33.511077880859375, "learning_rate": 0.0009405936762258601, "logits/chosen": -9.154560089111328, "logits/rejected": -9.11271858215332, "logps/chosen": -1384.902587890625, "logps/rejected": -1327.8260498046875, "loss": 21.5609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -47.35600662231445, "rewards/margins": -10.390185356140137, "rewards/rejected": -36.965816497802734, "step": 3220 }, { "epoch": 0.19, "grad_norm": 1.7175406333714421e-13, "learning_rate": 0.0009404001702852277, "logits/chosen": -9.00403881072998, "logits/rejected": -8.9661865234375, "logps/chosen": -2092.66357421875, "logps/rejected": -1708.127685546875, "loss": 16.0903, "rewards/accuracies": 0.5, "rewards/chosen": -88.5472183227539, "rewards/margins": 6.6815032958984375, "rewards/rejected": -95.22871398925781, "step": 3230 }, { "epoch": 0.19, "grad_norm": 81.72650909423828, "learning_rate": 0.0009402066643445955, "logits/chosen": -8.55872917175293, "logits/rejected": -8.501558303833008, "logps/chosen": -1588.837158203125, "logps/rejected": -1421.61328125, "loss": 17.147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -57.41021728515625, "rewards/margins": 16.075143814086914, "rewards/rejected": -73.48535919189453, "step": 3240 }, { "epoch": 0.19, "grad_norm": 65.39106750488281, "learning_rate": 0.0009400131584039631, "logits/chosen": -8.57758617401123, "logits/rejected": -8.607234001159668, "logps/chosen": -1830.114013671875, "logps/rejected": -1715.6468505859375, "loss": 36.919, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -85.14326477050781, "rewards/margins": -23.33574676513672, "rewards/rejected": -61.80751419067383, "step": 3250 }, { "epoch": 0.19, "grad_norm": 56.57868957519531, "learning_rate": 0.0009398196524633307, "logits/chosen": -7.575648307800293, "logits/rejected": -7.571455478668213, "logps/chosen": -2116.114013671875, "logps/rejected": -2225.2626953125, "loss": 14.7475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -54.7210578918457, "rewards/margins": -5.079890251159668, "rewards/rejected": -49.64116668701172, "step": 3260 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.0009396261465226983, "logits/chosen": -8.079916000366211, "logits/rejected": -8.069523811340332, "logps/chosen": -1719.1546630859375, "logps/rejected": -1464.358642578125, "loss": 12.7145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.318777084350586, "rewards/margins": 3.006699323654175, "rewards/rejected": -29.325485229492188, "step": 3270 }, { "epoch": 0.19, "grad_norm": 13.272704124450684, "learning_rate": 0.0009394326405820659, "logits/chosen": -8.377842903137207, "logits/rejected": -8.401209831237793, "logps/chosen": -2014.120361328125, "logps/rejected": -1343.0169677734375, "loss": 67.414, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -129.47071838378906, "rewards/margins": -53.77128219604492, "rewards/rejected": -75.6994400024414, "step": 3280 }, { "epoch": 0.19, "grad_norm": 19.288372039794922, "learning_rate": 0.0009392391346414336, "logits/chosen": -7.0062127113342285, "logits/rejected": -7.000612735748291, "logps/chosen": -1839.072265625, "logps/rejected": -1592.122314453125, "loss": 27.9509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -78.78765106201172, "rewards/margins": -6.367119789123535, "rewards/rejected": -72.4205322265625, "step": 3290 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.0009390456287008012, "logits/chosen": -8.183120727539062, "logits/rejected": -8.037330627441406, "logps/chosen": -1550.2337646484375, "logps/rejected": -1215.2752685546875, "loss": 27.9075, "rewards/accuracies": 0.5, "rewards/chosen": -59.586387634277344, "rewards/margins": 0.4348411560058594, "rewards/rejected": -60.02122116088867, "step": 3300 }, { "epoch": 0.19, "grad_norm": 11.250005722045898, "learning_rate": 0.0009388521227601688, "logits/chosen": -9.41681957244873, "logits/rejected": -9.437299728393555, "logps/chosen": -1852.452392578125, "logps/rejected": -1441.337646484375, "loss": 25.971, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.30665588378906, "rewards/margins": -17.18233871459961, "rewards/rejected": -19.12431526184082, "step": 3310 }, { "epoch": 0.19, "grad_norm": 5.069759723141942e-08, "learning_rate": 0.0009386586168195363, "logits/chosen": -8.347051620483398, "logits/rejected": -8.327738761901855, "logps/chosen": -1520.29931640625, "logps/rejected": -1357.865478515625, "loss": 5.0763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.52254295349121, "rewards/margins": 26.088369369506836, "rewards/rejected": -44.61091232299805, "step": 3320 }, { "epoch": 0.19, "grad_norm": 0.0010989723959937692, "learning_rate": 0.0009384651108789039, "logits/chosen": -8.001028060913086, "logits/rejected": -7.990425109863281, "logps/chosen": -1768.91015625, "logps/rejected": -1350.28125, "loss": 36.753, "rewards/accuracies": 0.5, "rewards/chosen": -32.58502197265625, "rewards/margins": -8.64805793762207, "rewards/rejected": -23.936965942382812, "step": 3330 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.0009382716049382715, "logits/chosen": -9.122034072875977, "logits/rejected": -9.067682266235352, "logps/chosen": -1981.3179931640625, "logps/rejected": -1283.291015625, "loss": 42.4067, "rewards/accuracies": 0.5, "rewards/chosen": -35.76735305786133, "rewards/margins": -9.308874130249023, "rewards/rejected": -26.458477020263672, "step": 3340 }, { "epoch": 0.19, "grad_norm": 5.860136565085793e-17, "learning_rate": 0.0009380780989976393, "logits/chosen": -8.298300743103027, "logits/rejected": -8.301536560058594, "logps/chosen": -1936.325927734375, "logps/rejected": -1992.4990234375, "loss": 22.4662, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -60.713783264160156, "rewards/margins": -16.433917999267578, "rewards/rejected": -44.279869079589844, "step": 3350 }, { "epoch": 0.19, "grad_norm": 5.5825697927502915e-05, "learning_rate": 0.0009378845930570069, "logits/chosen": -7.574742317199707, "logits/rejected": -7.579817771911621, "logps/chosen": -1555.266357421875, "logps/rejected": -1523.408447265625, "loss": 1.2764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -82.54724884033203, "rewards/margins": 33.401405334472656, "rewards/rejected": -115.94865417480469, "step": 3360 }, { "epoch": 0.2, "grad_norm": 51.54780960083008, "learning_rate": 0.0009376910871163745, "logits/chosen": -7.527673244476318, "logits/rejected": -7.514391899108887, "logps/chosen": -1747.651611328125, "logps/rejected": -1783.7232666015625, "loss": 12.4323, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -73.661865234375, "rewards/margins": 2.2589333057403564, "rewards/rejected": -75.9207992553711, "step": 3370 }, { "epoch": 0.2, "grad_norm": 312.8269958496094, "learning_rate": 0.0009374975811757421, "logits/chosen": -10.864104270935059, "logits/rejected": -10.840837478637695, "logps/chosen": -1134.437255859375, "logps/rejected": -1073.851806640625, "loss": 10.6812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.412623405456543, "rewards/margins": 7.795899868011475, "rewards/rejected": -12.208525657653809, "step": 3380 }, { "epoch": 0.2, "grad_norm": 3.1503116760414946e-19, "learning_rate": 0.0009373040752351097, "logits/chosen": -10.517317771911621, "logits/rejected": -10.490636825561523, "logps/chosen": -1538.049560546875, "logps/rejected": -1245.357666015625, "loss": 41.671, "rewards/accuracies": 0.5, "rewards/chosen": -120.17903900146484, "rewards/margins": -22.985595703125, "rewards/rejected": -97.19344329833984, "step": 3390 }, { "epoch": 0.2, "grad_norm": 0.0026880833320319653, "learning_rate": 0.0009371105692944773, "logits/chosen": -8.805553436279297, "logits/rejected": -8.76444149017334, "logps/chosen": -2195.32080078125, "logps/rejected": -2155.884033203125, "loss": 31.9075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -96.14558410644531, "rewards/margins": -24.059579849243164, "rewards/rejected": -72.08601379394531, "step": 3400 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.000936917063353845, "logits/chosen": -10.216272354125977, "logits/rejected": -10.194625854492188, "logps/chosen": -2269.627685546875, "logps/rejected": -2036.596435546875, "loss": 39.8009, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -55.030784606933594, "rewards/margins": -32.70468521118164, "rewards/rejected": -22.326107025146484, "step": 3410 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.0009367235574132126, "logits/chosen": -9.1826810836792, "logits/rejected": -9.203193664550781, "logps/chosen": -1967.044189453125, "logps/rejected": -1975.177734375, "loss": 30.6792, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.88856506347656, "rewards/margins": -4.957289695739746, "rewards/rejected": -94.93126678466797, "step": 3420 }, { "epoch": 0.2, "grad_norm": 44.37617492675781, "learning_rate": 0.0009365300514725802, "logits/chosen": -8.416610717773438, "logits/rejected": -8.323084831237793, "logps/chosen": -1902.9097900390625, "logps/rejected": -1442.5228271484375, "loss": 24.1803, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -48.32640075683594, "rewards/margins": -4.7899274826049805, "rewards/rejected": -43.536476135253906, "step": 3430 }, { "epoch": 0.2, "grad_norm": 92.9279556274414, "learning_rate": 0.0009363365455319478, "logits/chosen": -8.7667875289917, "logits/rejected": -8.721907615661621, "logps/chosen": -1878.478271484375, "logps/rejected": -1642.673583984375, "loss": 28.5267, "rewards/accuracies": 0.0, "rewards/chosen": -60.85554122924805, "rewards/margins": -28.507537841796875, "rewards/rejected": -32.34800338745117, "step": 3440 }, { "epoch": 0.2, "grad_norm": 0.41050708293914795, "learning_rate": 0.0009361430395913155, "logits/chosen": -6.9395928382873535, "logits/rejected": -6.975235939025879, "logps/chosen": -1973.660888671875, "logps/rejected": -1782.926513671875, "loss": 13.7185, "rewards/accuracies": 0.5, "rewards/chosen": -32.39301300048828, "rewards/margins": -5.101534843444824, "rewards/rejected": -27.29147720336914, "step": 3450 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.0009359495336506832, "logits/chosen": -9.01763916015625, "logits/rejected": -8.992587089538574, "logps/chosen": -1597.4886474609375, "logps/rejected": -785.6058959960938, "loss": 34.7408, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -42.27281951904297, "rewards/margins": 0.14946594834327698, "rewards/rejected": -42.42229461669922, "step": 3460 }, { "epoch": 0.2, "grad_norm": 39.043277740478516, "learning_rate": 0.0009357560277100508, "logits/chosen": -9.726953506469727, "logits/rejected": -9.641809463500977, "logps/chosen": -1857.4033203125, "logps/rejected": -1065.3089599609375, "loss": 32.1824, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -75.990234375, "rewards/margins": -4.064963340759277, "rewards/rejected": -71.92526245117188, "step": 3470 }, { "epoch": 0.2, "grad_norm": 54.37092208862305, "learning_rate": 0.0009355625217694184, "logits/chosen": -9.166987419128418, "logits/rejected": -9.151494979858398, "logps/chosen": -1889.0904541015625, "logps/rejected": -1921.846435546875, "loss": 26.6099, "rewards/accuracies": 0.5, "rewards/chosen": -106.64918518066406, "rewards/margins": -22.196269989013672, "rewards/rejected": -84.45291900634766, "step": 3480 }, { "epoch": 0.2, "grad_norm": 35.78702926635742, "learning_rate": 0.000935369015828786, "logits/chosen": -9.64307689666748, "logits/rejected": -9.497735977172852, "logps/chosen": -1441.833984375, "logps/rejected": -1200.91259765625, "loss": 23.304, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -80.26805114746094, "rewards/margins": -18.958148956298828, "rewards/rejected": -61.309906005859375, "step": 3490 }, { "epoch": 0.2, "grad_norm": 84.96395111083984, "learning_rate": 0.0009351755098881536, "logits/chosen": -8.164396286010742, "logits/rejected": -8.018976211547852, "logps/chosen": -2110.05322265625, "logps/rejected": -1730.657958984375, "loss": 16.3657, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -74.03788757324219, "rewards/margins": -12.259282112121582, "rewards/rejected": -61.77860641479492, "step": 3500 }, { "epoch": 0.2, "grad_norm": 39.48674392700195, "learning_rate": 0.0009349820039475212, "logits/chosen": -7.5890655517578125, "logits/rejected": -7.557645320892334, "logps/chosen": -2068.471923828125, "logps/rejected": -1865.9722900390625, "loss": 28.3164, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.781375885009766, "rewards/margins": -19.058244705200195, "rewards/rejected": -17.72313117980957, "step": 3510 }, { "epoch": 0.2, "grad_norm": 18.290800094604492, "learning_rate": 0.0009347884980068889, "logits/chosen": -8.769168853759766, "logits/rejected": -8.74716854095459, "logps/chosen": -1283.6153564453125, "logps/rejected": -1256.8521728515625, "loss": 32.7075, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -34.51445007324219, "rewards/margins": -29.36822509765625, "rewards/rejected": -5.146224021911621, "step": 3520 }, { "epoch": 0.2, "grad_norm": 0.3554331958293915, "learning_rate": 0.0009345949920662565, "logits/chosen": -8.642909049987793, "logits/rejected": -8.59641170501709, "logps/chosen": -1490.035400390625, "logps/rejected": -1167.065185546875, "loss": 26.1184, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -81.86649322509766, "rewards/margins": -22.30820083618164, "rewards/rejected": -59.55828857421875, "step": 3530 }, { "epoch": 0.2, "grad_norm": 1.883105993270874, "learning_rate": 0.000934401486125624, "logits/chosen": -8.688993453979492, "logits/rejected": -8.725262641906738, "logps/chosen": -1591.6414794921875, "logps/rejected": -1835.1812744140625, "loss": 7.891, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -52.47114181518555, "rewards/margins": 25.257862091064453, "rewards/rejected": -77.72900390625, "step": 3540 }, { "epoch": 0.21, "grad_norm": 67.55298614501953, "learning_rate": 0.0009342079801849916, "logits/chosen": -8.457229614257812, "logits/rejected": -8.46538257598877, "logps/chosen": -2098.803466796875, "logps/rejected": -2233.08447265625, "loss": 29.8484, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -61.2713623046875, "rewards/margins": -18.010547637939453, "rewards/rejected": -43.26081466674805, "step": 3550 }, { "epoch": 0.21, "grad_norm": 3.759286215654356e-10, "learning_rate": 0.0009340144742443593, "logits/chosen": -8.839159965515137, "logits/rejected": -8.828369140625, "logps/chosen": -2107.48291015625, "logps/rejected": -1648.5771484375, "loss": 21.1616, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.30857467651367, "rewards/margins": -4.212553977966309, "rewards/rejected": -29.096023559570312, "step": 3560 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.0009338209683037269, "logits/chosen": -9.299966812133789, "logits/rejected": -9.229998588562012, "logps/chosen": -1486.0494384765625, "logps/rejected": -1201.806396484375, "loss": 33.1521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -54.94166946411133, "rewards/margins": -2.8652215003967285, "rewards/rejected": -52.076438903808594, "step": 3570 }, { "epoch": 0.21, "grad_norm": 77.66106414794922, "learning_rate": 0.0009336274623630946, "logits/chosen": -8.316801071166992, "logits/rejected": -8.260093688964844, "logps/chosen": -2334.58984375, "logps/rejected": -2365.436767578125, "loss": 11.6106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.7174015045166, "rewards/margins": 11.159322738647461, "rewards/rejected": -39.87672805786133, "step": 3580 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.0009334339564224622, "logits/chosen": -7.965493679046631, "logits/rejected": -7.982199192047119, "logps/chosen": -1680.905517578125, "logps/rejected": -1539.408935546875, "loss": 27.7243, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -53.09714889526367, "rewards/margins": -14.039230346679688, "rewards/rejected": -39.05791473388672, "step": 3590 }, { "epoch": 0.21, "grad_norm": 6.218990121237385e-22, "learning_rate": 0.0009332404504818298, "logits/chosen": -8.631247520446777, "logits/rejected": -8.600420951843262, "logps/chosen": -1870.82421875, "logps/rejected": -1598.3408203125, "loss": 42.2846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -109.17115783691406, "rewards/margins": -20.56908416748047, "rewards/rejected": -88.60206604003906, "step": 3600 }, { "epoch": 0.21, "grad_norm": 0.11276492476463318, "learning_rate": 0.0009330469445411974, "logits/chosen": -10.006195068359375, "logits/rejected": -10.004241943359375, "logps/chosen": -1902.5474853515625, "logps/rejected": -1521.999755859375, "loss": 36.6204, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -82.17428588867188, "rewards/margins": -32.20343017578125, "rewards/rejected": -49.97085189819336, "step": 3610 }, { "epoch": 0.21, "grad_norm": 25.001392364501953, "learning_rate": 0.000932853438600565, "logits/chosen": -7.9695868492126465, "logits/rejected": -7.9894585609436035, "logps/chosen": -1299.892822265625, "logps/rejected": -939.4382934570312, "loss": 31.6623, "rewards/accuracies": 0.5, "rewards/chosen": -64.41548156738281, "rewards/margins": -18.030475616455078, "rewards/rejected": -46.385005950927734, "step": 3620 }, { "epoch": 0.21, "grad_norm": 122.78651428222656, "learning_rate": 0.0009326599326599326, "logits/chosen": -8.238987922668457, "logits/rejected": -8.171333312988281, "logps/chosen": -1459.6629638671875, "logps/rejected": -1189.2259521484375, "loss": 48.7459, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -81.27054595947266, "rewards/margins": -32.96311950683594, "rewards/rejected": -48.30741500854492, "step": 3630 }, { "epoch": 0.21, "grad_norm": 2.6993984615192214e-22, "learning_rate": 0.0009324664267193003, "logits/chosen": -8.756455421447754, "logits/rejected": -8.742239952087402, "logps/chosen": -1738.904541015625, "logps/rejected": -1656.112548828125, "loss": 15.4837, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -34.043418884277344, "rewards/margins": 9.753973007202148, "rewards/rejected": -43.797393798828125, "step": 3640 }, { "epoch": 0.21, "grad_norm": 0.003278074786067009, "learning_rate": 0.0009322729207786679, "logits/chosen": -7.754812717437744, "logits/rejected": -7.706968784332275, "logps/chosen": -1874.591064453125, "logps/rejected": -1663.647216796875, "loss": 13.0116, "rewards/accuracies": 0.5, "rewards/chosen": -84.41429901123047, "rewards/margins": -5.945555210113525, "rewards/rejected": -78.46875762939453, "step": 3650 }, { "epoch": 0.21, "grad_norm": 13.069666862487793, "learning_rate": 0.0009320794148380356, "logits/chosen": -7.090424537658691, "logits/rejected": -7.015928745269775, "logps/chosen": -1980.185546875, "logps/rejected": -1696.8248291015625, "loss": 29.0134, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -123.3875732421875, "rewards/margins": -19.15544319152832, "rewards/rejected": -104.23213195800781, "step": 3660 }, { "epoch": 0.21, "grad_norm": 26.407175064086914, "learning_rate": 0.0009318859088974032, "logits/chosen": -9.009489059448242, "logits/rejected": -8.988927841186523, "logps/chosen": -1738.316162109375, "logps/rejected": -1276.080322265625, "loss": 27.6571, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -71.84487915039062, "rewards/margins": -7.659684658050537, "rewards/rejected": -64.18519592285156, "step": 3670 }, { "epoch": 0.21, "grad_norm": 118.88785552978516, "learning_rate": 0.0009316924029567708, "logits/chosen": -8.549696922302246, "logits/rejected": -8.614594459533691, "logps/chosen": -2293.14990234375, "logps/rejected": -1589.32861328125, "loss": 57.325, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -71.53897094726562, "rewards/margins": -52.23163604736328, "rewards/rejected": -19.307342529296875, "step": 3680 }, { "epoch": 0.21, "grad_norm": 3.2755792744865175e-08, "learning_rate": 0.0009314988970161385, "logits/chosen": -8.103835105895996, "logits/rejected": -8.109182357788086, "logps/chosen": -1319.6343994140625, "logps/rejected": -1049.3116455078125, "loss": 22.1922, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -57.08574676513672, "rewards/margins": -17.08016014099121, "rewards/rejected": -40.005592346191406, "step": 3690 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.0009313053910755061, "logits/chosen": -9.373492240905762, "logits/rejected": -9.398412704467773, "logps/chosen": -1561.5469970703125, "logps/rejected": -1270.342041015625, "loss": 16.3641, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.743682861328125, "rewards/margins": 1.6334621906280518, "rewards/rejected": -23.377147674560547, "step": 3700 }, { "epoch": 0.21, "grad_norm": 52.75992965698242, "learning_rate": 0.0009311118851348737, "logits/chosen": -9.435159683227539, "logits/rejected": -9.435627937316895, "logps/chosen": -1899.8248291015625, "logps/rejected": -1153.300048828125, "loss": 57.3794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -116.1534423828125, "rewards/margins": -51.6572380065918, "rewards/rejected": -64.49620056152344, "step": 3710 }, { "epoch": 0.22, "grad_norm": 5.296868177706932e-20, "learning_rate": 0.0009309183791942413, "logits/chosen": -7.851377010345459, "logits/rejected": -7.836795806884766, "logps/chosen": -1904.8876953125, "logps/rejected": -1885.9739990234375, "loss": 28.6139, "rewards/accuracies": 0.5, "rewards/chosen": -89.34260559082031, "rewards/margins": -11.43128776550293, "rewards/rejected": -77.91130828857422, "step": 3720 }, { "epoch": 0.22, "grad_norm": 49.61537170410156, "learning_rate": 0.0009307248732536089, "logits/chosen": -6.985241889953613, "logits/rejected": -7.058312892913818, "logps/chosen": -1465.2744140625, "logps/rejected": -960.4876708984375, "loss": 38.3674, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -64.942626953125, "rewards/margins": -12.207527160644531, "rewards/rejected": -52.73509979248047, "step": 3730 }, { "epoch": 0.22, "grad_norm": 149.73239135742188, "learning_rate": 0.0009305313673129765, "logits/chosen": -9.180254936218262, "logits/rejected": -9.100876808166504, "logps/chosen": -2016.5855712890625, "logps/rejected": -1156.9366455078125, "loss": 66.3236, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -122.90840911865234, "rewards/margins": -36.11726379394531, "rewards/rejected": -86.79115295410156, "step": 3740 }, { "epoch": 0.22, "grad_norm": 7.198091043392196e-05, "learning_rate": 0.0009303378613723442, "logits/chosen": -8.053522109985352, "logits/rejected": -7.945916652679443, "logps/chosen": -1677.7867431640625, "logps/rejected": -1298.8914794921875, "loss": 34.5758, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -94.59349060058594, "rewards/margins": -29.242229461669922, "rewards/rejected": -65.35125732421875, "step": 3750 }, { "epoch": 0.22, "grad_norm": 110.07933044433594, "learning_rate": 0.0009301443554317117, "logits/chosen": -8.880256652832031, "logits/rejected": -8.848882675170898, "logps/chosen": -1678.1800537109375, "logps/rejected": -1383.875732421875, "loss": 31.0354, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.850360870361328, "rewards/margins": -8.747169494628906, "rewards/rejected": -9.103195190429688, "step": 3760 }, { "epoch": 0.22, "grad_norm": 111.64981079101562, "learning_rate": 0.0009299508494910794, "logits/chosen": -9.01764965057373, "logits/rejected": -9.038980484008789, "logps/chosen": -2069.036376953125, "logps/rejected": -2052.003662109375, "loss": 17.268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -89.18209075927734, "rewards/margins": 4.173206329345703, "rewards/rejected": -93.35530090332031, "step": 3770 }, { "epoch": 0.22, "grad_norm": 23.16996192932129, "learning_rate": 0.000929757343550447, "logits/chosen": -9.328783988952637, "logits/rejected": -9.315079689025879, "logps/chosen": -2102.692626953125, "logps/rejected": -2090.3359375, "loss": 17.331, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -63.46915817260742, "rewards/margins": -0.06637725979089737, "rewards/rejected": -63.40277862548828, "step": 3780 }, { "epoch": 0.22, "grad_norm": 0.2662234902381897, "learning_rate": 0.0009295638376098146, "logits/chosen": -8.122942924499512, "logits/rejected": -8.138830184936523, "logps/chosen": -1706.0003662109375, "logps/rejected": -1900.912841796875, "loss": 5.5209, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.124374389648438, "rewards/margins": 25.439327239990234, "rewards/rejected": -48.56370162963867, "step": 3790 }, { "epoch": 0.22, "grad_norm": 27.7899227142334, "learning_rate": 0.0009293703316691822, "logits/chosen": -7.764555931091309, "logits/rejected": -7.935965538024902, "logps/chosen": -1524.53662109375, "logps/rejected": -1710.4273681640625, "loss": 5.7735, "rewards/accuracies": 0.5, "rewards/chosen": -52.72815704345703, "rewards/margins": 22.537277221679688, "rewards/rejected": -75.26542663574219, "step": 3800 }, { "epoch": 0.22, "grad_norm": 50.049354553222656, "learning_rate": 0.0009291768257285499, "logits/chosen": -8.551105499267578, "logits/rejected": -8.604185104370117, "logps/chosen": -1584.148681640625, "logps/rejected": -1465.0657958984375, "loss": 13.0233, "rewards/accuracies": 0.5, "rewards/chosen": 8.117512702941895, "rewards/margins": 5.78646183013916, "rewards/rejected": 2.3310515880584717, "step": 3810 }, { "epoch": 0.22, "grad_norm": 22.41517448425293, "learning_rate": 0.0009289833197879175, "logits/chosen": -9.147610664367676, "logits/rejected": -9.089153289794922, "logps/chosen": -1395.7860107421875, "logps/rejected": -1436.665771484375, "loss": 12.5342, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -38.9852180480957, "rewards/margins": 0.710498034954071, "rewards/rejected": -39.695716857910156, "step": 3820 }, { "epoch": 0.22, "grad_norm": 35.703670501708984, "learning_rate": 0.0009287898138472851, "logits/chosen": -8.687565803527832, "logits/rejected": -8.614180564880371, "logps/chosen": -1943.57421875, "logps/rejected": -1765.571044921875, "loss": 24.518, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -81.33849334716797, "rewards/margins": -16.684629440307617, "rewards/rejected": -64.65385437011719, "step": 3830 }, { "epoch": 0.22, "grad_norm": 0.12690427899360657, "learning_rate": 0.0009285963079066527, "logits/chosen": -10.056694984436035, "logits/rejected": -10.065403938293457, "logps/chosen": -1776.019287109375, "logps/rejected": -1509.34228515625, "loss": 17.2433, "rewards/accuracies": 0.5, "rewards/chosen": -32.5283088684082, "rewards/margins": 6.233877658843994, "rewards/rejected": -38.762184143066406, "step": 3840 }, { "epoch": 0.22, "grad_norm": 3.24022789754963e-06, "learning_rate": 0.0009284028019660203, "logits/chosen": -10.28187370300293, "logits/rejected": -10.352188110351562, "logps/chosen": -1778.345703125, "logps/rejected": -1428.49755859375, "loss": 35.0685, "rewards/accuracies": 0.5, "rewards/chosen": -103.96697998046875, "rewards/margins": -29.367740631103516, "rewards/rejected": -74.59922790527344, "step": 3850 }, { "epoch": 0.22, "grad_norm": 89.48892211914062, "learning_rate": 0.000928209296025388, "logits/chosen": -7.955052375793457, "logits/rejected": -7.987532138824463, "logps/chosen": -1457.1798095703125, "logps/rejected": -948.5035400390625, "loss": 41.8017, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -76.60820007324219, "rewards/margins": -39.362979888916016, "rewards/rejected": -37.245216369628906, "step": 3860 }, { "epoch": 0.22, "grad_norm": 4.7325721119761965e-09, "learning_rate": 0.0009280157900847557, "logits/chosen": -8.818272590637207, "logits/rejected": -8.77804946899414, "logps/chosen": -2123.839111328125, "logps/rejected": -1377.7587890625, "loss": 47.996, "rewards/accuracies": 0.5, "rewards/chosen": -39.70893859863281, "rewards/margins": -19.814868927001953, "rewards/rejected": -19.894075393676758, "step": 3870 }, { "epoch": 0.22, "grad_norm": 0.0033447942696511745, "learning_rate": 0.0009278222841441233, "logits/chosen": -8.553569793701172, "logits/rejected": -8.488561630249023, "logps/chosen": -1774.7825927734375, "logps/rejected": -1450.401123046875, "loss": 22.9255, "rewards/accuracies": 0.5, "rewards/chosen": -44.230003356933594, "rewards/margins": -4.842113018035889, "rewards/rejected": -39.38788986206055, "step": 3880 }, { "epoch": 0.23, "grad_norm": 43.78214645385742, "learning_rate": 0.0009276287782034909, "logits/chosen": -7.501887321472168, "logits/rejected": -7.400717735290527, "logps/chosen": -2077.7001953125, "logps/rejected": -1429.0084228515625, "loss": 44.4724, "rewards/accuracies": 0.5, "rewards/chosen": -38.158485412597656, "rewards/margins": -22.11669921875, "rewards/rejected": -16.04178237915039, "step": 3890 }, { "epoch": 0.23, "grad_norm": 36.51645278930664, "learning_rate": 0.0009274352722628585, "logits/chosen": -8.681915283203125, "logits/rejected": -8.643739700317383, "logps/chosen": -1269.23486328125, "logps/rejected": -1186.7135009765625, "loss": 25.1582, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.07377624511719, "rewards/margins": -8.138714790344238, "rewards/rejected": -90.93505096435547, "step": 3900 }, { "epoch": 0.23, "grad_norm": 84.0, "learning_rate": 0.0009272417663222261, "logits/chosen": -8.78742790222168, "logits/rejected": -8.694483757019043, "logps/chosen": -1704.026123046875, "logps/rejected": -1093.88671875, "loss": 34.1545, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.29328155517578, "rewards/margins": -17.237085342407227, "rewards/rejected": -52.05619430541992, "step": 3910 }, { "epoch": 0.23, "grad_norm": 4.844831466674805, "learning_rate": 0.0009270482603815938, "logits/chosen": -8.829949378967285, "logits/rejected": -8.707624435424805, "logps/chosen": -1677.8394775390625, "logps/rejected": -698.44091796875, "loss": 59.0605, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -94.5552978515625, "rewards/margins": -51.09885787963867, "rewards/rejected": -43.45644760131836, "step": 3920 }, { "epoch": 0.23, "grad_norm": 54.28427505493164, "learning_rate": 0.0009268547544409614, "logits/chosen": -9.82005786895752, "logits/rejected": -9.705799102783203, "logps/chosen": -1857.7626953125, "logps/rejected": -1551.4417724609375, "loss": 36.5986, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -75.474365234375, "rewards/margins": -26.639135360717773, "rewards/rejected": -48.835227966308594, "step": 3930 }, { "epoch": 0.23, "grad_norm": 2.9802591800689697, "learning_rate": 0.000926661248500329, "logits/chosen": -8.166667938232422, "logits/rejected": -8.077478408813477, "logps/chosen": -1428.464111328125, "logps/rejected": -1077.0665283203125, "loss": 17.5425, "rewards/accuracies": 0.5, "rewards/chosen": -75.7662124633789, "rewards/margins": 6.780928611755371, "rewards/rejected": -82.54713439941406, "step": 3940 }, { "epoch": 0.23, "grad_norm": 0.09590743482112885, "learning_rate": 0.0009264677425596966, "logits/chosen": -9.266221046447754, "logits/rejected": -9.218578338623047, "logps/chosen": -1748.959228515625, "logps/rejected": -1535.4376220703125, "loss": 32.594, "rewards/accuracies": 0.5, "rewards/chosen": -97.57916259765625, "rewards/margins": -17.353408813476562, "rewards/rejected": -80.22575378417969, "step": 3950 }, { "epoch": 0.23, "grad_norm": 2.573457891230646e-09, "learning_rate": 0.0009262742366190642, "logits/chosen": -9.264888763427734, "logits/rejected": -9.205583572387695, "logps/chosen": -1970.645263671875, "logps/rejected": -1509.4312744140625, "loss": 43.9069, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.27825164794922, "rewards/margins": -28.79804039001465, "rewards/rejected": -4.480208396911621, "step": 3960 }, { "epoch": 0.23, "grad_norm": 0.3812979757785797, "learning_rate": 0.0009260807306784318, "logits/chosen": -9.791322708129883, "logits/rejected": -9.771215438842773, "logps/chosen": -3211.533935546875, "logps/rejected": -3390.60888671875, "loss": 31.6268, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -213.07473754882812, "rewards/margins": -9.488733291625977, "rewards/rejected": -203.5859832763672, "step": 3970 }, { "epoch": 0.23, "grad_norm": 60.26753616333008, "learning_rate": 0.0009258872247377995, "logits/chosen": -9.756768226623535, "logits/rejected": -9.797303199768066, "logps/chosen": -1306.6719970703125, "logps/rejected": -1384.2677001953125, "loss": 6.0644, "rewards/accuracies": 0.5, "rewards/chosen": -96.94139099121094, "rewards/margins": 7.428764343261719, "rewards/rejected": -104.37015533447266, "step": 3980 }, { "epoch": 0.23, "grad_norm": 3298.14013671875, "learning_rate": 0.0009256937187971671, "logits/chosen": -8.176797866821289, "logits/rejected": -7.933627128601074, "logps/chosen": -6222.345703125, "logps/rejected": -5889.81787109375, "loss": 91.5371, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -446.57666015625, "rewards/margins": -27.308597564697266, "rewards/rejected": -419.26806640625, "step": 3990 }, { "epoch": 0.23, "grad_norm": 63.85945510864258, "learning_rate": 0.0009255002128565347, "logits/chosen": -8.634092330932617, "logits/rejected": -8.543691635131836, "logps/chosen": -1840.255859375, "logps/rejected": -1526.2454833984375, "loss": 26.846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -143.8533172607422, "rewards/margins": -23.5709285736084, "rewards/rejected": -120.28239440917969, "step": 4000 }, { "epoch": 0.23, "grad_norm": 73.63335418701172, "learning_rate": 0.0009253067069159023, "logits/chosen": -6.288109302520752, "logits/rejected": -6.308525085449219, "logps/chosen": -2026.9661865234375, "logps/rejected": -2086.376220703125, "loss": 11.9331, "rewards/accuracies": 0.5, "rewards/chosen": -129.34817504882812, "rewards/margins": 3.432399034500122, "rewards/rejected": -132.78057861328125, "step": 4010 }, { "epoch": 0.23, "grad_norm": 54.89995193481445, "learning_rate": 0.0009251132009752699, "logits/chosen": -6.815203666687012, "logits/rejected": -6.694698333740234, "logps/chosen": -2136.1298828125, "logps/rejected": -1610.01953125, "loss": 26.5601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.48846435546875, "rewards/margins": -12.0780029296875, "rewards/rejected": -117.41044616699219, "step": 4020 }, { "epoch": 0.23, "grad_norm": 66.92342376708984, "learning_rate": 0.0009249196950346375, "logits/chosen": -8.910615921020508, "logits/rejected": -8.900287628173828, "logps/chosen": -2381.043701171875, "logps/rejected": -2040.3463134765625, "loss": 25.761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.056180000305176, "rewards/margins": 5.9084153175354, "rewards/rejected": -16.964595794677734, "step": 4030 }, { "epoch": 0.23, "grad_norm": 43.97075271606445, "learning_rate": 0.0009247261890940052, "logits/chosen": -9.960431098937988, "logits/rejected": -9.966080665588379, "logps/chosen": -1414.1175537109375, "logps/rejected": -1288.701904296875, "loss": 19.398, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -72.20028686523438, "rewards/margins": -6.111391544342041, "rewards/rejected": -66.08890533447266, "step": 4040 }, { "epoch": 0.23, "grad_norm": 92.0217056274414, "learning_rate": 0.0009245326831533728, "logits/chosen": -9.081583976745605, "logits/rejected": -9.026572227478027, "logps/chosen": -1778.756591796875, "logps/rejected": -1380.808349609375, "loss": 26.7451, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -49.90746307373047, "rewards/margins": -17.11075210571289, "rewards/rejected": -32.796714782714844, "step": 4050 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.0009243391772127404, "logits/chosen": -8.361654281616211, "logits/rejected": -8.316448211669922, "logps/chosen": -1526.5994873046875, "logps/rejected": -1207.71337890625, "loss": 14.8849, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -56.5015869140625, "rewards/margins": 2.8171839714050293, "rewards/rejected": -59.31876754760742, "step": 4060 }, { "epoch": 0.24, "grad_norm": 51.398685455322266, "learning_rate": 0.000924145671272108, "logits/chosen": -7.826988220214844, "logits/rejected": -7.871653079986572, "logps/chosen": -1658.857666015625, "logps/rejected": -1591.552001953125, "loss": 16.2952, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -65.09477233886719, "rewards/margins": -3.0563411712646484, "rewards/rejected": -62.038414001464844, "step": 4070 }, { "epoch": 0.24, "grad_norm": 4.2415157215473787e-13, "learning_rate": 0.0009239521653314757, "logits/chosen": -8.513627052307129, "logits/rejected": -8.495003700256348, "logps/chosen": -2134.87841796875, "logps/rejected": -1895.674560546875, "loss": 17.1439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.92413902282715, "rewards/margins": 9.506441116333008, "rewards/rejected": -34.430580139160156, "step": 4080 }, { "epoch": 0.24, "grad_norm": 1.4670970813313033e-05, "learning_rate": 0.0009237586593908434, "logits/chosen": -9.642766952514648, "logits/rejected": -9.611018180847168, "logps/chosen": -2302.39453125, "logps/rejected": -2010.3402099609375, "loss": 24.9364, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -107.85514831542969, "rewards/margins": -18.023683547973633, "rewards/rejected": -89.83146667480469, "step": 4090 }, { "epoch": 0.24, "grad_norm": 2.991047613370242e-20, "learning_rate": 0.000923565153450211, "logits/chosen": -9.761533737182617, "logits/rejected": -9.727927207946777, "logps/chosen": -1402.8529052734375, "logps/rejected": -1344.4281005859375, "loss": 28.9259, "rewards/accuracies": 0.5, "rewards/chosen": -64.64570617675781, "rewards/margins": -2.4986770153045654, "rewards/rejected": -62.14702224731445, "step": 4100 }, { "epoch": 0.24, "grad_norm": 48.466285705566406, "learning_rate": 0.0009233716475095786, "logits/chosen": -9.933538436889648, "logits/rejected": -9.874788284301758, "logps/chosen": -1923.717041015625, "logps/rejected": -1478.3909912109375, "loss": 34.0994, "rewards/accuracies": 0.5, "rewards/chosen": -39.05588150024414, "rewards/margins": -14.002538681030273, "rewards/rejected": -25.053348541259766, "step": 4110 }, { "epoch": 0.24, "grad_norm": 0.00032737298170104623, "learning_rate": 0.0009231781415689462, "logits/chosen": -8.853133201599121, "logits/rejected": -8.824807167053223, "logps/chosen": -1606.75146484375, "logps/rejected": -1381.519287109375, "loss": 38.7287, "rewards/accuracies": 0.5, "rewards/chosen": -100.25405883789062, "rewards/margins": -16.47211265563965, "rewards/rejected": -83.78194427490234, "step": 4120 }, { "epoch": 0.24, "grad_norm": 0.0040057385340332985, "learning_rate": 0.0009229846356283138, "logits/chosen": -8.91771125793457, "logits/rejected": -8.909492492675781, "logps/chosen": -1124.291259765625, "logps/rejected": -1159.584716796875, "loss": 10.3244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -49.82084655761719, "rewards/margins": 11.109724998474121, "rewards/rejected": -60.930572509765625, "step": 4130 }, { "epoch": 0.24, "grad_norm": 55.86928176879883, "learning_rate": 0.0009227911296876814, "logits/chosen": -8.510544776916504, "logits/rejected": -8.466550827026367, "logps/chosen": -1994.724609375, "logps/rejected": -1527.110107421875, "loss": 37.3459, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -45.34007263183594, "rewards/margins": -37.06328201293945, "rewards/rejected": -8.276789665222168, "step": 4140 }, { "epoch": 0.24, "grad_norm": 3.359434205230746e-18, "learning_rate": 0.0009225976237470491, "logits/chosen": -9.52294921875, "logits/rejected": -9.351316452026367, "logps/chosen": -1493.9422607421875, "logps/rejected": -1206.5386962890625, "loss": 6.662, "rewards/accuracies": 0.5, "rewards/chosen": 29.2064208984375, "rewards/margins": 21.250940322875977, "rewards/rejected": 7.955488681793213, "step": 4150 }, { "epoch": 0.24, "grad_norm": 6.2102150960703575e-19, "learning_rate": 0.0009224041178064167, "logits/chosen": -9.382906913757324, "logits/rejected": -9.150348663330078, "logps/chosen": -2140.057861328125, "logps/rejected": -1742.093017578125, "loss": 20.8148, "rewards/accuracies": 0.5, "rewards/chosen": -35.4161376953125, "rewards/margins": -3.128371477127075, "rewards/rejected": -32.28776550292969, "step": 4160 }, { "epoch": 0.24, "grad_norm": 33.39594650268555, "learning_rate": 0.0009222106118657843, "logits/chosen": -7.881783962249756, "logits/rejected": -7.72054386138916, "logps/chosen": -1733.1060791015625, "logps/rejected": -1468.4326171875, "loss": 17.9425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.464435577392578, "rewards/margins": 28.9499568939209, "rewards/rejected": -56.414398193359375, "step": 4170 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.0009220171059251519, "logits/chosen": -9.019591331481934, "logits/rejected": -8.872458457946777, "logps/chosen": -1754.285400390625, "logps/rejected": -1316.1673583984375, "loss": 20.4041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.456506729125977, "rewards/margins": 1.3448368310928345, "rewards/rejected": -31.801342010498047, "step": 4180 }, { "epoch": 0.24, "grad_norm": 1.3926763534545898, "learning_rate": 0.0009218235999845196, "logits/chosen": -9.351203918457031, "logits/rejected": -9.161458015441895, "logps/chosen": -1752.873046875, "logps/rejected": -1322.14501953125, "loss": 37.4921, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -62.704681396484375, "rewards/margins": -31.53310775756836, "rewards/rejected": -31.17156982421875, "step": 4190 }, { "epoch": 0.24, "grad_norm": 80.16028594970703, "learning_rate": 0.0009216300940438871, "logits/chosen": -8.233763694763184, "logits/rejected": -8.178319931030273, "logps/chosen": -1804.525390625, "logps/rejected": -1330.25732421875, "loss": 54.0665, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -96.94880676269531, "rewards/margins": -48.33831024169922, "rewards/rejected": -48.61049270629883, "step": 4200 }, { "epoch": 0.24, "grad_norm": 57.46701431274414, "learning_rate": 0.0009214365881032548, "logits/chosen": -7.9512505531311035, "logits/rejected": -7.954233646392822, "logps/chosen": -1547.416015625, "logps/rejected": -1565.3756103515625, "loss": 15.3429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -44.283836364746094, "rewards/margins": -1.740007758140564, "rewards/rejected": -42.5438346862793, "step": 4210 }, { "epoch": 0.24, "grad_norm": 25.458114624023438, "learning_rate": 0.0009212430821626224, "logits/chosen": -8.484758377075195, "logits/rejected": -8.4434814453125, "logps/chosen": -1998.813720703125, "logps/rejected": -1563.4322509765625, "loss": 24.7605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4543933868408203, "rewards/margins": -4.390016078948975, "rewards/rejected": 2.9356234073638916, "step": 4220 }, { "epoch": 0.24, "grad_norm": 77.6031494140625, "learning_rate": 0.00092104957622199, "logits/chosen": -9.789072036743164, "logits/rejected": -9.748247146606445, "logps/chosen": -1612.0352783203125, "logps/rejected": -1368.4617919921875, "loss": 32.8996, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.939453125, "rewards/margins": -18.355823516845703, "rewards/rejected": -107.58363342285156, "step": 4230 }, { "epoch": 0.25, "grad_norm": 44.37128829956055, "learning_rate": 0.0009208560702813576, "logits/chosen": -8.759521484375, "logits/rejected": -8.753707885742188, "logps/chosen": -2104.99072265625, "logps/rejected": -1377.25390625, "loss": 39.5269, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -97.52552032470703, "rewards/margins": -27.7829647064209, "rewards/rejected": -69.7425537109375, "step": 4240 }, { "epoch": 0.25, "grad_norm": 13.415846824645996, "learning_rate": 0.0009206625643407252, "logits/chosen": -8.225362777709961, "logits/rejected": -8.150832176208496, "logps/chosen": -1523.627197265625, "logps/rejected": -1101.5068359375, "loss": 39.1914, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -54.22563552856445, "rewards/margins": -38.583290100097656, "rewards/rejected": -15.642343521118164, "step": 4250 }, { "epoch": 0.25, "grad_norm": 1.7882250347156514e-07, "learning_rate": 0.0009204690584000929, "logits/chosen": -9.109691619873047, "logits/rejected": -8.961874008178711, "logps/chosen": -1824.976806640625, "logps/rejected": -1262.978271484375, "loss": 38.9108, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -108.1036376953125, "rewards/margins": -28.83957290649414, "rewards/rejected": -79.26406860351562, "step": 4260 }, { "epoch": 0.25, "grad_norm": 1.2668671161009115e-06, "learning_rate": 0.0009202755524594605, "logits/chosen": -8.861927032470703, "logits/rejected": -8.87398624420166, "logps/chosen": -1608.1243896484375, "logps/rejected": -1790.7584228515625, "loss": 18.9732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.43267059326172, "rewards/margins": -8.918753623962402, "rewards/rejected": -26.5139102935791, "step": 4270 }, { "epoch": 0.25, "grad_norm": 5.474191224893223e-21, "learning_rate": 0.0009200820465188281, "logits/chosen": -8.799155235290527, "logits/rejected": -8.762063980102539, "logps/chosen": -2077.98388671875, "logps/rejected": -2057.4130859375, "loss": 7.2332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.0821533203125, "rewards/margins": 17.36285400390625, "rewards/rejected": -34.44500732421875, "step": 4280 }, { "epoch": 0.25, "grad_norm": 7.831119191844116e-21, "learning_rate": 0.0009198885405781958, "logits/chosen": -10.314074516296387, "logits/rejected": -10.29301643371582, "logps/chosen": -1618.3648681640625, "logps/rejected": -1891.866455078125, "loss": 27.0728, "rewards/accuracies": 0.5, "rewards/chosen": -81.48388671875, "rewards/margins": -7.289531707763672, "rewards/rejected": -74.19435119628906, "step": 4290 }, { "epoch": 0.25, "grad_norm": 0.0005670975660905242, "learning_rate": 0.0009196950346375634, "logits/chosen": -10.695152282714844, "logits/rejected": -10.838968276977539, "logps/chosen": -2525.88916015625, "logps/rejected": -2520.718017578125, "loss": 7.9073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -61.45819854736328, "rewards/margins": 29.3259334564209, "rewards/rejected": -90.78412628173828, "step": 4300 }, { "epoch": 0.25, "grad_norm": 3.520671649424912e-20, "learning_rate": 0.000919501528696931, "logits/chosen": -9.511697769165039, "logits/rejected": -9.633695602416992, "logps/chosen": -1998.0716552734375, "logps/rejected": -1801.285888671875, "loss": 21.2666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -99.45918273925781, "rewards/margins": 5.247494697570801, "rewards/rejected": -104.7066650390625, "step": 4310 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.0009193080227562987, "logits/chosen": -9.37700080871582, "logits/rejected": -9.36920166015625, "logps/chosen": -1622.3333740234375, "logps/rejected": -1217.0361328125, "loss": 7.916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -62.16078567504883, "rewards/margins": 15.9514741897583, "rewards/rejected": -78.11225891113281, "step": 4320 }, { "epoch": 0.25, "grad_norm": 19.791032791137695, "learning_rate": 0.0009191145168156663, "logits/chosen": -8.477682113647461, "logits/rejected": -8.43016529083252, "logps/chosen": -1416.1986083984375, "logps/rejected": -1115.9085693359375, "loss": 18.873, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -72.71562194824219, "rewards/margins": -1.0265274047851562, "rewards/rejected": -71.68909454345703, "step": 4330 }, { "epoch": 0.25, "grad_norm": 79.0573959350586, "learning_rate": 0.0009189210108750339, "logits/chosen": -8.794090270996094, "logits/rejected": -8.762716293334961, "logps/chosen": -1800.9439697265625, "logps/rejected": -1532.7393798828125, "loss": 25.0047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.036090850830078, "rewards/margins": -7.763224124908447, "rewards/rejected": -15.272867202758789, "step": 4340 }, { "epoch": 0.25, "grad_norm": 1.4022542700331542e-06, "learning_rate": 0.0009187275049344015, "logits/chosen": -9.253393173217773, "logits/rejected": -9.233792304992676, "logps/chosen": -1662.9388427734375, "logps/rejected": -1301.8094482421875, "loss": 30.0309, "rewards/accuracies": 0.5, "rewards/chosen": -4.580517292022705, "rewards/margins": -16.792251586914062, "rewards/rejected": 12.2117338180542, "step": 4350 }, { "epoch": 0.25, "grad_norm": 79.10806274414062, "learning_rate": 0.0009185339989937691, "logits/chosen": -10.281286239624023, "logits/rejected": -10.226259231567383, "logps/chosen": -1780.7744140625, "logps/rejected": -1655.264404296875, "loss": 25.9015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -101.68093872070312, "rewards/margins": -7.180128574371338, "rewards/rejected": -94.50080871582031, "step": 4360 }, { "epoch": 0.25, "grad_norm": 61.16004943847656, "learning_rate": 0.0009183404930531368, "logits/chosen": -9.024381637573242, "logits/rejected": -8.96928596496582, "logps/chosen": -2005.399169921875, "logps/rejected": -1356.3983154296875, "loss": 56.58, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -122.90446472167969, "rewards/margins": -50.29194641113281, "rewards/rejected": -72.61251068115234, "step": 4370 }, { "epoch": 0.25, "grad_norm": 66.22138977050781, "learning_rate": 0.0009181469871125044, "logits/chosen": -9.497810363769531, "logits/rejected": -9.505108833312988, "logps/chosen": -1599.6865234375, "logps/rejected": -1528.5458984375, "loss": 35.2688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.167543411254883, "rewards/margins": -27.319082260131836, "rewards/rejected": 18.151540756225586, "step": 4380 }, { "epoch": 0.25, "grad_norm": 0.005387674551457167, "learning_rate": 0.000917953481171872, "logits/chosen": -8.617475509643555, "logits/rejected": -8.587068557739258, "logps/chosen": -1743.6087646484375, "logps/rejected": -1467.503662109375, "loss": 17.9264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.919567108154297, "rewards/margins": 4.754436492919922, "rewards/rejected": -36.67400360107422, "step": 4390 }, { "epoch": 0.25, "grad_norm": 33.836158752441406, "learning_rate": 0.0009177599752312397, "logits/chosen": -8.167339324951172, "logits/rejected": -8.13007926940918, "logps/chosen": -883.3196411132812, "logps/rejected": -547.6429443359375, "loss": 25.6136, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -66.07376861572266, "rewards/margins": -25.28500747680664, "rewards/rejected": -40.78875732421875, "step": 4400 }, { "epoch": 0.26, "grad_norm": 1.1275691517383856e-20, "learning_rate": 0.0009175664692906073, "logits/chosen": -8.938416481018066, "logits/rejected": -8.922842979431152, "logps/chosen": -1723.552001953125, "logps/rejected": -1450.483154296875, "loss": 27.4924, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.35665130615234, "rewards/margins": -12.322918891906738, "rewards/rejected": -57.033729553222656, "step": 4410 }, { "epoch": 0.26, "grad_norm": 50.251407623291016, "learning_rate": 0.0009173729633499748, "logits/chosen": -8.63249397277832, "logits/rejected": -8.533026695251465, "logps/chosen": -1876.547119140625, "logps/rejected": -1651.549072265625, "loss": 35.3111, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -67.8359146118164, "rewards/margins": -30.084243774414062, "rewards/rejected": -37.751670837402344, "step": 4420 }, { "epoch": 0.26, "grad_norm": 3.5947799682617188, "learning_rate": 0.0009171794574093425, "logits/chosen": -10.365991592407227, "logits/rejected": -10.36186408996582, "logps/chosen": -1877.7249755859375, "logps/rejected": -1941.398681640625, "loss": 7.8855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -73.10862731933594, "rewards/margins": 6.834598541259766, "rewards/rejected": -79.94322204589844, "step": 4430 }, { "epoch": 0.26, "grad_norm": 3.739405022130355e-20, "learning_rate": 0.0009169859514687101, "logits/chosen": -10.754658699035645, "logits/rejected": -10.650461196899414, "logps/chosen": -1820.9095458984375, "logps/rejected": -1478.69482421875, "loss": 43.5515, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -116.22654724121094, "rewards/margins": -38.135101318359375, "rewards/rejected": -78.09143829345703, "step": 4440 }, { "epoch": 0.26, "grad_norm": 0.0006514202686958015, "learning_rate": 0.0009167924455280777, "logits/chosen": -8.533493995666504, "logits/rejected": -8.550214767456055, "logps/chosen": -1403.0098876953125, "logps/rejected": -1041.796630859375, "loss": 32.7888, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -110.99317932128906, "rewards/margins": -26.523136138916016, "rewards/rejected": -84.47004699707031, "step": 4450 }, { "epoch": 0.26, "grad_norm": 50.02614212036133, "learning_rate": 0.0009165989395874453, "logits/chosen": -7.458718776702881, "logits/rejected": -7.520946502685547, "logps/chosen": -1834.529541015625, "logps/rejected": -1530.726318359375, "loss": 42.0065, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.736294746398926, "rewards/margins": -35.173431396484375, "rewards/rejected": 19.4371395111084, "step": 4460 }, { "epoch": 0.26, "grad_norm": 45.75019073486328, "learning_rate": 0.0009164054336468129, "logits/chosen": -8.09919548034668, "logits/rejected": -8.114591598510742, "logps/chosen": -2324.199462890625, "logps/rejected": -2042.9049072265625, "loss": 20.4651, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -112.15811920166016, "rewards/margins": -15.943532943725586, "rewards/rejected": -96.2145767211914, "step": 4470 }, { "epoch": 0.26, "grad_norm": 40.58920669555664, "learning_rate": 0.0009162119277061805, "logits/chosen": -6.76464319229126, "logits/rejected": -6.709828853607178, "logps/chosen": -2066.287841796875, "logps/rejected": -1737.689697265625, "loss": 24.6064, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.692434310913086, "rewards/margins": -15.55189323425293, "rewards/rejected": -6.1405439376831055, "step": 4480 }, { "epoch": 0.26, "grad_norm": 1.9031295501103704e-14, "learning_rate": 0.0009160184217655482, "logits/chosen": -6.7418928146362305, "logits/rejected": -6.690354347229004, "logps/chosen": -1589.874267578125, "logps/rejected": -1552.9649658203125, "loss": 3.8606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.626113891601562, "rewards/margins": 21.39170265197754, "rewards/rejected": -39.01781463623047, "step": 4490 }, { "epoch": 0.26, "grad_norm": 43.80301284790039, "learning_rate": 0.0009158249158249159, "logits/chosen": -8.096484184265137, "logits/rejected": -8.096883773803711, "logps/chosen": -2118.14990234375, "logps/rejected": -1934.535888671875, "loss": 12.2988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -65.20568084716797, "rewards/margins": -7.482363224029541, "rewards/rejected": -57.72332000732422, "step": 4500 }, { "epoch": 0.26, "grad_norm": 66.36636352539062, "learning_rate": 0.0009156314098842835, "logits/chosen": -9.680177688598633, "logits/rejected": -9.633504867553711, "logps/chosen": -1647.1488037109375, "logps/rejected": -1235.615234375, "loss": 50.9415, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -129.1929473876953, "rewards/margins": -34.07018280029297, "rewards/rejected": -95.12276458740234, "step": 4510 }, { "epoch": 0.26, "grad_norm": 89.31822204589844, "learning_rate": 0.0009154379039436511, "logits/chosen": -10.041346549987793, "logits/rejected": -9.936990737915039, "logps/chosen": -1680.6546630859375, "logps/rejected": -1652.2044677734375, "loss": 19.5157, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 0.20162391662597656, "rewards/margins": -13.607028007507324, "rewards/rejected": 13.8086519241333, "step": 4520 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.0009152443980030187, "logits/chosen": -8.576089859008789, "logits/rejected": -8.484148025512695, "logps/chosen": -1603.655517578125, "logps/rejected": -1493.7203369140625, "loss": 6.7211, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 18.59952163696289, "rewards/margins": 19.564373016357422, "rewards/rejected": -0.9648529291152954, "step": 4530 }, { "epoch": 0.26, "grad_norm": 69.13729095458984, "learning_rate": 0.0009150508920623864, "logits/chosen": -7.995484828948975, "logits/rejected": -7.997275352478027, "logps/chosen": -2348.11572265625, "logps/rejected": -1819.9375, "loss": 46.7722, "rewards/accuracies": 0.5, "rewards/chosen": -10.94190502166748, "rewards/margins": -22.7613468170166, "rewards/rejected": 11.819440841674805, "step": 4540 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.000914857386121754, "logits/chosen": -8.954835891723633, "logits/rejected": -8.930074691772461, "logps/chosen": -1491.5081787109375, "logps/rejected": -1503.7950439453125, "loss": 26.8852, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.893341064453125, "rewards/margins": -9.842002868652344, "rewards/rejected": -40.051334381103516, "step": 4550 }, { "epoch": 0.26, "grad_norm": 60.601383209228516, "learning_rate": 0.0009146638801811216, "logits/chosen": -9.201874732971191, "logits/rejected": -9.031620979309082, "logps/chosen": -1640.013671875, "logps/rejected": -1065.8861083984375, "loss": 38.6575, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -62.3117790222168, "rewards/margins": -34.73897171020508, "rewards/rejected": -27.57280921936035, "step": 4560 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.0009144703742404892, "logits/chosen": -9.479452133178711, "logits/rejected": -9.43109130859375, "logps/chosen": -1413.1552734375, "logps/rejected": -1223.937255859375, "loss": 33.9685, "rewards/accuracies": 0.5, "rewards/chosen": -75.63743591308594, "rewards/margins": -10.22513484954834, "rewards/rejected": -65.41230010986328, "step": 4570 }, { "epoch": 0.27, "grad_norm": 171.94760131835938, "learning_rate": 0.0009142768682998568, "logits/chosen": -7.158148765563965, "logits/rejected": -7.120888710021973, "logps/chosen": -2199.48193359375, "logps/rejected": -1674.191162109375, "loss": 30.1879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.676555633544922, "rewards/margins": -13.740864753723145, "rewards/rejected": -13.935693740844727, "step": 4580 }, { "epoch": 0.27, "grad_norm": 60.217872619628906, "learning_rate": 0.0009140833623592244, "logits/chosen": -8.40305233001709, "logits/rejected": -8.28870677947998, "logps/chosen": -1892.1234130859375, "logps/rejected": -1599.315673828125, "loss": 12.7526, "rewards/accuracies": 0.5, "rewards/chosen": -46.287574768066406, "rewards/margins": -1.1506354808807373, "rewards/rejected": -45.13693618774414, "step": 4590 }, { "epoch": 0.27, "grad_norm": 22.175811767578125, "learning_rate": 0.0009138898564185922, "logits/chosen": -7.937497138977051, "logits/rejected": -7.8281073570251465, "logps/chosen": -1935.8603515625, "logps/rejected": -1301.595947265625, "loss": 13.567, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -55.540367126464844, "rewards/margins": 9.5119047164917, "rewards/rejected": -65.0522689819336, "step": 4600 }, { "epoch": 0.27, "grad_norm": 0.00014867036952637136, "learning_rate": 0.0009136963504779598, "logits/chosen": -7.373157501220703, "logits/rejected": -7.330329895019531, "logps/chosen": -1735.0582275390625, "logps/rejected": -1645.200439453125, "loss": 10.8871, "rewards/accuracies": 0.5, "rewards/chosen": -60.46418380737305, "rewards/margins": 6.732351779937744, "rewards/rejected": -67.19654083251953, "step": 4610 }, { "epoch": 0.27, "grad_norm": 91.08039855957031, "learning_rate": 0.0009135028445373274, "logits/chosen": -8.495539665222168, "logits/rejected": -8.451286315917969, "logps/chosen": -1990.779296875, "logps/rejected": -2014.433349609375, "loss": 13.1223, "rewards/accuracies": 0.5, "rewards/chosen": -16.635425567626953, "rewards/margins": -0.4210678040981293, "rewards/rejected": -16.21435546875, "step": 4620 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.000913309338596695, "logits/chosen": -9.159147262573242, "logits/rejected": -9.116947174072266, "logps/chosen": -1864.058837890625, "logps/rejected": -1499.009521484375, "loss": 25.1015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.714317321777344, "rewards/margins": -5.28855037689209, "rewards/rejected": -28.425769805908203, "step": 4630 }, { "epoch": 0.27, "grad_norm": 92.13738250732422, "learning_rate": 0.0009131158326560625, "logits/chosen": -8.428374290466309, "logits/rejected": -8.38530158996582, "logps/chosen": -1740.7427978515625, "logps/rejected": -1449.84326171875, "loss": 8.5712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 9.582484245300293, "rewards/margins": 21.547468185424805, "rewards/rejected": -11.964982032775879, "step": 4640 }, { "epoch": 0.27, "grad_norm": 60.57853317260742, "learning_rate": 0.0009129223267154301, "logits/chosen": -8.13377571105957, "logits/rejected": -8.108789443969727, "logps/chosen": -2296.688232421875, "logps/rejected": -2075.966796875, "loss": 23.4312, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 40.66999435424805, "rewards/margins": -15.9452486038208, "rewards/rejected": 56.6152458190918, "step": 4650 }, { "epoch": 0.27, "grad_norm": 9.000663757324219, "learning_rate": 0.0009127288207747978, "logits/chosen": -9.113134384155273, "logits/rejected": -9.135261535644531, "logps/chosen": -2098.92333984375, "logps/rejected": -1773.758544921875, "loss": 37.2544, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.58670806884766, "rewards/margins": -15.551788330078125, "rewards/rejected": -54.0349235534668, "step": 4660 }, { "epoch": 0.27, "grad_norm": 3.42274724971503e-05, "learning_rate": 0.0009125353148341654, "logits/chosen": -9.201505661010742, "logits/rejected": -9.19025993347168, "logps/chosen": -1671.610595703125, "logps/rejected": -1463.5609130859375, "loss": 32.718, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.01222229003906, "rewards/margins": -18.12932777404785, "rewards/rejected": -81.88288879394531, "step": 4670 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.000912341808893533, "logits/chosen": -8.694707870483398, "logits/rejected": -8.640705108642578, "logps/chosen": -2141.27734375, "logps/rejected": -1920.0152587890625, "loss": 31.4208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -63.626853942871094, "rewards/margins": -6.784656524658203, "rewards/rejected": -56.842201232910156, "step": 4680 }, { "epoch": 0.27, "grad_norm": 23.237117767333984, "learning_rate": 0.0009121483029529006, "logits/chosen": -9.022674560546875, "logits/rejected": -8.98784065246582, "logps/chosen": -1712.90234375, "logps/rejected": -1225.3616943359375, "loss": 36.6861, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -62.36731719970703, "rewards/margins": -16.693431854248047, "rewards/rejected": -45.67388153076172, "step": 4690 }, { "epoch": 0.27, "grad_norm": 3.3804261053029024e-17, "learning_rate": 0.0009119547970122682, "logits/chosen": -8.22315502166748, "logits/rejected": -8.186437606811523, "logps/chosen": -2122.8974609375, "logps/rejected": -1891.0794677734375, "loss": 32.5505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 30.191781997680664, "rewards/margins": -14.471837997436523, "rewards/rejected": 44.66362762451172, "step": 4700 }, { "epoch": 0.27, "grad_norm": 29.79326057434082, "learning_rate": 0.000911761291071636, "logits/chosen": -8.467879295349121, "logits/rejected": -8.478471755981445, "logps/chosen": -1756.400146484375, "logps/rejected": -1625.9141845703125, "loss": 24.0344, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -82.21501159667969, "rewards/margins": -16.261804580688477, "rewards/rejected": -65.95320129394531, "step": 4710 }, { "epoch": 0.27, "grad_norm": 0.010401641950011253, "learning_rate": 0.0009115677851310036, "logits/chosen": -9.89979362487793, "logits/rejected": -9.934893608093262, "logps/chosen": -2284.050537109375, "logps/rejected": -1864.8193359375, "loss": 30.5535, "rewards/accuracies": 0.5, "rewards/chosen": -80.15672302246094, "rewards/margins": -24.399372100830078, "rewards/rejected": -55.757347106933594, "step": 4720 }, { "epoch": 0.27, "grad_norm": 38.92198944091797, "learning_rate": 0.0009113742791903712, "logits/chosen": -10.476301193237305, "logits/rejected": -10.670148849487305, "logps/chosen": -2123.49169921875, "logps/rejected": -1364.4326171875, "loss": 49.0855, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -97.52474212646484, "rewards/margins": -30.960369110107422, "rewards/rejected": -66.56437683105469, "step": 4730 }, { "epoch": 0.27, "grad_norm": 21.107322692871094, "learning_rate": 0.0009111807732497388, "logits/chosen": -9.105428695678711, "logits/rejected": -9.145160675048828, "logps/chosen": -1450.1298828125, "logps/rejected": -1696.328125, "loss": 24.0522, "rewards/accuracies": 0.5, "rewards/chosen": -6.736875057220459, "rewards/margins": -5.979021072387695, "rewards/rejected": -0.757856011390686, "step": 4740 }, { "epoch": 0.27, "grad_norm": 133.71554565429688, "learning_rate": 0.0009109872673091064, "logits/chosen": -7.442114353179932, "logits/rejected": -7.392122745513916, "logps/chosen": -1965.4056396484375, "logps/rejected": -1830.0394287109375, "loss": 10.9653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.108110427856445, "rewards/margins": 9.79053020477295, "rewards/rejected": -31.898645401000977, "step": 4750 }, { "epoch": 0.28, "grad_norm": 28.690288543701172, "learning_rate": 0.000910793761368474, "logits/chosen": -7.554097652435303, "logits/rejected": -7.547989845275879, "logps/chosen": -1824.5911865234375, "logps/rejected": -1180.704345703125, "loss": 31.5883, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -40.17559051513672, "rewards/margins": -12.928515434265137, "rewards/rejected": -27.247079849243164, "step": 4760 }, { "epoch": 0.28, "grad_norm": 38.983890533447266, "learning_rate": 0.0009106002554278417, "logits/chosen": -8.179401397705078, "logits/rejected": -8.157408714294434, "logps/chosen": -1777.947998046875, "logps/rejected": -1399.157958984375, "loss": 35.377, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -67.96568298339844, "rewards/margins": -29.759906768798828, "rewards/rejected": -38.205787658691406, "step": 4770 }, { "epoch": 0.28, "grad_norm": 8.329216003417969, "learning_rate": 0.0009104067494872093, "logits/chosen": -9.728559494018555, "logits/rejected": -9.741262435913086, "logps/chosen": -1406.599365234375, "logps/rejected": -1407.638427734375, "loss": 18.3135, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.98790168762207, "rewards/margins": -11.405340194702148, "rewards/rejected": -13.582560539245605, "step": 4780 }, { "epoch": 0.28, "grad_norm": 15.422090530395508, "learning_rate": 0.0009102132435465769, "logits/chosen": -9.033002853393555, "logits/rejected": -8.991859436035156, "logps/chosen": -1689.7662353515625, "logps/rejected": -1284.168212890625, "loss": 28.5916, "rewards/accuracies": 0.5, "rewards/chosen": -21.258281707763672, "rewards/margins": -9.113956451416016, "rewards/rejected": -12.144327163696289, "step": 4790 }, { "epoch": 0.28, "grad_norm": 38.7027702331543, "learning_rate": 0.0009100197376059445, "logits/chosen": -9.679778099060059, "logits/rejected": -9.767784118652344, "logps/chosen": -1646.754150390625, "logps/rejected": -1515.140869140625, "loss": 16.0343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -94.05101776123047, "rewards/margins": -9.70647144317627, "rewards/rejected": -84.34454345703125, "step": 4800 }, { "epoch": 0.28, "grad_norm": 8.543562247096716e-15, "learning_rate": 0.0009098262316653122, "logits/chosen": -9.896444320678711, "logits/rejected": -9.976241111755371, "logps/chosen": -1555.3028564453125, "logps/rejected": -1349.4141845703125, "loss": 32.0787, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -80.7896728515625, "rewards/margins": -14.26862621307373, "rewards/rejected": -66.52104949951172, "step": 4810 }, { "epoch": 0.28, "grad_norm": 91.83363342285156, "learning_rate": 0.0009096327257246798, "logits/chosen": -8.33303451538086, "logits/rejected": -8.348377227783203, "logps/chosen": -1964.185791015625, "logps/rejected": -1778.5406494140625, "loss": 25.0358, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 17.394702911376953, "rewards/margins": -15.210981369018555, "rewards/rejected": 32.605682373046875, "step": 4820 }, { "epoch": 0.28, "grad_norm": 38.66618347167969, "learning_rate": 0.0009094392197840475, "logits/chosen": -8.402642250061035, "logits/rejected": -8.351762771606445, "logps/chosen": -1190.909423828125, "logps/rejected": -1244.2294921875, "loss": 12.0175, "rewards/accuracies": 0.5, "rewards/chosen": -66.53385162353516, "rewards/margins": -4.840240001678467, "rewards/rejected": -61.69361114501953, "step": 4830 }, { "epoch": 0.28, "grad_norm": 32.11689376831055, "learning_rate": 0.0009092457138434151, "logits/chosen": -9.676976203918457, "logits/rejected": -9.690530776977539, "logps/chosen": -1822.9248046875, "logps/rejected": -1558.1734619140625, "loss": 26.8282, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -108.80438232421875, "rewards/margins": -23.646839141845703, "rewards/rejected": -85.15753936767578, "step": 4840 }, { "epoch": 0.28, "grad_norm": 6.508529186248779, "learning_rate": 0.0009090522079027827, "logits/chosen": -9.232074737548828, "logits/rejected": -9.246625900268555, "logps/chosen": -2273.44189453125, "logps/rejected": -2037.8101806640625, "loss": 22.1671, "rewards/accuracies": 0.5, "rewards/chosen": -16.647146224975586, "rewards/margins": -3.256946563720703, "rewards/rejected": -13.3902006149292, "step": 4850 }, { "epoch": 0.28, "grad_norm": 40.33480453491211, "learning_rate": 0.0009088587019621502, "logits/chosen": -8.680143356323242, "logits/rejected": -8.734598159790039, "logps/chosen": -1402.847412109375, "logps/rejected": -1099.5872802734375, "loss": 28.716, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -109.042236328125, "rewards/margins": -24.997882843017578, "rewards/rejected": -84.04435729980469, "step": 4860 }, { "epoch": 0.28, "grad_norm": 59.428218841552734, "learning_rate": 0.0009086651960215178, "logits/chosen": -10.215103149414062, "logits/rejected": -10.222582817077637, "logps/chosen": -1793.542724609375, "logps/rejected": -1814.247314453125, "loss": 17.0728, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -103.69966125488281, "rewards/margins": 4.309503555297852, "rewards/rejected": -108.0091781616211, "step": 4870 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.0009084716900808854, "logits/chosen": -10.06739330291748, "logits/rejected": -10.022003173828125, "logps/chosen": -1525.9420166015625, "logps/rejected": -1280.331787109375, "loss": 42.2033, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -42.845664978027344, "rewards/margins": -22.63824462890625, "rewards/rejected": -20.207422256469727, "step": 4880 }, { "epoch": 0.28, "grad_norm": 26.33176040649414, "learning_rate": 0.0009082781841402531, "logits/chosen": -7.990780830383301, "logits/rejected": -8.000675201416016, "logps/chosen": -2211.8203125, "logps/rejected": -1649.341064453125, "loss": 14.2521, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -46.90192413330078, "rewards/margins": 7.740502834320068, "rewards/rejected": -54.642425537109375, "step": 4890 }, { "epoch": 0.28, "grad_norm": 10.887837409973145, "learning_rate": 0.0009080846781996207, "logits/chosen": -7.983477592468262, "logits/rejected": -7.9893388748168945, "logps/chosen": -1897.659912109375, "logps/rejected": -1701.429443359375, "loss": 23.7367, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -93.77479553222656, "rewards/margins": -16.669391632080078, "rewards/rejected": -77.10540771484375, "step": 4900 }, { "epoch": 0.28, "grad_norm": 41.39078140258789, "learning_rate": 0.0009078911722589883, "logits/chosen": -8.276566505432129, "logits/rejected": -8.288628578186035, "logps/chosen": -1895.559326171875, "logps/rejected": -2036.4208984375, "loss": 13.1466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -77.34185028076172, "rewards/margins": 13.143815994262695, "rewards/rejected": -90.48567199707031, "step": 4910 }, { "epoch": 0.28, "grad_norm": 79.08123016357422, "learning_rate": 0.000907697666318356, "logits/chosen": -8.950051307678223, "logits/rejected": -8.97147274017334, "logps/chosen": -1572.3023681640625, "logps/rejected": -1182.833984375, "loss": 37.2265, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -74.11878204345703, "rewards/margins": -33.50984573364258, "rewards/rejected": -40.60894012451172, "step": 4920 }, { "epoch": 0.29, "grad_norm": 120.4771728515625, "learning_rate": 0.0009075041603777236, "logits/chosen": -9.069738388061523, "logits/rejected": -9.095442771911621, "logps/chosen": -1961.9290771484375, "logps/rejected": -1706.6353759765625, "loss": 20.9812, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -10.428757667541504, "rewards/margins": -17.594974517822266, "rewards/rejected": 7.1662187576293945, "step": 4930 }, { "epoch": 0.29, "grad_norm": 7.198016627185666e-10, "learning_rate": 0.0009073106544370913, "logits/chosen": -9.25047492980957, "logits/rejected": -9.23034954071045, "logps/chosen": -1731.0013427734375, "logps/rejected": -1404.304443359375, "loss": 12.0851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -61.54961395263672, "rewards/margins": 9.6342191696167, "rewards/rejected": -71.18383026123047, "step": 4940 }, { "epoch": 0.29, "grad_norm": 29.053478240966797, "learning_rate": 0.0009071171484964589, "logits/chosen": -8.589349746704102, "logits/rejected": -8.673431396484375, "logps/chosen": -1702.613525390625, "logps/rejected": -1392.7261962890625, "loss": 32.3954, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -81.0113296508789, "rewards/margins": -18.615886688232422, "rewards/rejected": -62.39544677734375, "step": 4950 }, { "epoch": 0.29, "grad_norm": 79.77056121826172, "learning_rate": 0.0009069236425558265, "logits/chosen": -8.608006477355957, "logits/rejected": -8.62388801574707, "logps/chosen": -1739.668212890625, "logps/rejected": -1299.370849609375, "loss": 30.0006, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -71.62852478027344, "rewards/margins": -12.588729858398438, "rewards/rejected": -59.0398063659668, "step": 4960 }, { "epoch": 0.29, "grad_norm": 65.55309295654297, "learning_rate": 0.0009067301366151941, "logits/chosen": -8.64206600189209, "logits/rejected": -8.613697052001953, "logps/chosen": -1662.1849365234375, "logps/rejected": -1552.6448974609375, "loss": 25.9709, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -97.74774169921875, "rewards/margins": -11.144105911254883, "rewards/rejected": -86.60363006591797, "step": 4970 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.0009065366306745617, "logits/chosen": -9.799661636352539, "logits/rejected": -9.751272201538086, "logps/chosen": -2202.81494140625, "logps/rejected": -1788.1627197265625, "loss": 14.8662, "rewards/accuracies": 0.5, "rewards/chosen": 13.232316970825195, "rewards/margins": 11.993356704711914, "rewards/rejected": 1.2389557361602783, "step": 4980 }, { "epoch": 0.29, "grad_norm": 0.04437274858355522, "learning_rate": 0.0009063431247339293, "logits/chosen": -9.721487045288086, "logits/rejected": -9.830042839050293, "logps/chosen": -1904.5697021484375, "logps/rejected": -1510.189208984375, "loss": 21.0831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -53.81181716918945, "rewards/margins": 2.2867369651794434, "rewards/rejected": -56.09855270385742, "step": 4990 }, { "epoch": 0.29, "grad_norm": 53.78908920288086, "learning_rate": 0.000906149618793297, "logits/chosen": -9.892816543579102, "logits/rejected": -9.968270301818848, "logps/chosen": -1991.4664306640625, "logps/rejected": -1543.12939453125, "loss": 41.0536, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -88.68828582763672, "rewards/margins": -32.74137878417969, "rewards/rejected": -55.94690704345703, "step": 5000 }, { "epoch": 0.29, "grad_norm": 2.207744273619028e-06, "learning_rate": 0.0009059561128526646, "logits/chosen": -8.127449989318848, "logits/rejected": -8.111414909362793, "logps/chosen": -2158.933349609375, "logps/rejected": -1342.28515625, "loss": 57.6507, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.63200378417969, "rewards/margins": -43.591392517089844, "rewards/rejected": -57.04060745239258, "step": 5010 }, { "epoch": 0.29, "grad_norm": 8.319942474365234, "learning_rate": 0.0009057626069120323, "logits/chosen": -7.271368503570557, "logits/rejected": -7.2952141761779785, "logps/chosen": -1641.103271484375, "logps/rejected": -1683.3297119140625, "loss": 5.2616, "rewards/accuracies": 0.5, "rewards/chosen": -58.654014587402344, "rewards/margins": 20.360687255859375, "rewards/rejected": -79.01470184326172, "step": 5020 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.0009055691009713999, "logits/chosen": -7.820645809173584, "logits/rejected": -7.737917900085449, "logps/chosen": -1950.729248046875, "logps/rejected": -1461.9620361328125, "loss": 40.7048, "rewards/accuracies": 0.5, "rewards/chosen": -45.017738342285156, "rewards/margins": -10.723549842834473, "rewards/rejected": -34.294189453125, "step": 5030 }, { "epoch": 0.29, "grad_norm": 0.005835599731653929, "learning_rate": 0.0009053755950307675, "logits/chosen": -9.321410179138184, "logits/rejected": -9.306618690490723, "logps/chosen": -1551.5267333984375, "logps/rejected": -1205.4345703125, "loss": 27.2859, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -14.31006145477295, "rewards/margins": -7.173033714294434, "rewards/rejected": -7.137030124664307, "step": 5040 }, { "epoch": 0.29, "grad_norm": 0.23326639831066132, "learning_rate": 0.0009051820890901352, "logits/chosen": -8.248186111450195, "logits/rejected": -8.237090110778809, "logps/chosen": -1779.3883056640625, "logps/rejected": -1459.693603515625, "loss": 17.5791, "rewards/accuracies": 0.5, "rewards/chosen": -34.855628967285156, "rewards/margins": -6.6123456954956055, "rewards/rejected": -28.2432861328125, "step": 5050 }, { "epoch": 0.29, "grad_norm": 2.330403790530398e-19, "learning_rate": 0.0009049885831495028, "logits/chosen": -8.334344863891602, "logits/rejected": -8.347662925720215, "logps/chosen": -1741.4710693359375, "logps/rejected": -1658.209228515625, "loss": 8.5274, "rewards/accuracies": 0.5, "rewards/chosen": 23.732603073120117, "rewards/margins": 3.984076738357544, "rewards/rejected": 19.748523712158203, "step": 5060 }, { "epoch": 0.29, "grad_norm": 0.03158211708068848, "learning_rate": 0.0009047950772088703, "logits/chosen": -8.342867851257324, "logits/rejected": -8.2826509475708, "logps/chosen": -1363.2806396484375, "logps/rejected": -1461.1875, "loss": 21.7828, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -53.09303665161133, "rewards/margins": -0.7507591247558594, "rewards/rejected": -52.34226608276367, "step": 5070 }, { "epoch": 0.29, "grad_norm": 49.819828033447266, "learning_rate": 0.0009046015712682379, "logits/chosen": -9.245862007141113, "logits/rejected": -9.21226692199707, "logps/chosen": -1580.656982421875, "logps/rejected": -1236.5810546875, "loss": 19.9155, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -96.77537536621094, "rewards/margins": -8.980085372924805, "rewards/rejected": -87.7952880859375, "step": 5080 }, { "epoch": 0.29, "grad_norm": 33.49330139160156, "learning_rate": 0.0009044080653276055, "logits/chosen": -9.05580997467041, "logits/rejected": -9.019022941589355, "logps/chosen": -1677.138916015625, "logps/rejected": -1407.889404296875, "loss": 18.389, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -89.90663146972656, "rewards/margins": 5.812913417816162, "rewards/rejected": -95.71955108642578, "step": 5090 }, { "epoch": 0.3, "grad_norm": 1.702856388874352e-05, "learning_rate": 0.0009042145593869731, "logits/chosen": -8.432421684265137, "logits/rejected": -8.444604873657227, "logps/chosen": -2151.07373046875, "logps/rejected": -1791.9986572265625, "loss": 3.6525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 31.72052574157715, "rewards/margins": 20.047622680664062, "rewards/rejected": 11.672903060913086, "step": 5100 }, { "epoch": 0.3, "grad_norm": 58.070152282714844, "learning_rate": 0.0009040210534463407, "logits/chosen": -8.280766487121582, "logits/rejected": -8.24877643585205, "logps/chosen": -1869.23046875, "logps/rejected": -1712.6156005859375, "loss": 32.5383, "rewards/accuracies": 0.5, "rewards/chosen": -92.93489837646484, "rewards/margins": -21.791675567626953, "rewards/rejected": -71.14321899414062, "step": 5110 }, { "epoch": 0.3, "grad_norm": 44.44122314453125, "learning_rate": 0.0009038275475057084, "logits/chosen": -8.776019096374512, "logits/rejected": -8.747950553894043, "logps/chosen": -1729.081787109375, "logps/rejected": -1789.705078125, "loss": 11.9263, "rewards/accuracies": 0.5, "rewards/chosen": -99.80635833740234, "rewards/margins": 8.116853713989258, "rewards/rejected": -107.9232177734375, "step": 5120 }, { "epoch": 0.3, "grad_norm": 32.91712951660156, "learning_rate": 0.0009036340415650761, "logits/chosen": -8.966012954711914, "logits/rejected": -8.987971305847168, "logps/chosen": -2005.693359375, "logps/rejected": -1542.9775390625, "loss": 39.4581, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.42525482177734, "rewards/margins": -14.991094589233398, "rewards/rejected": -109.4341812133789, "step": 5130 }, { "epoch": 0.3, "grad_norm": 70.39370727539062, "learning_rate": 0.0009034405356244437, "logits/chosen": -8.114550590515137, "logits/rejected": -8.191965103149414, "logps/chosen": -2157.15087890625, "logps/rejected": -1876.8961181640625, "loss": 36.2804, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -49.425113677978516, "rewards/margins": -35.85783386230469, "rewards/rejected": -13.567276000976562, "step": 5140 }, { "epoch": 0.3, "grad_norm": 12.509115219116211, "learning_rate": 0.0009032470296838113, "logits/chosen": -9.002252578735352, "logits/rejected": -8.92145824432373, "logps/chosen": -1705.046875, "logps/rejected": -1484.8804931640625, "loss": 34.7842, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -71.01628112792969, "rewards/margins": -33.16510772705078, "rewards/rejected": -37.851173400878906, "step": 5150 }, { "epoch": 0.3, "grad_norm": 0.11767452955245972, "learning_rate": 0.0009030535237431789, "logits/chosen": -8.59290885925293, "logits/rejected": -8.58056354522705, "logps/chosen": -1939.163330078125, "logps/rejected": -1412.544189453125, "loss": 32.0876, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -65.78522491455078, "rewards/margins": -15.946276664733887, "rewards/rejected": -49.838951110839844, "step": 5160 }, { "epoch": 0.3, "grad_norm": 117.52180480957031, "learning_rate": 0.0009028600178025466, "logits/chosen": -9.763860702514648, "logits/rejected": -9.792852401733398, "logps/chosen": -1747.2451171875, "logps/rejected": -1795.364013671875, "loss": 28.7432, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.7563705444336, "rewards/margins": -19.95815658569336, "rewards/rejected": -67.7982177734375, "step": 5170 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.0009026665118619142, "logits/chosen": -9.298297882080078, "logits/rejected": -9.265253067016602, "logps/chosen": -1657.968017578125, "logps/rejected": -1443.156494140625, "loss": 3.8035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 14.499807357788086, "rewards/margins": 36.641544342041016, "rewards/rejected": -22.141735076904297, "step": 5180 }, { "epoch": 0.3, "grad_norm": 8.094407081604004, "learning_rate": 0.0009024730059212818, "logits/chosen": -9.408623695373535, "logits/rejected": -9.261428833007812, "logps/chosen": -1646.176513671875, "logps/rejected": -1322.1761474609375, "loss": 25.1026, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -61.58174514770508, "rewards/margins": -23.68143081665039, "rewards/rejected": -37.90031051635742, "step": 5190 }, { "epoch": 0.3, "grad_norm": 0.0020140644628554583, "learning_rate": 0.0009022794999806494, "logits/chosen": -9.330455780029297, "logits/rejected": -9.389273643493652, "logps/chosen": -1435.347900390625, "logps/rejected": -1617.1123046875, "loss": 37.7271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -38.67889404296875, "rewards/margins": -26.713909149169922, "rewards/rejected": -11.964982986450195, "step": 5200 }, { "epoch": 0.3, "grad_norm": 2.1804956062965175e-08, "learning_rate": 0.000902085994040017, "logits/chosen": -8.2926607131958, "logits/rejected": -8.258206367492676, "logps/chosen": -1879.5931396484375, "logps/rejected": -1411.9874267578125, "loss": 28.1339, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -45.104408264160156, "rewards/margins": 4.306944370269775, "rewards/rejected": -49.41135025024414, "step": 5210 }, { "epoch": 0.3, "grad_norm": 21.999046325683594, "learning_rate": 0.0009018924880993846, "logits/chosen": -8.128942489624023, "logits/rejected": -8.125322341918945, "logps/chosen": -2093.32080078125, "logps/rejected": -1528.042236328125, "loss": 29.2441, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -32.419044494628906, "rewards/margins": -21.92289924621582, "rewards/rejected": -10.49614429473877, "step": 5220 }, { "epoch": 0.3, "grad_norm": 44.74166488647461, "learning_rate": 0.0009016989821587524, "logits/chosen": -9.41213607788086, "logits/rejected": -9.449938774108887, "logps/chosen": -1782.0836181640625, "logps/rejected": -1640.5413818359375, "loss": 19.4968, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -107.0965347290039, "rewards/margins": -10.964597702026367, "rewards/rejected": -96.13194274902344, "step": 5230 }, { "epoch": 0.3, "grad_norm": 2.3782840798958205e-05, "learning_rate": 0.00090150547621812, "logits/chosen": -8.581865310668945, "logits/rejected": -8.542877197265625, "logps/chosen": -1494.671875, "logps/rejected": -1526.7587890625, "loss": 22.5771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -58.18119430541992, "rewards/margins": -5.447527885437012, "rewards/rejected": -52.73366165161133, "step": 5240 }, { "epoch": 0.3, "grad_norm": 1.3294848031364381e-06, "learning_rate": 0.0009013119702774876, "logits/chosen": -7.236302375793457, "logits/rejected": -7.060077667236328, "logps/chosen": -2157.6259765625, "logps/rejected": -1689.053466796875, "loss": 20.4487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.078462600708008, "rewards/margins": 2.2236857414245605, "rewards/rejected": -18.302148818969727, "step": 5250 }, { "epoch": 0.3, "grad_norm": 70.81320190429688, "learning_rate": 0.0009011184643368552, "logits/chosen": -8.856797218322754, "logits/rejected": -8.75737190246582, "logps/chosen": -1614.818359375, "logps/rejected": -1414.13916015625, "loss": 36.0447, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -130.11749267578125, "rewards/margins": -19.39010238647461, "rewards/rejected": -110.72737884521484, "step": 5260 }, { "epoch": 0.31, "grad_norm": 21.656421661376953, "learning_rate": 0.0009009249583962228, "logits/chosen": -7.551632881164551, "logits/rejected": -7.357658386230469, "logps/chosen": -1497.929443359375, "logps/rejected": -1265.58251953125, "loss": 31.0857, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -114.9302978515625, "rewards/margins": -20.307809829711914, "rewards/rejected": -94.62248992919922, "step": 5270 }, { "epoch": 0.31, "grad_norm": 27.564311981201172, "learning_rate": 0.0009007314524555905, "logits/chosen": -9.33316421508789, "logits/rejected": -9.035351753234863, "logps/chosen": -1344.1856689453125, "logps/rejected": -1030.660888671875, "loss": 33.9366, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.92274475097656, "rewards/margins": -25.40689468383789, "rewards/rejected": -81.51585388183594, "step": 5280 }, { "epoch": 0.31, "grad_norm": 0.03188885375857353, "learning_rate": 0.000900537946514958, "logits/chosen": -8.930988311767578, "logits/rejected": -8.718992233276367, "logps/chosen": -1919.0035400390625, "logps/rejected": -1691.923095703125, "loss": 20.2263, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -83.88905334472656, "rewards/margins": -17.296720504760742, "rewards/rejected": -66.59233093261719, "step": 5290 }, { "epoch": 0.31, "grad_norm": 43.81001281738281, "learning_rate": 0.0009003444405743256, "logits/chosen": -8.445356369018555, "logits/rejected": -8.187868118286133, "logps/chosen": -1607.367919921875, "logps/rejected": -1003.1843872070312, "loss": 41.4722, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -30.006359100341797, "rewards/margins": -31.2424259185791, "rewards/rejected": 1.2360671758651733, "step": 5300 }, { "epoch": 0.31, "grad_norm": 36.07637405395508, "learning_rate": 0.0009001509346336932, "logits/chosen": -9.539705276489258, "logits/rejected": -9.387110710144043, "logps/chosen": -1772.8548583984375, "logps/rejected": -1556.390380859375, "loss": 37.0754, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -85.454345703125, "rewards/margins": -35.24570846557617, "rewards/rejected": -50.208641052246094, "step": 5310 }, { "epoch": 0.31, "grad_norm": 11.131653785705566, "learning_rate": 0.0008999574286930608, "logits/chosen": -9.012438774108887, "logits/rejected": -8.930061340332031, "logps/chosen": -1280.926513671875, "logps/rejected": -912.6585083007812, "loss": 14.9757, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.79758834838867, "rewards/margins": 0.16841164231300354, "rewards/rejected": -36.96600341796875, "step": 5320 }, { "epoch": 0.31, "grad_norm": 35.82550811767578, "learning_rate": 0.0008997639227524284, "logits/chosen": -9.494274139404297, "logits/rejected": -9.481293678283691, "logps/chosen": -1953.9410400390625, "logps/rejected": -1289.6220703125, "loss": 39.2187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.54655456542969, "rewards/margins": -20.069194793701172, "rewards/rejected": -98.47737121582031, "step": 5330 }, { "epoch": 0.31, "grad_norm": 33.64588165283203, "learning_rate": 0.0008995704168117962, "logits/chosen": -8.636696815490723, "logits/rejected": -8.617141723632812, "logps/chosen": -1907.7457275390625, "logps/rejected": -1592.4324951171875, "loss": 21.6852, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -83.63787078857422, "rewards/margins": -16.493881225585938, "rewards/rejected": -67.14398956298828, "step": 5340 }, { "epoch": 0.31, "grad_norm": 104.92646789550781, "learning_rate": 0.0008993769108711638, "logits/chosen": -7.139856815338135, "logits/rejected": -7.060240268707275, "logps/chosen": -1661.890869140625, "logps/rejected": -1507.5511474609375, "loss": 24.4686, "rewards/accuracies": 0.5, "rewards/chosen": -33.55754852294922, "rewards/margins": -14.011073112487793, "rewards/rejected": -19.546478271484375, "step": 5350 }, { "epoch": 0.31, "grad_norm": 28.093496322631836, "learning_rate": 0.0008991834049305314, "logits/chosen": -7.688560485839844, "logits/rejected": -7.648138999938965, "logps/chosen": -1808.170654296875, "logps/rejected": -2056.297119140625, "loss": 7.4001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -65.6431884765625, "rewards/margins": 20.436248779296875, "rewards/rejected": -86.07943725585938, "step": 5360 }, { "epoch": 0.31, "grad_norm": 31.516633987426758, "learning_rate": 0.000898989898989899, "logits/chosen": -8.593656539916992, "logits/rejected": -8.540823936462402, "logps/chosen": -1868.7349853515625, "logps/rejected": -1593.286376953125, "loss": 32.1942, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.258174896240234, "rewards/margins": -19.60848617553711, "rewards/rejected": -2.6496901512145996, "step": 5370 }, { "epoch": 0.31, "grad_norm": 46.205299377441406, "learning_rate": 0.0008987963930492666, "logits/chosen": -7.829113006591797, "logits/rejected": -7.851248264312744, "logps/chosen": -1512.6361083984375, "logps/rejected": -1538.2447509765625, "loss": 24.3339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -83.18604278564453, "rewards/margins": 3.7743561267852783, "rewards/rejected": -86.96040344238281, "step": 5380 }, { "epoch": 0.31, "grad_norm": 78.55866241455078, "learning_rate": 0.0008986028871086342, "logits/chosen": -6.7974982261657715, "logits/rejected": -6.8109540939331055, "logps/chosen": -2072.42333984375, "logps/rejected": -1619.576904296875, "loss": 21.5457, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -94.80870056152344, "rewards/margins": -8.868295669555664, "rewards/rejected": -85.94039916992188, "step": 5390 }, { "epoch": 0.31, "grad_norm": 8.112340145341804e-14, "learning_rate": 0.0008984093811680019, "logits/chosen": -9.310327529907227, "logits/rejected": -9.221717834472656, "logps/chosen": -1838.6832275390625, "logps/rejected": -1568.3199462890625, "loss": 45.7109, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -76.40618896484375, "rewards/margins": -21.6414737701416, "rewards/rejected": -54.76470947265625, "step": 5400 }, { "epoch": 0.31, "grad_norm": 46.76328659057617, "learning_rate": 0.0008982158752273695, "logits/chosen": -9.522175788879395, "logits/rejected": -9.500704765319824, "logps/chosen": -1722.765869140625, "logps/rejected": -1410.817626953125, "loss": 21.5434, "rewards/accuracies": 0.5, "rewards/chosen": -53.69675827026367, "rewards/margins": -17.704118728637695, "rewards/rejected": -35.992637634277344, "step": 5410 }, { "epoch": 0.31, "grad_norm": 58.267112731933594, "learning_rate": 0.0008980223692867371, "logits/chosen": -8.604406356811523, "logits/rejected": -8.578794479370117, "logps/chosen": -2212.37158203125, "logps/rejected": -1974.898681640625, "loss": 14.755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.991214752197266, "rewards/margins": 12.155694961547852, "rewards/rejected": -41.14690399169922, "step": 5420 }, { "epoch": 0.31, "grad_norm": 1.8071494878313388e-06, "learning_rate": 0.0008978288633461047, "logits/chosen": -9.183589935302734, "logits/rejected": -9.012282371520996, "logps/chosen": -2079.46630859375, "logps/rejected": -1740.296630859375, "loss": 27.0999, "rewards/accuracies": 0.5, "rewards/chosen": -46.63042068481445, "rewards/margins": -12.293874740600586, "rewards/rejected": -34.33654022216797, "step": 5430 }, { "epoch": 0.31, "grad_norm": 152.5574951171875, "learning_rate": 0.0008976353574054724, "logits/chosen": -9.058444023132324, "logits/rejected": -9.013318061828613, "logps/chosen": -1702.8304443359375, "logps/rejected": -1461.29931640625, "loss": 24.629, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.710922241210938, "rewards/margins": -8.34979248046875, "rewards/rejected": -20.361125946044922, "step": 5440 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.0008974418514648401, "logits/chosen": -8.128946304321289, "logits/rejected": -8.078855514526367, "logps/chosen": -1820.9622802734375, "logps/rejected": -1337.873779296875, "loss": 31.905, "rewards/accuracies": 0.5, "rewards/chosen": -54.22119140625, "rewards/margins": -8.01668930053711, "rewards/rejected": -46.204505920410156, "step": 5450 }, { "epoch": 0.32, "grad_norm": 141.21144104003906, "learning_rate": 0.0008972483455242077, "logits/chosen": -7.524806976318359, "logits/rejected": -7.4666266441345215, "logps/chosen": -2665.655029296875, "logps/rejected": -2708.4091796875, "loss": 15.6549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -86.8245849609375, "rewards/margins": 3.2335476875305176, "rewards/rejected": -90.0581283569336, "step": 5460 }, { "epoch": 0.32, "grad_norm": 131.2815399169922, "learning_rate": 0.0008970548395835753, "logits/chosen": -12.145551681518555, "logits/rejected": -12.333539962768555, "logps/chosen": -3022.111328125, "logps/rejected": -2761.40673828125, "loss": 31.0456, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -188.2490234375, "rewards/margins": -14.488622665405273, "rewards/rejected": -173.76040649414062, "step": 5470 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.0008968613336429429, "logits/chosen": -14.911221504211426, "logits/rejected": -15.07685375213623, "logps/chosen": -1574.0928955078125, "logps/rejected": -1333.3564453125, "loss": 30.4841, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -132.35009765625, "rewards/margins": -20.543838500976562, "rewards/rejected": -111.80625915527344, "step": 5480 }, { "epoch": 0.32, "grad_norm": 7.657138800253963e-17, "learning_rate": 0.0008966678277023105, "logits/chosen": -10.639230728149414, "logits/rejected": -10.655903816223145, "logps/chosen": -2218.615966796875, "logps/rejected": -2221.426025390625, "loss": 3.8547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -82.60459899902344, "rewards/margins": 18.82196044921875, "rewards/rejected": -101.42656707763672, "step": 5490 }, { "epoch": 0.32, "grad_norm": 3.8222067707991934e-20, "learning_rate": 0.0008964743217616781, "logits/chosen": -11.798418045043945, "logits/rejected": -11.55506706237793, "logps/chosen": -3049.147216796875, "logps/rejected": -2546.922607421875, "loss": 47.4969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -221.5164031982422, "rewards/margins": -35.11113357543945, "rewards/rejected": -186.4052734375, "step": 5500 }, { "epoch": 0.32, "grad_norm": 256.0093688964844, "learning_rate": 0.0008962808158210457, "logits/chosen": -11.66625690460205, "logits/rejected": -11.66930103302002, "logps/chosen": -2580.31103515625, "logps/rejected": -2377.55908203125, "loss": 16.0841, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -157.957275390625, "rewards/margins": -14.192463874816895, "rewards/rejected": -143.76480102539062, "step": 5510 }, { "epoch": 0.32, "grad_norm": 151.9810333251953, "learning_rate": 0.0008960873098804133, "logits/chosen": -7.5043792724609375, "logits/rejected": -7.4457502365112305, "logps/chosen": -2034.4156494140625, "logps/rejected": -1817.8701171875, "loss": 16.5114, "rewards/accuracies": 0.5, "rewards/chosen": -52.30840301513672, "rewards/margins": -12.468597412109375, "rewards/rejected": -39.839805603027344, "step": 5520 }, { "epoch": 0.32, "grad_norm": 110.03237915039062, "learning_rate": 0.0008958938039397809, "logits/chosen": -8.003859519958496, "logits/rejected": -7.986342430114746, "logps/chosen": -1920.4124755859375, "logps/rejected": -1641.5364990234375, "loss": 28.2897, "rewards/accuracies": 0.5, "rewards/chosen": -116.20057678222656, "rewards/margins": -19.271472930908203, "rewards/rejected": -96.92911529541016, "step": 5530 }, { "epoch": 0.32, "grad_norm": 43.69084167480469, "learning_rate": 0.0008957002979991485, "logits/chosen": -7.861448764801025, "logits/rejected": -7.860171318054199, "logps/chosen": -1940.194580078125, "logps/rejected": -1756.9287109375, "loss": 8.2566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 56.13605880737305, "rewards/margins": 11.001448631286621, "rewards/rejected": 45.134605407714844, "step": 5540 }, { "epoch": 0.32, "grad_norm": 47.33710479736328, "learning_rate": 0.0008955067920585162, "logits/chosen": -9.308518409729004, "logits/rejected": -9.305434226989746, "logps/chosen": -1551.855712890625, "logps/rejected": -1448.459228515625, "loss": 23.549, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.89225769042969, "rewards/margins": -3.5937752723693848, "rewards/rejected": -85.29849243164062, "step": 5550 }, { "epoch": 0.32, "grad_norm": 0.002951902337372303, "learning_rate": 0.0008953132861178838, "logits/chosen": -10.215353965759277, "logits/rejected": -10.06898307800293, "logps/chosen": -2061.228515625, "logps/rejected": -1441.0595703125, "loss": 39.4789, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -138.59695434570312, "rewards/margins": -27.041156768798828, "rewards/rejected": -111.5558090209961, "step": 5560 }, { "epoch": 0.32, "grad_norm": 0.8384829163551331, "learning_rate": 0.0008951197801772515, "logits/chosen": -7.4640607833862305, "logits/rejected": -7.44695520401001, "logps/chosen": -1970.584228515625, "logps/rejected": -1740.7435302734375, "loss": 26.2136, "rewards/accuracies": 0.5, "rewards/chosen": -44.86262893676758, "rewards/margins": -11.464073181152344, "rewards/rejected": -33.3985481262207, "step": 5570 }, { "epoch": 0.32, "grad_norm": 0.21232756972312927, "learning_rate": 0.0008949262742366191, "logits/chosen": -7.2981858253479, "logits/rejected": -7.297068119049072, "logps/chosen": -2165.3681640625, "logps/rejected": -2074.0693359375, "loss": 23.0669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -58.289878845214844, "rewards/margins": 25.413345336914062, "rewards/rejected": -83.7032241821289, "step": 5580 }, { "epoch": 0.32, "grad_norm": 10.265420913696289, "learning_rate": 0.0008947327682959867, "logits/chosen": -7.916199684143066, "logits/rejected": -7.916760444641113, "logps/chosen": -1359.5455322265625, "logps/rejected": -901.1422729492188, "loss": 19.4011, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.85726547241211, "rewards/margins": 6.876105308532715, "rewards/rejected": -33.733375549316406, "step": 5590 }, { "epoch": 0.32, "grad_norm": 282.3177490234375, "learning_rate": 0.0008945392623553543, "logits/chosen": -10.801773071289062, "logits/rejected": -10.871581077575684, "logps/chosen": -2018.5316162109375, "logps/rejected": -1725.4609375, "loss": 30.2663, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -166.4390869140625, "rewards/margins": -22.965106964111328, "rewards/rejected": -143.47396850585938, "step": 5600 }, { "epoch": 0.32, "grad_norm": 1053.2127685546875, "learning_rate": 0.0008943457564147219, "logits/chosen": -10.717445373535156, "logits/rejected": -10.679351806640625, "logps/chosen": -4470.85009765625, "logps/rejected": -3805.12353515625, "loss": 38.1394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -312.6874084472656, "rewards/margins": -28.807361602783203, "rewards/rejected": -283.8800354003906, "step": 5610 }, { "epoch": 0.33, "grad_norm": 55.555992126464844, "learning_rate": 0.0008941522504740895, "logits/chosen": -8.937202453613281, "logits/rejected": -8.995853424072266, "logps/chosen": -3281.84033203125, "logps/rejected": -3281.522705078125, "loss": 26.9928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.20550537109375, "rewards/margins": 6.942538261413574, "rewards/rejected": -122.14802551269531, "step": 5620 }, { "epoch": 0.33, "grad_norm": 37.5225944519043, "learning_rate": 0.0008939587445334572, "logits/chosen": -9.855813980102539, "logits/rejected": -9.848995208740234, "logps/chosen": -1831.7073974609375, "logps/rejected": -1837.686279296875, "loss": 20.3283, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -90.20549011230469, "rewards/margins": -12.718440055847168, "rewards/rejected": -77.48704528808594, "step": 5630 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0008937652385928248, "logits/chosen": -9.88629150390625, "logits/rejected": -9.816731452941895, "logps/chosen": -2046.5853271484375, "logps/rejected": -1880.3499755859375, "loss": 34.4935, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -135.32176208496094, "rewards/margins": -18.312387466430664, "rewards/rejected": -117.00938415527344, "step": 5640 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0008935717326521925, "logits/chosen": -9.162881851196289, "logits/rejected": -9.182085037231445, "logps/chosen": -1788.220947265625, "logps/rejected": -1307.0723876953125, "loss": 39.6169, "rewards/accuracies": 0.5, "rewards/chosen": -113.81965637207031, "rewards/margins": -15.9968843460083, "rewards/rejected": -97.82276916503906, "step": 5650 }, { "epoch": 0.33, "grad_norm": 1315.727294921875, "learning_rate": 0.0008933782267115601, "logits/chosen": -10.541940689086914, "logits/rejected": -11.3097562789917, "logps/chosen": -2680.55126953125, "logps/rejected": -2413.861572265625, "loss": 48.2289, "rewards/accuracies": 0.5, "rewards/chosen": -201.1003875732422, "rewards/margins": -15.782928466796875, "rewards/rejected": -185.31747436523438, "step": 5660 }, { "epoch": 0.33, "grad_norm": 14.394914627075195, "learning_rate": 0.0008931847207709277, "logits/chosen": -8.868314743041992, "logits/rejected": -8.874650001525879, "logps/chosen": -1294.0738525390625, "logps/rejected": -959.6360473632812, "loss": 21.5731, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -38.44044876098633, "rewards/margins": -14.797101020812988, "rewards/rejected": -23.643346786499023, "step": 5670 }, { "epoch": 0.33, "grad_norm": 0.12642864882946014, "learning_rate": 0.0008929912148302954, "logits/chosen": -8.48918628692627, "logits/rejected": -8.476362228393555, "logps/chosen": -1685.5679931640625, "logps/rejected": -1399.6939697265625, "loss": 29.4106, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.881250381469727, "rewards/margins": -22.516414642333984, "rewards/rejected": -3.3648324012756348, "step": 5680 }, { "epoch": 0.33, "grad_norm": 140.94805908203125, "learning_rate": 0.000892797708889663, "logits/chosen": -11.38328742980957, "logits/rejected": -11.371163368225098, "logps/chosen": -1812.1702880859375, "logps/rejected": -1588.65185546875, "loss": 21.6316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.224254608154297, "rewards/margins": -13.610937118530273, "rewards/rejected": -7.613317966461182, "step": 5690 }, { "epoch": 0.33, "grad_norm": 65.49866485595703, "learning_rate": 0.0008926042029490306, "logits/chosen": -9.22482681274414, "logits/rejected": -9.209634780883789, "logps/chosen": -1651.965576171875, "logps/rejected": -1258.5791015625, "loss": 26.0073, "rewards/accuracies": 0.5, "rewards/chosen": -39.15375518798828, "rewards/margins": -7.431483268737793, "rewards/rejected": -31.722270965576172, "step": 5700 }, { "epoch": 0.33, "grad_norm": 4.79657064715866e-05, "learning_rate": 0.0008924106970083982, "logits/chosen": -7.6124701499938965, "logits/rejected": -7.685220241546631, "logps/chosen": -1903.776611328125, "logps/rejected": -1875.1754150390625, "loss": 25.3818, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.157103061676025, "rewards/margins": -9.588274002075195, "rewards/rejected": 13.745373725891113, "step": 5710 }, { "epoch": 0.33, "grad_norm": 1.5234928149579183e-17, "learning_rate": 0.0008922171910677658, "logits/chosen": -8.271783828735352, "logits/rejected": -8.243013381958008, "logps/chosen": -1419.049072265625, "logps/rejected": -1348.844970703125, "loss": 24.8255, "rewards/accuracies": 0.5, "rewards/chosen": -113.41175842285156, "rewards/margins": -8.170977592468262, "rewards/rejected": -105.24078369140625, "step": 5720 }, { "epoch": 0.33, "grad_norm": 32.40129089355469, "learning_rate": 0.0008920236851271333, "logits/chosen": -10.284769058227539, "logits/rejected": -10.192203521728516, "logps/chosen": -1705.725830078125, "logps/rejected": -1193.517822265625, "loss": 44.7909, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -62.533660888671875, "rewards/margins": -38.99213790893555, "rewards/rejected": -23.54151725769043, "step": 5730 }, { "epoch": 0.33, "grad_norm": 7.225237075303426e-11, "learning_rate": 0.000891830179186501, "logits/chosen": -9.341378211975098, "logits/rejected": -9.335490226745605, "logps/chosen": -1965.0394287109375, "logps/rejected": -1572.9708251953125, "loss": 23.2187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.19234848022461, "rewards/margins": 2.9550843238830566, "rewards/rejected": -37.147438049316406, "step": 5740 }, { "epoch": 0.33, "grad_norm": 0.11327631026506424, "learning_rate": 0.0008916366732458686, "logits/chosen": -8.474296569824219, "logits/rejected": -8.519545555114746, "logps/chosen": -1791.754638671875, "logps/rejected": -1499.66015625, "loss": 33.4797, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -54.34113311767578, "rewards/margins": -31.231613159179688, "rewards/rejected": -23.10952377319336, "step": 5750 }, { "epoch": 0.33, "grad_norm": 8.48308191052638e-05, "learning_rate": 0.0008914431673052363, "logits/chosen": -8.225336074829102, "logits/rejected": -8.234528541564941, "logps/chosen": -2228.408203125, "logps/rejected": -1780.3892822265625, "loss": 50.2481, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -122.06514739990234, "rewards/margins": -47.29398727416992, "rewards/rejected": -74.77116394042969, "step": 5760 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0008912496613646039, "logits/chosen": -8.962305068969727, "logits/rejected": -8.86397933959961, "logps/chosen": -1856.044189453125, "logps/rejected": -1333.978271484375, "loss": 27.3345, "rewards/accuracies": 0.5, "rewards/chosen": -41.66272735595703, "rewards/margins": 4.631596565246582, "rewards/rejected": -46.29432678222656, "step": 5770 }, { "epoch": 0.33, "grad_norm": 26.23997688293457, "learning_rate": 0.0008910561554239715, "logits/chosen": -8.619751930236816, "logits/rejected": -8.587921142578125, "logps/chosen": -2008.7982177734375, "logps/rejected": -1837.1829833984375, "loss": 33.2375, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 4.520230770111084, "rewards/margins": -27.66583251953125, "rewards/rejected": 32.18606948852539, "step": 5780 }, { "epoch": 0.34, "grad_norm": 88.07479095458984, "learning_rate": 0.0008908626494833391, "logits/chosen": -8.460312843322754, "logits/rejected": -8.513134002685547, "logps/chosen": -1864.097900390625, "logps/rejected": -1158.833984375, "loss": 49.3846, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -51.015830993652344, "rewards/margins": -34.3292236328125, "rewards/rejected": -16.68661117553711, "step": 5790 }, { "epoch": 0.34, "grad_norm": 58.54241943359375, "learning_rate": 0.0008906691435427068, "logits/chosen": -13.685696601867676, "logits/rejected": -13.664149284362793, "logps/chosen": -2466.77783203125, "logps/rejected": -1823.490478515625, "loss": 59.103, "rewards/accuracies": 0.5, "rewards/chosen": -134.30621337890625, "rewards/margins": -52.83629608154297, "rewards/rejected": -81.46990966796875, "step": 5800 }, { "epoch": 0.34, "grad_norm": 25.82744598388672, "learning_rate": 0.0008904756376020744, "logits/chosen": -7.705312252044678, "logits/rejected": -7.652497291564941, "logps/chosen": -1713.1771240234375, "logps/rejected": -1676.5718994140625, "loss": 32.1823, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.3120674192905426, "rewards/margins": -25.78310203552246, "rewards/rejected": 26.095169067382812, "step": 5810 }, { "epoch": 0.34, "grad_norm": 82.19534301757812, "learning_rate": 0.000890282131661442, "logits/chosen": -5.46566915512085, "logits/rejected": -5.552099704742432, "logps/chosen": -2210.06884765625, "logps/rejected": -2303.45263671875, "loss": 15.3194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.394464492797852, "rewards/margins": 24.703338623046875, "rewards/rejected": -39.097801208496094, "step": 5820 }, { "epoch": 0.34, "grad_norm": 37.12788772583008, "learning_rate": 0.0008900886257208096, "logits/chosen": -7.358950138092041, "logits/rejected": -7.24532413482666, "logps/chosen": -2054.56494140625, "logps/rejected": -1506.573486328125, "loss": 28.5056, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -86.39486694335938, "rewards/margins": -19.88088035583496, "rewards/rejected": -66.51399230957031, "step": 5830 }, { "epoch": 0.34, "grad_norm": 53.62042999267578, "learning_rate": 0.0008898951197801772, "logits/chosen": -9.171141624450684, "logits/rejected": -9.1994047164917, "logps/chosen": -1543.4395751953125, "logps/rejected": -1257.3856201171875, "loss": 31.369, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -61.1956787109375, "rewards/margins": -28.024028778076172, "rewards/rejected": -33.171653747558594, "step": 5840 }, { "epoch": 0.34, "grad_norm": 0.06519626080989838, "learning_rate": 0.0008897016138395449, "logits/chosen": -7.9544525146484375, "logits/rejected": -7.978518009185791, "logps/chosen": -2074.3486328125, "logps/rejected": -1992.493408203125, "loss": 7.1661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -75.90677642822266, "rewards/margins": 21.238853454589844, "rewards/rejected": -97.14561462402344, "step": 5850 }, { "epoch": 0.34, "grad_norm": 61.59912109375, "learning_rate": 0.0008895081078989126, "logits/chosen": -7.411720275878906, "logits/rejected": -7.333335876464844, "logps/chosen": -2450.93896484375, "logps/rejected": -1876.8203125, "loss": 45.662, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -95.83164978027344, "rewards/margins": -39.86481475830078, "rewards/rejected": -55.96683883666992, "step": 5860 }, { "epoch": 0.34, "grad_norm": 4.0046061258181e-06, "learning_rate": 0.0008893146019582802, "logits/chosen": -9.149152755737305, "logits/rejected": -9.165022850036621, "logps/chosen": -2169.086181640625, "logps/rejected": -2159.45361328125, "loss": 21.1009, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -114.97689056396484, "rewards/margins": -5.737664222717285, "rewards/rejected": -109.2392349243164, "step": 5870 }, { "epoch": 0.34, "grad_norm": 101.79722595214844, "learning_rate": 0.0008891210960176478, "logits/chosen": -9.123533248901367, "logits/rejected": -9.241991996765137, "logps/chosen": -1549.9248046875, "logps/rejected": -1700.453369140625, "loss": 21.3419, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.05876159667969, "rewards/margins": 9.118874549865723, "rewards/rejected": -133.17764282226562, "step": 5880 }, { "epoch": 0.34, "grad_norm": 2.4805970610941586e-09, "learning_rate": 0.0008889275900770154, "logits/chosen": -8.017684936523438, "logits/rejected": -8.137886047363281, "logps/chosen": -1554.6826171875, "logps/rejected": -1661.462158203125, "loss": 14.9427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -61.7480354309082, "rewards/margins": 26.163619995117188, "rewards/rejected": -87.91165161132812, "step": 5890 }, { "epoch": 0.34, "grad_norm": 3.1252645870533113e-12, "learning_rate": 0.000888734084136383, "logits/chosen": -6.989199638366699, "logits/rejected": -6.976412773132324, "logps/chosen": -1622.7894287109375, "logps/rejected": -1136.252197265625, "loss": 29.9648, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -63.626609802246094, "rewards/margins": -19.6286678314209, "rewards/rejected": -43.997947692871094, "step": 5900 }, { "epoch": 0.34, "grad_norm": 42.35055160522461, "learning_rate": 0.0008885405781957507, "logits/chosen": -8.294105529785156, "logits/rejected": -8.204206466674805, "logps/chosen": -1561.3355712890625, "logps/rejected": -1451.203857421875, "loss": 19.9686, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.94519805908203, "rewards/margins": -10.218587875366211, "rewards/rejected": -77.72660827636719, "step": 5910 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.0008883470722551183, "logits/chosen": -7.541112422943115, "logits/rejected": -7.572052955627441, "logps/chosen": -1293.0042724609375, "logps/rejected": -1072.255126953125, "loss": 29.2622, "rewards/accuracies": 0.5, "rewards/chosen": -54.27606201171875, "rewards/margins": -5.279963493347168, "rewards/rejected": -48.996097564697266, "step": 5920 }, { "epoch": 0.34, "grad_norm": 3.890247233040501e-22, "learning_rate": 0.0008881535663144859, "logits/chosen": -6.444117546081543, "logits/rejected": -6.209362983703613, "logps/chosen": -2006.8564453125, "logps/rejected": -1695.364013671875, "loss": 11.5768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.533451080322266, "rewards/margins": 10.35902214050293, "rewards/rejected": -32.89247512817383, "step": 5930 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.0008879600603738535, "logits/chosen": -7.180808067321777, "logits/rejected": -7.232378959655762, "logps/chosen": -1889.751220703125, "logps/rejected": -1783.936279296875, "loss": 18.233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -36.4546012878418, "rewards/margins": 12.834043502807617, "rewards/rejected": -49.28864669799805, "step": 5940 }, { "epoch": 0.34, "grad_norm": 62.33946228027344, "learning_rate": 0.000887766554433221, "logits/chosen": -5.840323448181152, "logits/rejected": -5.766055107116699, "logps/chosen": -1925.486083984375, "logps/rejected": -1387.731689453125, "loss": 23.4889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.882219314575195, "rewards/margins": 3.485215425491333, "rewards/rejected": -27.3674373626709, "step": 5950 }, { "epoch": 0.34, "grad_norm": 78.912109375, "learning_rate": 0.0008875730484925886, "logits/chosen": -6.92788553237915, "logits/rejected": -6.752841949462891, "logps/chosen": -2052.729736328125, "logps/rejected": -1845.146240234375, "loss": 25.5253, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -138.95327758789062, "rewards/margins": -21.676864624023438, "rewards/rejected": -117.27640533447266, "step": 5960 }, { "epoch": 0.35, "grad_norm": 145.68582153320312, "learning_rate": 0.0008873795425519564, "logits/chosen": -6.466187477111816, "logits/rejected": -6.446606636047363, "logps/chosen": -2474.1787109375, "logps/rejected": -2129.59033203125, "loss": 32.7754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -44.85563659667969, "rewards/margins": -23.202178955078125, "rewards/rejected": -21.653451919555664, "step": 5970 }, { "epoch": 0.35, "grad_norm": 6.685911968545653e-22, "learning_rate": 0.000887186036611324, "logits/chosen": -8.337911605834961, "logits/rejected": -8.206003189086914, "logps/chosen": -1722.131591796875, "logps/rejected": -1357.8692626953125, "loss": 33.607, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -84.29328155517578, "rewards/margins": -22.799152374267578, "rewards/rejected": -61.49412155151367, "step": 5980 }, { "epoch": 0.35, "grad_norm": 0.00046105391811579466, "learning_rate": 0.0008869925306706916, "logits/chosen": -7.07720947265625, "logits/rejected": -7.095664978027344, "logps/chosen": -2227.13134765625, "logps/rejected": -1851.7835693359375, "loss": 6.4536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.097698211669922, "rewards/margins": 13.333999633789062, "rewards/rejected": -25.43169593811035, "step": 5990 }, { "epoch": 0.35, "grad_norm": 32.54738235473633, "learning_rate": 0.0008867990247300592, "logits/chosen": -8.064435958862305, "logits/rejected": -7.929145812988281, "logps/chosen": -1698.635009765625, "logps/rejected": -1744.8004150390625, "loss": 19.2329, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.989280700683594, "rewards/margins": -5.163102626800537, "rewards/rejected": -44.82617950439453, "step": 6000 }, { "epoch": 0.35, "grad_norm": 9.989615440368652, "learning_rate": 0.0008866055187894268, "logits/chosen": -7.294398307800293, "logits/rejected": -7.256403923034668, "logps/chosen": -1656.679931640625, "logps/rejected": -1274.7772216796875, "loss": 19.904, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -65.31707763671875, "rewards/margins": -8.861141204833984, "rewards/rejected": -56.45594024658203, "step": 6010 }, { "epoch": 0.35, "grad_norm": 37.82470703125, "learning_rate": 0.0008864120128487945, "logits/chosen": -6.1953582763671875, "logits/rejected": -6.1819562911987305, "logps/chosen": -2016.4833984375, "logps/rejected": -1749.891845703125, "loss": 17.4265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -81.51441192626953, "rewards/margins": -6.439062595367432, "rewards/rejected": -75.07534790039062, "step": 6020 }, { "epoch": 0.35, "grad_norm": 57.133018493652344, "learning_rate": 0.0008862185069081621, "logits/chosen": -7.310229301452637, "logits/rejected": -7.2309136390686035, "logps/chosen": -2310.37451171875, "logps/rejected": -1852.905517578125, "loss": 42.0677, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -82.75735473632812, "rewards/margins": -40.501136779785156, "rewards/rejected": -42.2562141418457, "step": 6030 }, { "epoch": 0.35, "grad_norm": 0.17763634026050568, "learning_rate": 0.0008860250009675297, "logits/chosen": -8.084000587463379, "logits/rejected": -8.092185020446777, "logps/chosen": -1738.399169921875, "logps/rejected": -1460.1773681640625, "loss": 21.5922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.75576400756836, "rewards/margins": -4.928184509277344, "rewards/rejected": -16.82758331298828, "step": 6040 }, { "epoch": 0.35, "grad_norm": 4.976304808224086e-07, "learning_rate": 0.0008858314950268973, "logits/chosen": -8.426023483276367, "logits/rejected": -8.450749397277832, "logps/chosen": -1986.090576171875, "logps/rejected": -1377.6414794921875, "loss": 14.8279, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -66.51394653320312, "rewards/margins": -3.5972800254821777, "rewards/rejected": -62.916664123535156, "step": 6050 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0008856379890862649, "logits/chosen": -7.85089635848999, "logits/rejected": -7.839346408843994, "logps/chosen": -2014.0947265625, "logps/rejected": -1860.9183349609375, "loss": 9.305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.5710391998291, "rewards/margins": 19.26413917541504, "rewards/rejected": -46.835182189941406, "step": 6060 }, { "epoch": 0.35, "grad_norm": 0.004852696787565947, "learning_rate": 0.0008854444831456326, "logits/chosen": -8.797642707824707, "logits/rejected": -8.803516387939453, "logps/chosen": -1884.9486083984375, "logps/rejected": -1605.531005859375, "loss": 24.6439, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -53.30939483642578, "rewards/margins": -17.04891586303711, "rewards/rejected": -36.260475158691406, "step": 6070 }, { "epoch": 0.35, "grad_norm": 6.434332927574005e-11, "learning_rate": 0.0008852509772050003, "logits/chosen": -8.881488800048828, "logits/rejected": -8.902755737304688, "logps/chosen": -1833.2183837890625, "logps/rejected": -1963.184814453125, "loss": 2.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -76.67422485351562, "rewards/margins": 14.126177787780762, "rewards/rejected": -90.8004150390625, "step": 6080 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0008850574712643679, "logits/chosen": -9.446403503417969, "logits/rejected": -9.366372108459473, "logps/chosen": -1765.5111083984375, "logps/rejected": -1740.829345703125, "loss": 18.4098, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -77.15531921386719, "rewards/margins": -0.6348937749862671, "rewards/rejected": -76.52043151855469, "step": 6090 }, { "epoch": 0.35, "grad_norm": 1.4533212322476174e-07, "learning_rate": 0.0008848639653237355, "logits/chosen": -8.759702682495117, "logits/rejected": -8.827855110168457, "logps/chosen": -1778.6109619140625, "logps/rejected": -1539.6884765625, "loss": 44.9728, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -58.0482292175293, "rewards/margins": -41.25770568847656, "rewards/rejected": -16.7905330657959, "step": 6100 }, { "epoch": 0.35, "grad_norm": 52.709590911865234, "learning_rate": 0.0008846704593831031, "logits/chosen": -8.041159629821777, "logits/rejected": -8.087031364440918, "logps/chosen": -2149.97314453125, "logps/rejected": -1999.401611328125, "loss": 44.8684, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -49.19320297241211, "rewards/margins": -33.92534637451172, "rewards/rejected": -15.267855644226074, "step": 6110 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.0008844769534424707, "logits/chosen": -9.420099258422852, "logits/rejected": -9.412813186645508, "logps/chosen": -1605.7286376953125, "logps/rejected": -1471.409423828125, "loss": 11.253, "rewards/accuracies": 0.5, "rewards/chosen": -55.51710891723633, "rewards/margins": 9.699801445007324, "rewards/rejected": -65.21690368652344, "step": 6120 }, { "epoch": 0.35, "grad_norm": 0.00033246821840293705, "learning_rate": 0.0008842834475018384, "logits/chosen": -10.46668529510498, "logits/rejected": -10.412078857421875, "logps/chosen": -2105.96435546875, "logps/rejected": -1891.7447509765625, "loss": 39.8778, "rewards/accuracies": 0.5, "rewards/chosen": -103.9087905883789, "rewards/margins": -26.161113739013672, "rewards/rejected": -77.74767303466797, "step": 6130 }, { "epoch": 0.36, "grad_norm": 37.36909484863281, "learning_rate": 0.000884089941561206, "logits/chosen": -8.063346862792969, "logits/rejected": -8.174836158752441, "logps/chosen": -2197.24462890625, "logps/rejected": -2241.140869140625, "loss": 9.0902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.163671016693115, "rewards/margins": 12.87890338897705, "rewards/rejected": -17.04257583618164, "step": 6140 }, { "epoch": 0.36, "grad_norm": 0.010352320969104767, "learning_rate": 0.0008838964356205736, "logits/chosen": -8.360637664794922, "logits/rejected": -8.219576835632324, "logps/chosen": -1975.450439453125, "logps/rejected": -1610.33203125, "loss": 40.6046, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -136.96646118164062, "rewards/margins": -36.13361358642578, "rewards/rejected": -100.83285522460938, "step": 6150 }, { "epoch": 0.36, "grad_norm": 50.79758834838867, "learning_rate": 0.0008837029296799411, "logits/chosen": -8.80354118347168, "logits/rejected": -8.750330924987793, "logps/chosen": -2093.072998046875, "logps/rejected": -1896.637939453125, "loss": 22.8382, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -64.06664276123047, "rewards/margins": -12.3822021484375, "rewards/rejected": -51.68444061279297, "step": 6160 }, { "epoch": 0.36, "grad_norm": 12.176909446716309, "learning_rate": 0.0008835094237393087, "logits/chosen": -9.246511459350586, "logits/rejected": -9.198073387145996, "logps/chosen": -1890.6793212890625, "logps/rejected": -1859.748779296875, "loss": 5.7057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -53.364219665527344, "rewards/margins": 6.11114501953125, "rewards/rejected": -59.47536087036133, "step": 6170 }, { "epoch": 0.36, "grad_norm": 107.81075286865234, "learning_rate": 0.0008833159177986764, "logits/chosen": -8.133837699890137, "logits/rejected": -8.023698806762695, "logps/chosen": -2209.40234375, "logps/rejected": -2016.333984375, "loss": 20.4794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.93267059326172, "rewards/margins": -15.281248092651367, "rewards/rejected": -72.65142059326172, "step": 6180 }, { "epoch": 0.36, "grad_norm": 0.002329935785382986, "learning_rate": 0.000883122411858044, "logits/chosen": -9.390119552612305, "logits/rejected": -9.369481086730957, "logps/chosen": -2198.811279296875, "logps/rejected": -1596.3748779296875, "loss": 34.5834, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -107.27880859375, "rewards/margins": -27.545246124267578, "rewards/rejected": -79.73356628417969, "step": 6190 }, { "epoch": 0.36, "grad_norm": 5.738190650939941, "learning_rate": 0.0008829289059174117, "logits/chosen": -8.591021537780762, "logits/rejected": -8.587785720825195, "logps/chosen": -1513.2547607421875, "logps/rejected": -1357.2958984375, "loss": 17.445, "rewards/accuracies": 0.5, "rewards/chosen": -81.415283203125, "rewards/margins": -6.233270645141602, "rewards/rejected": -75.18201446533203, "step": 6200 }, { "epoch": 0.36, "grad_norm": 0.015136122703552246, "learning_rate": 0.0008827353999767793, "logits/chosen": -8.328716278076172, "logits/rejected": -8.35205078125, "logps/chosen": -1659.767578125, "logps/rejected": -1547.4951171875, "loss": 19.5617, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -133.03793334960938, "rewards/margins": -8.997316360473633, "rewards/rejected": -124.0406265258789, "step": 6210 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.0008825418940361469, "logits/chosen": -9.209671020507812, "logits/rejected": -9.188681602478027, "logps/chosen": -2055.784423828125, "logps/rejected": -1730.3707275390625, "loss": 26.6457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.57003021240234, "rewards/margins": -14.765329360961914, "rewards/rejected": -84.80470275878906, "step": 6220 }, { "epoch": 0.36, "grad_norm": 47.564186096191406, "learning_rate": 0.0008823483880955145, "logits/chosen": -8.850110054016113, "logits/rejected": -8.796646118164062, "logps/chosen": -1714.845703125, "logps/rejected": -1202.2218017578125, "loss": 17.4199, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -61.68988800048828, "rewards/margins": -0.8568474054336548, "rewards/rejected": -60.833045959472656, "step": 6230 }, { "epoch": 0.36, "grad_norm": 31.021482467651367, "learning_rate": 0.0008821548821548821, "logits/chosen": -8.0305757522583, "logits/rejected": -8.009049415588379, "logps/chosen": -1714.4117431640625, "logps/rejected": -1431.778076171875, "loss": 6.0356, "rewards/accuracies": 0.5, "rewards/chosen": -8.99741268157959, "rewards/margins": 6.848056793212891, "rewards/rejected": -15.845471382141113, "step": 6240 }, { "epoch": 0.36, "grad_norm": 95.42561340332031, "learning_rate": 0.0008819613762142498, "logits/chosen": -9.213711738586426, "logits/rejected": -9.202284812927246, "logps/chosen": -1853.7470703125, "logps/rejected": -1878.1494140625, "loss": 9.6528, "rewards/accuracies": 0.5, "rewards/chosen": -114.2202377319336, "rewards/margins": 3.8681609630584717, "rewards/rejected": -118.08839416503906, "step": 6250 }, { "epoch": 0.36, "grad_norm": 0.0699557363986969, "learning_rate": 0.0008817678702736174, "logits/chosen": -9.283651351928711, "logits/rejected": -9.227164268493652, "logps/chosen": -2071.00634765625, "logps/rejected": -1661.290771484375, "loss": 34.3998, "rewards/accuracies": 0.5, "rewards/chosen": -134.41302490234375, "rewards/margins": -19.5996150970459, "rewards/rejected": -114.81340026855469, "step": 6260 }, { "epoch": 0.36, "grad_norm": 122.83963775634766, "learning_rate": 0.000881574364332985, "logits/chosen": -9.003091812133789, "logits/rejected": -8.817717552185059, "logps/chosen": -2118.56884765625, "logps/rejected": -1411.576416015625, "loss": 28.0403, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5858235359191895, "rewards/margins": -11.308359146118164, "rewards/rejected": 14.894182205200195, "step": 6270 }, { "epoch": 0.36, "grad_norm": 0.23256398737430573, "learning_rate": 0.0008813808583923527, "logits/chosen": -9.055439949035645, "logits/rejected": -9.023421287536621, "logps/chosen": -1621.95751953125, "logps/rejected": -1352.558837890625, "loss": 18.7696, "rewards/accuracies": 0.5, "rewards/chosen": -24.422531127929688, "rewards/margins": -1.0165717601776123, "rewards/rejected": -23.405956268310547, "step": 6280 }, { "epoch": 0.36, "grad_norm": 35.68690490722656, "learning_rate": 0.0008811873524517203, "logits/chosen": -10.113762855529785, "logits/rejected": -9.953336715698242, "logps/chosen": -1669.289794921875, "logps/rejected": -1114.515869140625, "loss": 23.6357, "rewards/accuracies": 0.5, "rewards/chosen": -63.98430252075195, "rewards/margins": -12.287538528442383, "rewards/rejected": -51.6967658996582, "step": 6290 }, { "epoch": 0.36, "grad_norm": 55.3538818359375, "learning_rate": 0.000880993846511088, "logits/chosen": -11.50457763671875, "logits/rejected": -11.398751258850098, "logps/chosen": -1595.195068359375, "logps/rejected": -1518.97998046875, "loss": 16.6694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -97.02625274658203, "rewards/margins": -5.250644207000732, "rewards/rejected": -91.7756118774414, "step": 6300 }, { "epoch": 0.37, "grad_norm": 60.98544692993164, "learning_rate": 0.0008808003405704556, "logits/chosen": -11.842698097229004, "logits/rejected": -11.904720306396484, "logps/chosen": -2307.276611328125, "logps/rejected": -1703.3363037109375, "loss": 31.0829, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.76763916015625, "rewards/margins": -15.93836498260498, "rewards/rejected": -72.82926940917969, "step": 6310 }, { "epoch": 0.37, "grad_norm": 76.64411926269531, "learning_rate": 0.0008806068346298232, "logits/chosen": -12.397570610046387, "logits/rejected": -12.103188514709473, "logps/chosen": -2167.468017578125, "logps/rejected": -1497.1048583984375, "loss": 49.3, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -104.3927001953125, "rewards/margins": -41.28083419799805, "rewards/rejected": -63.11186981201172, "step": 6320 }, { "epoch": 0.37, "grad_norm": 0.9372826218605042, "learning_rate": 0.0008804133286891908, "logits/chosen": -10.253549575805664, "logits/rejected": -10.212906837463379, "logps/chosen": -1814.470458984375, "logps/rejected": -1704.76953125, "loss": 30.7105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -59.32246780395508, "rewards/margins": -4.626654148101807, "rewards/rejected": -54.69581985473633, "step": 6330 }, { "epoch": 0.37, "grad_norm": 801.3389282226562, "learning_rate": 0.0008802198227485584, "logits/chosen": -10.487430572509766, "logits/rejected": -10.128129959106445, "logps/chosen": -1697.0648193359375, "logps/rejected": -1314.263427734375, "loss": 18.1222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -69.87406158447266, "rewards/margins": -2.3905844688415527, "rewards/rejected": -67.48347473144531, "step": 6340 }, { "epoch": 0.37, "grad_norm": 299.10955810546875, "learning_rate": 0.000880026316807926, "logits/chosen": -11.334198951721191, "logits/rejected": -11.015657424926758, "logps/chosen": -2177.337158203125, "logps/rejected": -1816.4107666015625, "loss": 30.3844, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -88.33360290527344, "rewards/margins": -26.550546646118164, "rewards/rejected": -61.7830696105957, "step": 6350 }, { "epoch": 0.37, "grad_norm": 73.7408447265625, "learning_rate": 0.0008798328108672937, "logits/chosen": -10.674715042114258, "logits/rejected": -10.643024444580078, "logps/chosen": -2146.74755859375, "logps/rejected": -1926.886474609375, "loss": 4.7303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -40.799407958984375, "rewards/margins": 9.453516006469727, "rewards/rejected": -50.252925872802734, "step": 6360 }, { "epoch": 0.37, "grad_norm": 6.230246219721802e-22, "learning_rate": 0.0008796393049266613, "logits/chosen": -11.161084175109863, "logits/rejected": -10.891260147094727, "logps/chosen": -1985.232666015625, "logps/rejected": -1625.2001953125, "loss": 41.5635, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -95.84555053710938, "rewards/margins": -31.97792625427246, "rewards/rejected": -63.86762619018555, "step": 6370 }, { "epoch": 0.37, "grad_norm": 3.3672972179696337e-13, "learning_rate": 0.000879445798986029, "logits/chosen": -9.750324249267578, "logits/rejected": -9.833476066589355, "logps/chosen": -1915.32421875, "logps/rejected": -1634.399658203125, "loss": 11.058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -73.15520477294922, "rewards/margins": 20.035228729248047, "rewards/rejected": -93.1904296875, "step": 6380 }, { "epoch": 0.37, "grad_norm": 36.720237731933594, "learning_rate": 0.0008792522930453965, "logits/chosen": -10.192198753356934, "logits/rejected": -10.049596786499023, "logps/chosen": -1484.219970703125, "logps/rejected": -1250.58447265625, "loss": 24.576, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -82.95191192626953, "rewards/margins": -22.603012084960938, "rewards/rejected": -60.3488883972168, "step": 6390 }, { "epoch": 0.37, "grad_norm": 83.5778579711914, "learning_rate": 0.0008790587871047641, "logits/chosen": -9.424667358398438, "logits/rejected": -9.350653648376465, "logps/chosen": -2052.67822265625, "logps/rejected": -1597.140380859375, "loss": 30.9069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -76.56095123291016, "rewards/margins": -11.884432792663574, "rewards/rejected": -64.67652130126953, "step": 6400 }, { "epoch": 0.37, "grad_norm": 32.602962493896484, "learning_rate": 0.0008788652811641317, "logits/chosen": -11.596015930175781, "logits/rejected": -11.62629222869873, "logps/chosen": -1741.3271484375, "logps/rejected": -1697.3880615234375, "loss": 15.2508, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.16241455078125, "rewards/margins": -5.267063140869141, "rewards/rejected": -107.89534759521484, "step": 6410 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.0008786717752234994, "logits/chosen": -12.543805122375488, "logits/rejected": -12.613815307617188, "logps/chosen": -2528.209228515625, "logps/rejected": -2133.246826171875, "loss": 8.6066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -50.34475326538086, "rewards/margins": 12.641194343566895, "rewards/rejected": -62.98594284057617, "step": 6420 }, { "epoch": 0.37, "grad_norm": 101.5355224609375, "learning_rate": 0.000878478269282867, "logits/chosen": -12.104226112365723, "logits/rejected": -12.110036849975586, "logps/chosen": -2032.781982421875, "logps/rejected": -1912.0426025390625, "loss": 23.7803, "rewards/accuracies": 0.5, "rewards/chosen": -97.5978012084961, "rewards/margins": 5.015685081481934, "rewards/rejected": -102.61348724365234, "step": 6430 }, { "epoch": 0.37, "grad_norm": 0.3152647912502289, "learning_rate": 0.0008782847633422346, "logits/chosen": -14.5423002243042, "logits/rejected": -14.163467407226562, "logps/chosen": -2264.447998046875, "logps/rejected": -2073.14013671875, "loss": 19.3031, "rewards/accuracies": 0.5, "rewards/chosen": -87.5877685546875, "rewards/margins": -11.966981887817383, "rewards/rejected": -75.62078857421875, "step": 6440 }, { "epoch": 0.37, "grad_norm": 6.571404531641178e-11, "learning_rate": 0.0008780912574016022, "logits/chosen": -14.21406364440918, "logits/rejected": -14.568181991577148, "logps/chosen": -2269.562255859375, "logps/rejected": -2069.960693359375, "loss": 10.7928, "rewards/accuracies": 0.5, "rewards/chosen": -82.32341003417969, "rewards/margins": 3.4461681842803955, "rewards/rejected": -85.76958465576172, "step": 6450 }, { "epoch": 0.37, "grad_norm": 59.4739990234375, "learning_rate": 0.0008778977514609698, "logits/chosen": -11.30133056640625, "logits/rejected": -11.05750846862793, "logps/chosen": -1760.4945068359375, "logps/rejected": -1506.529541015625, "loss": 22.1296, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.04769897460938, "rewards/margins": -14.63825511932373, "rewards/rejected": -122.4094467163086, "step": 6460 }, { "epoch": 0.37, "grad_norm": 3.432631731033325, "learning_rate": 0.0008777042455203374, "logits/chosen": -10.219499588012695, "logits/rejected": -10.26535701751709, "logps/chosen": -1721.1109619140625, "logps/rejected": -1587.36767578125, "loss": 20.097, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -67.68658447265625, "rewards/margins": -6.542067050933838, "rewards/rejected": -61.1445198059082, "step": 6470 }, { "epoch": 0.38, "grad_norm": 111.03977966308594, "learning_rate": 0.0008775107395797051, "logits/chosen": -10.037527084350586, "logits/rejected": -9.914299964904785, "logps/chosen": -2109.729248046875, "logps/rejected": -1599.85546875, "loss": 27.8775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -99.83526611328125, "rewards/margins": -1.439121961593628, "rewards/rejected": -98.39615631103516, "step": 6480 }, { "epoch": 0.38, "grad_norm": 56.555946350097656, "learning_rate": 0.0008773172336390728, "logits/chosen": -9.664022445678711, "logits/rejected": -9.39038372039795, "logps/chosen": -2398.631591796875, "logps/rejected": -1794.3541259765625, "loss": 26.6458, "rewards/accuracies": 0.5, "rewards/chosen": -30.592632293701172, "rewards/margins": -9.084268569946289, "rewards/rejected": -21.508363723754883, "step": 6490 }, { "epoch": 0.38, "grad_norm": 75.0048599243164, "learning_rate": 0.0008771237276984404, "logits/chosen": -11.61182975769043, "logits/rejected": -11.603328704833984, "logps/chosen": -2099.998291015625, "logps/rejected": -1990.1898193359375, "loss": 13.9689, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -33.73950958251953, "rewards/margins": -11.774518013000488, "rewards/rejected": -21.964996337890625, "step": 6500 }, { "epoch": 0.38, "grad_norm": 0.7912886738777161, "learning_rate": 0.000876930221757808, "logits/chosen": -11.045933723449707, "logits/rejected": -10.383996963500977, "logps/chosen": -2082.4736328125, "logps/rejected": -1772.7691650390625, "loss": 31.5694, "rewards/accuracies": 0.5, "rewards/chosen": -175.2657928466797, "rewards/margins": -25.06426239013672, "rewards/rejected": -150.20152282714844, "step": 6510 }, { "epoch": 0.38, "grad_norm": 64.32669830322266, "learning_rate": 0.0008767367158171756, "logits/chosen": -9.926701545715332, "logits/rejected": -9.719742774963379, "logps/chosen": -2737.47705078125, "logps/rejected": -2467.1943359375, "loss": 25.2922, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.39566040039062, "rewards/margins": -22.70621681213379, "rewards/rejected": -105.689453125, "step": 6520 }, { "epoch": 0.38, "grad_norm": 60.98577880859375, "learning_rate": 0.0008765432098765433, "logits/chosen": -9.729456901550293, "logits/rejected": -9.415105819702148, "logps/chosen": -2008.099853515625, "logps/rejected": -1419.1136474609375, "loss": 44.4519, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -74.33241271972656, "rewards/margins": -36.93561553955078, "rewards/rejected": -37.39679718017578, "step": 6530 }, { "epoch": 0.38, "grad_norm": 7.37584926469026e-08, "learning_rate": 0.0008763497039359109, "logits/chosen": -7.338755130767822, "logits/rejected": -7.27411413192749, "logps/chosen": -2770.127685546875, "logps/rejected": -2400.47412109375, "loss": 31.422, "rewards/accuracies": 0.5, "rewards/chosen": -82.56845092773438, "rewards/margins": -22.482067108154297, "rewards/rejected": -60.08638381958008, "step": 6540 }, { "epoch": 0.38, "grad_norm": 21.895925521850586, "learning_rate": 0.0008761561979952785, "logits/chosen": -9.165700912475586, "logits/rejected": -9.064946174621582, "logps/chosen": -1564.138427734375, "logps/rejected": -1068.3526611328125, "loss": 41.7461, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -90.67708587646484, "rewards/margins": -41.56708526611328, "rewards/rejected": -49.10999298095703, "step": 6550 }, { "epoch": 0.38, "grad_norm": 0.48325806856155396, "learning_rate": 0.0008759626920546461, "logits/chosen": -8.695196151733398, "logits/rejected": -8.609362602233887, "logps/chosen": -1702.6624755859375, "logps/rejected": -1287.8194580078125, "loss": 25.1077, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -54.68352508544922, "rewards/margins": -16.10295295715332, "rewards/rejected": -38.58057403564453, "step": 6560 }, { "epoch": 0.38, "grad_norm": 83.24081420898438, "learning_rate": 0.0008757691861140137, "logits/chosen": -7.5710129737854, "logits/rejected": -7.567612648010254, "logps/chosen": -1498.9371337890625, "logps/rejected": -1617.263916015625, "loss": 34.8491, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -54.49201583862305, "rewards/margins": -25.690933227539062, "rewards/rejected": -28.80108642578125, "step": 6570 }, { "epoch": 0.38, "grad_norm": 21.57459831237793, "learning_rate": 0.0008755756801733813, "logits/chosen": -4.845503807067871, "logits/rejected": -4.760678768157959, "logps/chosen": -2458.965576171875, "logps/rejected": -2231.3828125, "loss": 8.9925, "rewards/accuracies": 0.5, "rewards/chosen": -38.352272033691406, "rewards/margins": -1.2129243612289429, "rewards/rejected": -37.139347076416016, "step": 6580 }, { "epoch": 0.38, "grad_norm": 0.09874758124351501, "learning_rate": 0.0008753821742327491, "logits/chosen": -8.200390815734863, "logits/rejected": -8.0620698928833, "logps/chosen": -1987.8515625, "logps/rejected": -1545.03466796875, "loss": 49.9772, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -96.61250305175781, "rewards/margins": -46.85480880737305, "rewards/rejected": -49.757694244384766, "step": 6590 }, { "epoch": 0.38, "grad_norm": 52.678916931152344, "learning_rate": 0.0008751886682921167, "logits/chosen": -10.332071304321289, "logits/rejected": -10.313127517700195, "logps/chosen": -1400.9520263671875, "logps/rejected": -1190.1568603515625, "loss": 6.6426, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -78.89602661132812, "rewards/margins": 2.5890145301818848, "rewards/rejected": -81.48503112792969, "step": 6600 }, { "epoch": 0.38, "grad_norm": 110.99188995361328, "learning_rate": 0.0008749951623514842, "logits/chosen": -8.50182056427002, "logits/rejected": -8.329888343811035, "logps/chosen": -1978.921142578125, "logps/rejected": -1439.91943359375, "loss": 28.6149, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -55.1355094909668, "rewards/margins": -23.194950103759766, "rewards/rejected": -31.940563201904297, "step": 6610 }, { "epoch": 0.38, "grad_norm": 55.171173095703125, "learning_rate": 0.0008748016564108518, "logits/chosen": -8.54262924194336, "logits/rejected": -8.432287216186523, "logps/chosen": -1716.842041015625, "logps/rejected": -1679.072509765625, "loss": 18.058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -63.372398376464844, "rewards/margins": -7.541548252105713, "rewards/rejected": -55.830848693847656, "step": 6620 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.0008746081504702194, "logits/chosen": -8.696554183959961, "logits/rejected": -8.68676471710205, "logps/chosen": -1888.591064453125, "logps/rejected": -1816.947998046875, "loss": 21.5648, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -86.63678741455078, "rewards/margins": -1.5545852184295654, "rewards/rejected": -85.08219909667969, "step": 6630 }, { "epoch": 0.38, "grad_norm": 10.437118530273438, "learning_rate": 0.000874414644529587, "logits/chosen": -8.351812362670898, "logits/rejected": -8.069616317749023, "logps/chosen": -1971.835693359375, "logps/rejected": -1901.231689453125, "loss": 21.9273, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -42.341835021972656, "rewards/margins": -17.069473266601562, "rewards/rejected": -25.27235984802246, "step": 6640 }, { "epoch": 0.38, "grad_norm": 92.1724853515625, "learning_rate": 0.0008742211385889547, "logits/chosen": -8.880758285522461, "logits/rejected": -9.41309642791748, "logps/chosen": -2460.782470703125, "logps/rejected": -2717.57568359375, "loss": 9.6602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -56.85761642456055, "rewards/margins": 22.42013168334961, "rewards/rejected": -79.27774810791016, "step": 6650 }, { "epoch": 0.39, "grad_norm": 105.58134460449219, "learning_rate": 0.0008740276326483223, "logits/chosen": -10.781906127929688, "logits/rejected": -10.91404914855957, "logps/chosen": -1904.589111328125, "logps/rejected": -1763.4027099609375, "loss": 19.9493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -98.20540618896484, "rewards/margins": 0.0041336058638989925, "rewards/rejected": -98.20954895019531, "step": 6660 }, { "epoch": 0.39, "grad_norm": 56.656394958496094, "learning_rate": 0.0008738341267076899, "logits/chosen": -8.70432186126709, "logits/rejected": -8.451395034790039, "logps/chosen": -2897.909912109375, "logps/rejected": -2223.856689453125, "loss": 27.7183, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -83.73600769042969, "rewards/margins": -16.987863540649414, "rewards/rejected": -66.74813842773438, "step": 6670 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0008736406207670575, "logits/chosen": -8.253484725952148, "logits/rejected": -8.210945129394531, "logps/chosen": -1640.195068359375, "logps/rejected": -1293.936279296875, "loss": 11.0265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.322589874267578, "rewards/margins": 34.066871643066406, "rewards/rejected": -52.38945770263672, "step": 6680 }, { "epoch": 0.39, "grad_norm": 72.18408203125, "learning_rate": 0.0008734471148264251, "logits/chosen": -8.302095413208008, "logits/rejected": -8.160555839538574, "logps/chosen": -1891.5257568359375, "logps/rejected": -1514.2071533203125, "loss": 33.4036, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -65.93086242675781, "rewards/margins": -25.2603702545166, "rewards/rejected": -40.670494079589844, "step": 6690 }, { "epoch": 0.39, "grad_norm": 0.001026625162921846, "learning_rate": 0.0008732536088857929, "logits/chosen": -8.714038848876953, "logits/rejected": -8.702186584472656, "logps/chosen": -1988.5618896484375, "logps/rejected": -1798.91015625, "loss": 30.4519, "rewards/accuracies": 0.5, "rewards/chosen": -24.728710174560547, "rewards/margins": -10.604402542114258, "rewards/rejected": -14.124307632446289, "step": 6700 }, { "epoch": 0.39, "grad_norm": 71.90420532226562, "learning_rate": 0.0008730601029451605, "logits/chosen": -9.326496124267578, "logits/rejected": -9.103557586669922, "logps/chosen": -1756.0189208984375, "logps/rejected": -1256.5211181640625, "loss": 45.0133, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -138.20989990234375, "rewards/margins": -37.80144119262695, "rewards/rejected": -100.40846252441406, "step": 6710 }, { "epoch": 0.39, "grad_norm": 6.910803580870845e-14, "learning_rate": 0.0008728665970045281, "logits/chosen": -9.875821113586426, "logits/rejected": -9.839662551879883, "logps/chosen": -1438.241455078125, "logps/rejected": -1385.6259765625, "loss": 23.2806, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -58.99311447143555, "rewards/margins": -11.496040344238281, "rewards/rejected": -47.49706268310547, "step": 6720 }, { "epoch": 0.39, "grad_norm": 339.1512145996094, "learning_rate": 0.0008726730910638957, "logits/chosen": -16.635225296020508, "logits/rejected": -16.3215389251709, "logps/chosen": -2143.004638671875, "logps/rejected": -1783.410888671875, "loss": 29.3719, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -149.88919067382812, "rewards/margins": -27.6795597076416, "rewards/rejected": -122.2096176147461, "step": 6730 }, { "epoch": 0.39, "grad_norm": 0.11664097756147385, "learning_rate": 0.0008724795851232633, "logits/chosen": -11.477678298950195, "logits/rejected": -11.25867748260498, "logps/chosen": -2267.40234375, "logps/rejected": -1663.2562255859375, "loss": 28.1867, "rewards/accuracies": 0.5, "rewards/chosen": -82.41314697265625, "rewards/margins": -14.420344352722168, "rewards/rejected": -67.9927978515625, "step": 6740 }, { "epoch": 0.39, "grad_norm": 23.868513107299805, "learning_rate": 0.0008722860791826309, "logits/chosen": -9.939851760864258, "logits/rejected": -9.66556453704834, "logps/chosen": -1712.037841796875, "logps/rejected": -1116.851318359375, "loss": 46.8695, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -58.42435836791992, "rewards/margins": -42.68519973754883, "rewards/rejected": -15.739161491394043, "step": 6750 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0008720925732419986, "logits/chosen": -8.505311012268066, "logits/rejected": -8.480542182922363, "logps/chosen": -1820.8427734375, "logps/rejected": -1649.6380615234375, "loss": 29.2928, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -39.83086013793945, "rewards/margins": -9.822132110595703, "rewards/rejected": -30.008724212646484, "step": 6760 }, { "epoch": 0.39, "grad_norm": 5.814396217829199e-07, "learning_rate": 0.0008718990673013662, "logits/chosen": -12.049081802368164, "logits/rejected": -11.972306251525879, "logps/chosen": -2056.54638671875, "logps/rejected": -1654.0015869140625, "loss": 23.9367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -61.52448272705078, "rewards/margins": -0.9684234857559204, "rewards/rejected": -60.556060791015625, "step": 6770 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0008717055613607338, "logits/chosen": -17.8784122467041, "logits/rejected": -17.518264770507812, "logps/chosen": -3048.25537109375, "logps/rejected": -2550.13134765625, "loss": 35.2792, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -123.2108154296875, "rewards/margins": -17.172576904296875, "rewards/rejected": -106.03824615478516, "step": 6780 }, { "epoch": 0.39, "grad_norm": 29.724929809570312, "learning_rate": 0.0008715120554201014, "logits/chosen": -13.146807670593262, "logits/rejected": -12.931230545043945, "logps/chosen": -2103.91650390625, "logps/rejected": -1302.761474609375, "loss": 60.7547, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -89.19351959228516, "rewards/margins": -48.18867492675781, "rewards/rejected": -41.004844665527344, "step": 6790 }, { "epoch": 0.39, "grad_norm": 6.009246445175427e-15, "learning_rate": 0.0008713185494794691, "logits/chosen": -9.097837448120117, "logits/rejected": -8.940574645996094, "logps/chosen": -1836.085205078125, "logps/rejected": -1500.7086181640625, "loss": 27.8387, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -72.36978912353516, "rewards/margins": -19.586933135986328, "rewards/rejected": -52.7828483581543, "step": 6800 }, { "epoch": 0.39, "grad_norm": 0.0008511324413120747, "learning_rate": 0.0008711250435388368, "logits/chosen": -9.895654678344727, "logits/rejected": -9.671564102172852, "logps/chosen": -1868.553955078125, "logps/rejected": -1475.8887939453125, "loss": 12.2785, "rewards/accuracies": 0.5, "rewards/chosen": -42.9716796875, "rewards/margins": 4.450358867645264, "rewards/rejected": -47.422035217285156, "step": 6810 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0008709315375982043, "logits/chosen": -8.513850212097168, "logits/rejected": -8.168153762817383, "logps/chosen": -2531.919921875, "logps/rejected": -1908.637939453125, "loss": 39.1703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.850201606750488, "rewards/margins": -17.113534927368164, "rewards/rejected": 22.963741302490234, "step": 6820 }, { "epoch": 0.4, "grad_norm": 205.1920623779297, "learning_rate": 0.0008707380316575719, "logits/chosen": -8.491333961486816, "logits/rejected": -7.860390663146973, "logps/chosen": -1788.2415771484375, "logps/rejected": -931.5144653320312, "loss": 55.5017, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.77880859375, "rewards/margins": -45.491844177246094, "rewards/rejected": -61.286964416503906, "step": 6830 }, { "epoch": 0.4, "grad_norm": 35.77861785888672, "learning_rate": 0.0008705445257169395, "logits/chosen": -9.745943069458008, "logits/rejected": -9.554886817932129, "logps/chosen": -2038.266845703125, "logps/rejected": -1790.1168212890625, "loss": 21.77, "rewards/accuracies": 0.5, "rewards/chosen": -82.85193634033203, "rewards/margins": -9.718101501464844, "rewards/rejected": -73.13383483886719, "step": 6840 }, { "epoch": 0.4, "grad_norm": 56.10224914550781, "learning_rate": 0.0008703510197763071, "logits/chosen": -10.62658977508545, "logits/rejected": -10.54056167602539, "logps/chosen": -1866.6480712890625, "logps/rejected": -1356.3270263671875, "loss": 11.2596, "rewards/accuracies": 0.5, "rewards/chosen": -83.00439453125, "rewards/margins": 14.764147758483887, "rewards/rejected": -97.76853942871094, "step": 6850 }, { "epoch": 0.4, "grad_norm": 33.07064437866211, "learning_rate": 0.0008701575138356747, "logits/chosen": -9.818769454956055, "logits/rejected": -9.617403030395508, "logps/chosen": -1757.7855224609375, "logps/rejected": -1566.260498046875, "loss": 23.0703, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -103.19212341308594, "rewards/margins": -6.5943450927734375, "rewards/rejected": -96.59778594970703, "step": 6860 }, { "epoch": 0.4, "grad_norm": 69.66670227050781, "learning_rate": 0.0008699640078950423, "logits/chosen": -9.509882926940918, "logits/rejected": -9.450806617736816, "logps/chosen": -2152.11279296875, "logps/rejected": -1740.0224609375, "loss": 20.7602, "rewards/accuracies": 0.5, "rewards/chosen": -43.51544952392578, "rewards/margins": -8.904756546020508, "rewards/rejected": -34.610694885253906, "step": 6870 }, { "epoch": 0.4, "grad_norm": 87.37122344970703, "learning_rate": 0.00086977050195441, "logits/chosen": -9.934014320373535, "logits/rejected": -9.810328483581543, "logps/chosen": -1920.96484375, "logps/rejected": -1570.9703369140625, "loss": 28.1086, "rewards/accuracies": 0.5, "rewards/chosen": -88.52485656738281, "rewards/margins": -21.2171630859375, "rewards/rejected": -67.30770111083984, "step": 6880 }, { "epoch": 0.4, "grad_norm": 9.121141619772288e-10, "learning_rate": 0.0008695769960137776, "logits/chosen": -9.390485763549805, "logits/rejected": -9.201550483703613, "logps/chosen": -2215.08251953125, "logps/rejected": -1907.117431640625, "loss": 34.2525, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -74.56327056884766, "rewards/margins": -23.511070251464844, "rewards/rejected": -51.05219268798828, "step": 6890 }, { "epoch": 0.4, "grad_norm": 36.452667236328125, "learning_rate": 0.0008693834900731452, "logits/chosen": -9.949320793151855, "logits/rejected": -9.850556373596191, "logps/chosen": -1707.6322021484375, "logps/rejected": -1496.3780517578125, "loss": 20.0829, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -64.7446060180664, "rewards/margins": -18.261028289794922, "rewards/rejected": -46.48357391357422, "step": 6900 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.0008691899841325129, "logits/chosen": -12.316043853759766, "logits/rejected": -12.283346176147461, "logps/chosen": -1878.1439208984375, "logps/rejected": -1552.470458984375, "loss": 27.7495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -86.77204895019531, "rewards/margins": -4.784618377685547, "rewards/rejected": -81.98743438720703, "step": 6910 }, { "epoch": 0.4, "grad_norm": 4.272459008802798e-08, "learning_rate": 0.0008689964781918805, "logits/chosen": -12.106769561767578, "logits/rejected": -11.79650592803955, "logps/chosen": -1551.1884765625, "logps/rejected": -1070.089111328125, "loss": 43.9712, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -127.63228607177734, "rewards/margins": -39.098209381103516, "rewards/rejected": -88.53408813476562, "step": 6920 }, { "epoch": 0.4, "grad_norm": 333.85028076171875, "learning_rate": 0.0008688029722512482, "logits/chosen": -10.296130180358887, "logits/rejected": -10.134958267211914, "logps/chosen": -2464.86376953125, "logps/rejected": -1799.201904296875, "loss": 39.5336, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -137.78970336914062, "rewards/margins": -32.07671356201172, "rewards/rejected": -105.71297454833984, "step": 6930 }, { "epoch": 0.4, "grad_norm": 196.97244262695312, "learning_rate": 0.0008686094663106158, "logits/chosen": -10.66039752960205, "logits/rejected": -10.539789199829102, "logps/chosen": -2496.349365234375, "logps/rejected": -2228.713134765625, "loss": 39.5393, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -162.28585815429688, "rewards/margins": -31.05314064025879, "rewards/rejected": -131.23272705078125, "step": 6940 }, { "epoch": 0.4, "grad_norm": 64.68879699707031, "learning_rate": 0.0008684159603699834, "logits/chosen": -9.619019508361816, "logits/rejected": -9.589400291442871, "logps/chosen": -2568.568359375, "logps/rejected": -1923.59375, "loss": 24.5542, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.46043395996094, "rewards/margins": -12.967005729675293, "rewards/rejected": -87.4934310913086, "step": 6950 }, { "epoch": 0.4, "grad_norm": 75.92729187011719, "learning_rate": 0.000868222454429351, "logits/chosen": -12.125444412231445, "logits/rejected": -12.217458724975586, "logps/chosen": -2194.350830078125, "logps/rejected": -1885.751220703125, "loss": 48.8091, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -145.60336303710938, "rewards/margins": -28.27927589416504, "rewards/rejected": -117.32408142089844, "step": 6960 }, { "epoch": 0.4, "grad_norm": 35.767459869384766, "learning_rate": 0.0008680289484887186, "logits/chosen": -11.943679809570312, "logits/rejected": -11.893170356750488, "logps/chosen": -1889.0660400390625, "logps/rejected": -1850.424072265625, "loss": 8.9092, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -116.78236389160156, "rewards/margins": 0.23660048842430115, "rewards/rejected": -117.01895904541016, "step": 6970 }, { "epoch": 0.4, "grad_norm": 10.755067825317383, "learning_rate": 0.0008678354425480862, "logits/chosen": -13.4208984375, "logits/rejected": -13.377573013305664, "logps/chosen": -1918.362060546875, "logps/rejected": -1696.9654541015625, "loss": 21.0812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.56529235839844, "rewards/margins": -13.7930326461792, "rewards/rejected": -103.77226257324219, "step": 6980 }, { "epoch": 0.4, "grad_norm": 38.93715286254883, "learning_rate": 0.0008676419366074539, "logits/chosen": -11.873128890991211, "logits/rejected": -11.824071884155273, "logps/chosen": -2043.3511962890625, "logps/rejected": -1685.756103515625, "loss": 23.0697, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -96.01583099365234, "rewards/margins": -12.85528564453125, "rewards/rejected": -83.1605453491211, "step": 6990 }, { "epoch": 0.41, "grad_norm": 30.05426788330078, "learning_rate": 0.0008674484306668215, "logits/chosen": -8.227350234985352, "logits/rejected": -8.280291557312012, "logps/chosen": -1782.074951171875, "logps/rejected": -1385.265869140625, "loss": 19.1175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -104.52549743652344, "rewards/margins": -13.578641891479492, "rewards/rejected": -90.94685363769531, "step": 7000 }, { "epoch": 0.41, "grad_norm": 93.13705444335938, "learning_rate": 0.0008672549247261892, "logits/chosen": -9.409618377685547, "logits/rejected": -9.360101699829102, "logps/chosen": -1521.42822265625, "logps/rejected": -953.4093627929688, "loss": 29.5424, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -48.493011474609375, "rewards/margins": -19.12203598022461, "rewards/rejected": -29.370981216430664, "step": 7010 }, { "epoch": 0.41, "grad_norm": 52.18618392944336, "learning_rate": 0.0008670614187855568, "logits/chosen": -9.994035720825195, "logits/rejected": -9.891830444335938, "logps/chosen": -1623.8336181640625, "logps/rejected": -1635.3870849609375, "loss": 7.5479, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -128.6238250732422, "rewards/margins": 3.66984486579895, "rewards/rejected": -132.2936553955078, "step": 7020 }, { "epoch": 0.41, "grad_norm": 6.869473736514919e-07, "learning_rate": 0.0008668679128449244, "logits/chosen": -9.045571327209473, "logits/rejected": -8.929712295532227, "logps/chosen": -2129.859619140625, "logps/rejected": -2098.643310546875, "loss": 46.6433, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -75.76799774169922, "rewards/margins": -41.0828971862793, "rewards/rejected": -34.68510055541992, "step": 7030 }, { "epoch": 0.41, "grad_norm": 62.970760345458984, "learning_rate": 0.000866674406904292, "logits/chosen": -10.413273811340332, "logits/rejected": -10.11892032623291, "logps/chosen": -2503.54248046875, "logps/rejected": -1890.012451171875, "loss": 40.4578, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -109.56709289550781, "rewards/margins": -30.100055694580078, "rewards/rejected": -79.467041015625, "step": 7040 }, { "epoch": 0.41, "grad_norm": 8.997082245887356e-11, "learning_rate": 0.0008664809009636596, "logits/chosen": -10.112990379333496, "logits/rejected": -10.23200798034668, "logps/chosen": -3248.5166015625, "logps/rejected": -2686.328857421875, "loss": 18.932, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -102.60665130615234, "rewards/margins": -6.584376335144043, "rewards/rejected": -96.02228546142578, "step": 7050 }, { "epoch": 0.41, "grad_norm": 98.45115661621094, "learning_rate": 0.0008662873950230272, "logits/chosen": -11.272207260131836, "logits/rejected": -11.060379028320312, "logps/chosen": -2150.96728515625, "logps/rejected": -1915.894775390625, "loss": 18.5221, "rewards/accuracies": 0.5, "rewards/chosen": -98.56517028808594, "rewards/margins": -4.031407356262207, "rewards/rejected": -94.53376770019531, "step": 7060 }, { "epoch": 0.41, "grad_norm": 59.29792785644531, "learning_rate": 0.0008660938890823948, "logits/chosen": -10.88613510131836, "logits/rejected": -10.769407272338867, "logps/chosen": -2012.2650146484375, "logps/rejected": -1947.2672119140625, "loss": 14.6686, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -106.27659606933594, "rewards/margins": -6.697096824645996, "rewards/rejected": -99.57949829101562, "step": 7070 }, { "epoch": 0.41, "grad_norm": 90.96517944335938, "learning_rate": 0.0008659003831417624, "logits/chosen": -9.402990341186523, "logits/rejected": -9.072113990783691, "logps/chosen": -2213.71923828125, "logps/rejected": -1899.746337890625, "loss": 40.5732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -105.3582534790039, "rewards/margins": -22.811237335205078, "rewards/rejected": -82.54702758789062, "step": 7080 }, { "epoch": 0.41, "grad_norm": 81.0233383178711, "learning_rate": 0.00086570687720113, "logits/chosen": -9.507278442382812, "logits/rejected": -9.165011405944824, "logps/chosen": -2304.892333984375, "logps/rejected": -1805.515869140625, "loss": 47.5373, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -114.9049301147461, "rewards/margins": -40.16100311279297, "rewards/rejected": -74.74394226074219, "step": 7090 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.0008655133712604977, "logits/chosen": -10.860208511352539, "logits/rejected": -10.28819465637207, "logps/chosen": -2057.46923828125, "logps/rejected": -1670.893310546875, "loss": 9.2679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.14728546142578, "rewards/margins": 6.708780765533447, "rewards/rejected": -39.8560676574707, "step": 7100 }, { "epoch": 0.41, "grad_norm": 1.5943547641446005e-15, "learning_rate": 0.0008653198653198653, "logits/chosen": -11.220335006713867, "logits/rejected": -11.224405288696289, "logps/chosen": -1894.8193359375, "logps/rejected": -1955.211669921875, "loss": 3.5217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.30145263671875, "rewards/margins": 7.391821384429932, "rewards/rejected": -161.69326782226562, "step": 7110 }, { "epoch": 0.41, "grad_norm": 0.0006426826003007591, "learning_rate": 0.000865126359379233, "logits/chosen": -10.714062690734863, "logits/rejected": -10.57845401763916, "logps/chosen": -2067.119384765625, "logps/rejected": -1602.96728515625, "loss": 8.9666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -70.43340301513672, "rewards/margins": 13.750470161437988, "rewards/rejected": -84.18388366699219, "step": 7120 }, { "epoch": 0.41, "grad_norm": 4.3808021446545276e-12, "learning_rate": 0.0008649328534386006, "logits/chosen": -9.53447437286377, "logits/rejected": -9.439828872680664, "logps/chosen": -2233.49169921875, "logps/rejected": -2121.15966796875, "loss": 16.206, "rewards/accuracies": 0.5, "rewards/chosen": -47.1335334777832, "rewards/margins": -3.6246047019958496, "rewards/rejected": -43.5089225769043, "step": 7130 }, { "epoch": 0.41, "grad_norm": 1.1153709997109926e-16, "learning_rate": 0.0008647393474979682, "logits/chosen": -8.125386238098145, "logits/rejected": -8.179621696472168, "logps/chosen": -1624.083740234375, "logps/rejected": -1865.517578125, "loss": 7.8466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9450550079345703, "rewards/margins": 1.5835800170898438, "rewards/rejected": 2.3614742755889893, "step": 7140 }, { "epoch": 0.41, "grad_norm": 28.598825454711914, "learning_rate": 0.0008645458415573358, "logits/chosen": -8.409189224243164, "logits/rejected": -8.38463020324707, "logps/chosen": -2105.724609375, "logps/rejected": -1548.140869140625, "loss": 37.8584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.359548568725586, "rewards/margins": -17.486406326293945, "rewards/rejected": -13.873141288757324, "step": 7150 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.0008643523356167035, "logits/chosen": -10.857210159301758, "logits/rejected": -10.835911750793457, "logps/chosen": -2135.79638671875, "logps/rejected": -2047.0101318359375, "loss": 18.6286, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -66.50211334228516, "rewards/margins": 3.7343125343322754, "rewards/rejected": -70.2364273071289, "step": 7160 }, { "epoch": 0.42, "grad_norm": 9.99477063612364e-12, "learning_rate": 0.0008641588296760711, "logits/chosen": -10.438894271850586, "logits/rejected": -10.450922012329102, "logps/chosen": -1760.5142822265625, "logps/rejected": -1867.9351806640625, "loss": 6.934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -72.89271545410156, "rewards/margins": 4.87632942199707, "rewards/rejected": -77.76903533935547, "step": 7170 }, { "epoch": 0.42, "grad_norm": 30.941503524780273, "learning_rate": 0.0008639653237354387, "logits/chosen": -9.221868515014648, "logits/rejected": -9.16167163848877, "logps/chosen": -1686.3050537109375, "logps/rejected": -1288.6954345703125, "loss": 32.6282, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -37.6967658996582, "rewards/margins": -8.940925598144531, "rewards/rejected": -28.755834579467773, "step": 7180 }, { "epoch": 0.42, "grad_norm": 9.280323028564453, "learning_rate": 0.0008637718177948063, "logits/chosen": -9.071664810180664, "logits/rejected": -9.053747177124023, "logps/chosen": -1508.8365478515625, "logps/rejected": -1361.249267578125, "loss": 22.0691, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -50.18547821044922, "rewards/margins": -15.606466293334961, "rewards/rejected": -34.579002380371094, "step": 7190 }, { "epoch": 0.42, "grad_norm": 69.10192108154297, "learning_rate": 0.0008635783118541739, "logits/chosen": -9.314603805541992, "logits/rejected": -8.996822357177734, "logps/chosen": -1886.3372802734375, "logps/rejected": -1176.6353759765625, "loss": 50.6447, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -145.09207153320312, "rewards/margins": -50.23185348510742, "rewards/rejected": -94.86021423339844, "step": 7200 }, { "epoch": 0.42, "grad_norm": 3.154691163115141e-17, "learning_rate": 0.0008633848059135415, "logits/chosen": -10.162216186523438, "logits/rejected": -9.920019149780273, "logps/chosen": -1914.3765869140625, "logps/rejected": -1512.802490234375, "loss": 18.564, "rewards/accuracies": 0.5, "rewards/chosen": -22.877153396606445, "rewards/margins": -1.8285003900527954, "rewards/rejected": -21.04865264892578, "step": 7210 }, { "epoch": 0.42, "grad_norm": 114.33583068847656, "learning_rate": 0.0008631912999729093, "logits/chosen": -10.58851432800293, "logits/rejected": -10.445540428161621, "logps/chosen": -2082.4755859375, "logps/rejected": -1532.823974609375, "loss": 27.9002, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.824962615966797, "rewards/margins": -3.910768508911133, "rewards/rejected": -2.9141952991485596, "step": 7220 }, { "epoch": 0.42, "grad_norm": 236.30970764160156, "learning_rate": 0.0008629977940322769, "logits/chosen": -9.732476234436035, "logits/rejected": -9.477334976196289, "logps/chosen": -1999.3297119140625, "logps/rejected": -2046.708984375, "loss": 43.6912, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -66.07090759277344, "rewards/margins": -41.033729553222656, "rewards/rejected": -25.03718376159668, "step": 7230 }, { "epoch": 0.42, "grad_norm": 905.987060546875, "learning_rate": 0.0008628042880916445, "logits/chosen": -11.33475399017334, "logits/rejected": -11.38975715637207, "logps/chosen": -1723.585205078125, "logps/rejected": -1598.9794921875, "loss": 35.5533, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -118.3096694946289, "rewards/margins": -24.78799819946289, "rewards/rejected": -93.52165222167969, "step": 7240 }, { "epoch": 0.42, "grad_norm": 69.61677551269531, "learning_rate": 0.0008626107821510121, "logits/chosen": -12.84095573425293, "logits/rejected": -12.459924697875977, "logps/chosen": -2777.27392578125, "logps/rejected": -2317.8583984375, "loss": 38.6379, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -194.69911193847656, "rewards/margins": -35.92977523803711, "rewards/rejected": -158.76934814453125, "step": 7250 }, { "epoch": 0.42, "grad_norm": 68.9135513305664, "learning_rate": 0.0008624172762103796, "logits/chosen": -10.586821556091309, "logits/rejected": -10.641862869262695, "logps/chosen": -1729.5859375, "logps/rejected": -1618.8072509765625, "loss": 8.4419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -64.92793273925781, "rewards/margins": 3.5594794750213623, "rewards/rejected": -68.48741149902344, "step": 7260 }, { "epoch": 0.42, "grad_norm": 0.6808004975318909, "learning_rate": 0.0008622237702697473, "logits/chosen": -10.685372352600098, "logits/rejected": -10.555482864379883, "logps/chosen": -1470.7177734375, "logps/rejected": -1499.8193359375, "loss": 27.8021, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -85.59321594238281, "rewards/margins": -25.465091705322266, "rewards/rejected": -60.12813186645508, "step": 7270 }, { "epoch": 0.42, "grad_norm": 158.60073852539062, "learning_rate": 0.0008620302643291149, "logits/chosen": -12.343809127807617, "logits/rejected": -12.9099702835083, "logps/chosen": -2507.3291015625, "logps/rejected": -2453.65087890625, "loss": 17.2259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.2856903076172, "rewards/margins": 4.303152561187744, "rewards/rejected": -146.58885192871094, "step": 7280 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.0008618367583884825, "logits/chosen": -12.260185241699219, "logits/rejected": -12.111019134521484, "logps/chosen": -3045.6416015625, "logps/rejected": -2931.9638671875, "loss": 32.6896, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -74.06229400634766, "rewards/margins": 5.314451694488525, "rewards/rejected": -79.37674713134766, "step": 7290 }, { "epoch": 0.42, "grad_norm": 3.5468706300889608e-06, "learning_rate": 0.0008616432524478501, "logits/chosen": -13.952531814575195, "logits/rejected": -13.840314865112305, "logps/chosen": -2347.287109375, "logps/rejected": -2387.161376953125, "loss": 14.7022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.4220733642578, "rewards/margins": 13.956225395202637, "rewards/rejected": -175.3782958984375, "step": 7300 }, { "epoch": 0.42, "grad_norm": 175.4353790283203, "learning_rate": 0.0008614497465072177, "logits/chosen": -13.991750717163086, "logits/rejected": -13.541491508483887, "logps/chosen": -2654.54833984375, "logps/rejected": -2047.039794921875, "loss": 59.8454, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -223.9896240234375, "rewards/margins": -54.64417266845703, "rewards/rejected": -169.34544372558594, "step": 7310 }, { "epoch": 0.42, "grad_norm": 178.7499542236328, "learning_rate": 0.0008612562405665853, "logits/chosen": -13.912376403808594, "logits/rejected": -12.908197402954102, "logps/chosen": -2787.952880859375, "logps/rejected": -2230.482177734375, "loss": 42.324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.90023040771484, "rewards/margins": -29.808582305908203, "rewards/rejected": -96.09165954589844, "step": 7320 }, { "epoch": 0.42, "grad_norm": 66.24224090576172, "learning_rate": 0.0008610627346259531, "logits/chosen": -11.963964462280273, "logits/rejected": -11.762716293334961, "logps/chosen": -1944.1790771484375, "logps/rejected": -1640.463134765625, "loss": 27.4872, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -48.808624267578125, "rewards/margins": -23.975168228149414, "rewards/rejected": -24.83345603942871, "step": 7330 }, { "epoch": 0.42, "grad_norm": 51.05625534057617, "learning_rate": 0.0008608692286853207, "logits/chosen": -12.25973892211914, "logits/rejected": -11.931768417358398, "logps/chosen": -2241.258056640625, "logps/rejected": -1505.7384033203125, "loss": 34.04, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -54.838539123535156, "rewards/margins": -13.859725952148438, "rewards/rejected": -40.97880935668945, "step": 7340 }, { "epoch": 0.43, "grad_norm": 2.938853327361102e-11, "learning_rate": 0.0008606757227446883, "logits/chosen": -14.34484577178955, "logits/rejected": -14.91022777557373, "logps/chosen": -2473.5634765625, "logps/rejected": -2709.3857421875, "loss": 19.0695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -76.3145751953125, "rewards/margins": -4.740095615386963, "rewards/rejected": -71.5744857788086, "step": 7350 }, { "epoch": 0.43, "grad_norm": 63.76220703125, "learning_rate": 0.0008604822168040559, "logits/chosen": -13.418672561645508, "logits/rejected": -13.244318008422852, "logps/chosen": -2472.10595703125, "logps/rejected": -2368.41455078125, "loss": 43.234, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -124.6176986694336, "rewards/margins": -31.30227279663086, "rewards/rejected": -93.31542205810547, "step": 7360 }, { "epoch": 0.43, "grad_norm": 70.53128814697266, "learning_rate": 0.0008602887108634235, "logits/chosen": -13.919126510620117, "logits/rejected": -13.41283130645752, "logps/chosen": -1545.131591796875, "logps/rejected": -1319.330810546875, "loss": 28.5491, "rewards/accuracies": 0.5, "rewards/chosen": -71.91690063476562, "rewards/margins": -24.661588668823242, "rewards/rejected": -47.25531768798828, "step": 7370 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0008600952049227911, "logits/chosen": -11.21763801574707, "logits/rejected": -11.075881004333496, "logps/chosen": -1990.0357666015625, "logps/rejected": -1719.478271484375, "loss": 16.5097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 27.056249618530273, "rewards/margins": 27.777271270751953, "rewards/rejected": -0.7210174798965454, "step": 7380 }, { "epoch": 0.43, "grad_norm": 55.49911880493164, "learning_rate": 0.0008599016989821588, "logits/chosen": -10.670808792114258, "logits/rejected": -10.766693115234375, "logps/chosen": -1640.532470703125, "logps/rejected": -1761.3076171875, "loss": 10.8029, "rewards/accuracies": 0.5, "rewards/chosen": -63.75934600830078, "rewards/margins": 5.598813056945801, "rewards/rejected": -69.358154296875, "step": 7390 }, { "epoch": 0.43, "grad_norm": 33.85157012939453, "learning_rate": 0.0008597081930415264, "logits/chosen": -11.148295402526855, "logits/rejected": -11.168340682983398, "logps/chosen": -1657.944091796875, "logps/rejected": -1549.59765625, "loss": 9.0303, "rewards/accuracies": 0.5, "rewards/chosen": -65.01802062988281, "rewards/margins": 16.752948760986328, "rewards/rejected": -81.7709732055664, "step": 7400 }, { "epoch": 0.43, "grad_norm": 23.458742141723633, "learning_rate": 0.000859514687100894, "logits/chosen": -10.419133186340332, "logits/rejected": -10.378741264343262, "logps/chosen": -1524.0718994140625, "logps/rejected": -1498.58544921875, "loss": 15.9588, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -47.28992462158203, "rewards/margins": 2.3131699562072754, "rewards/rejected": -49.60309600830078, "step": 7410 }, { "epoch": 0.43, "grad_norm": 51.932220458984375, "learning_rate": 0.0008593211811602616, "logits/chosen": -8.555538177490234, "logits/rejected": -8.4712553024292, "logps/chosen": -2343.60986328125, "logps/rejected": -1954.9456787109375, "loss": 16.4421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 21.113067626953125, "rewards/margins": 5.2650604248046875, "rewards/rejected": 15.848004341125488, "step": 7420 }, { "epoch": 0.43, "grad_norm": 2.714874003639225e-15, "learning_rate": 0.0008591276752196293, "logits/chosen": -10.5311918258667, "logits/rejected": -10.475287437438965, "logps/chosen": -1879.2633056640625, "logps/rejected": -1715.966552734375, "loss": 17.8073, "rewards/accuracies": 0.5, "rewards/chosen": -44.36362075805664, "rewards/margins": 17.495418548583984, "rewards/rejected": -61.859031677246094, "step": 7430 }, { "epoch": 0.43, "grad_norm": 8.895235061645508, "learning_rate": 0.000858934169278997, "logits/chosen": -12.29762077331543, "logits/rejected": -12.234722137451172, "logps/chosen": -1872.721923828125, "logps/rejected": -1689.2132568359375, "loss": 10.3556, "rewards/accuracies": 0.5, "rewards/chosen": -112.29573059082031, "rewards/margins": 6.411862373352051, "rewards/rejected": -118.70759582519531, "step": 7440 }, { "epoch": 0.43, "grad_norm": 0.006625693291425705, "learning_rate": 0.0008587406633383646, "logits/chosen": -10.627034187316895, "logits/rejected": -10.319314956665039, "logps/chosen": -2146.94580078125, "logps/rejected": -1225.8446044921875, "loss": 73.802, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -170.8644561767578, "rewards/margins": -72.88534545898438, "rewards/rejected": -97.97911834716797, "step": 7450 }, { "epoch": 0.43, "grad_norm": 159.89886474609375, "learning_rate": 0.0008585471573977322, "logits/chosen": -8.775386810302734, "logits/rejected": -8.766707420349121, "logps/chosen": -2155.64013671875, "logps/rejected": -2239.296630859375, "loss": 23.1788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -61.67158889770508, "rewards/margins": -5.077703952789307, "rewards/rejected": -56.5938835144043, "step": 7460 }, { "epoch": 0.43, "grad_norm": 41.292274475097656, "learning_rate": 0.0008583536514570998, "logits/chosen": -10.377801895141602, "logits/rejected": -10.168482780456543, "logps/chosen": -1884.473876953125, "logps/rejected": -1506.676025390625, "loss": 35.5934, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -122.548828125, "rewards/margins": -34.62006759643555, "rewards/rejected": -87.92875671386719, "step": 7470 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0008581601455164673, "logits/chosen": -12.32559585571289, "logits/rejected": -12.089773178100586, "logps/chosen": -1994.5823974609375, "logps/rejected": -1611.52685546875, "loss": 25.5808, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -109.24283599853516, "rewards/margins": -5.0883307456970215, "rewards/rejected": -104.15449523925781, "step": 7480 }, { "epoch": 0.43, "grad_norm": 70.25814056396484, "learning_rate": 0.0008579666395758349, "logits/chosen": -12.386465072631836, "logits/rejected": -12.438722610473633, "logps/chosen": -2062.7353515625, "logps/rejected": -1664.3648681640625, "loss": 28.121, "rewards/accuracies": 0.5, "rewards/chosen": -60.6330680847168, "rewards/margins": -18.37996482849121, "rewards/rejected": -42.25310516357422, "step": 7490 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0008577731336352026, "logits/chosen": -12.448206901550293, "logits/rejected": -12.279117584228516, "logps/chosen": -2428.901611328125, "logps/rejected": -2022.83984375, "loss": 13.5744, "rewards/accuracies": 0.5, "rewards/chosen": -57.0822639465332, "rewards/margins": 18.69806671142578, "rewards/rejected": -75.78032684326172, "step": 7500 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.0008575796276945702, "logits/chosen": -13.49262523651123, "logits/rejected": -13.266426086425781, "logps/chosen": -1923.5562744140625, "logps/rejected": -1600.2044677734375, "loss": 32.6896, "rewards/accuracies": 0.5, "rewards/chosen": -24.505176544189453, "rewards/margins": -4.693928241729736, "rewards/rejected": -19.81124496459961, "step": 7510 }, { "epoch": 0.44, "grad_norm": 3.743392066509216e-23, "learning_rate": 0.0008573861217539378, "logits/chosen": -13.73973560333252, "logits/rejected": -13.367835998535156, "logps/chosen": -1881.698486328125, "logps/rejected": -1606.5831298828125, "loss": 32.6267, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -79.62825012207031, "rewards/margins": -21.638731002807617, "rewards/rejected": -57.9895133972168, "step": 7520 }, { "epoch": 0.44, "grad_norm": 0.000322170730214566, "learning_rate": 0.0008571926158133054, "logits/chosen": -13.988980293273926, "logits/rejected": -13.48460578918457, "logps/chosen": -2041.745361328125, "logps/rejected": -1579.96484375, "loss": 24.065, "rewards/accuracies": 0.5, "rewards/chosen": -52.18518829345703, "rewards/margins": -2.102670192718506, "rewards/rejected": -50.082515716552734, "step": 7530 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.0008569991098726731, "logits/chosen": -15.434438705444336, "logits/rejected": -15.218017578125, "logps/chosen": -1803.469970703125, "logps/rejected": -1660.462890625, "loss": 12.704, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -79.84783935546875, "rewards/margins": 2.5555641651153564, "rewards/rejected": -82.40340423583984, "step": 7540 }, { "epoch": 0.44, "grad_norm": 51.51584243774414, "learning_rate": 0.0008568056039320407, "logits/chosen": -11.89107894897461, "logits/rejected": -11.798151969909668, "logps/chosen": -2096.5634765625, "logps/rejected": -1558.1580810546875, "loss": 58.0417, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.0933837890625, "rewards/margins": -41.69196319580078, "rewards/rejected": -95.40142059326172, "step": 7550 }, { "epoch": 0.44, "grad_norm": 158.88973999023438, "learning_rate": 0.0008566120979914084, "logits/chosen": -11.855794906616211, "logits/rejected": -11.65953254699707, "logps/chosen": -2107.677001953125, "logps/rejected": -1540.8572998046875, "loss": 21.5911, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -44.10844039916992, "rewards/margins": 4.8038835525512695, "rewards/rejected": -48.912322998046875, "step": 7560 }, { "epoch": 0.44, "grad_norm": 0.029460452497005463, "learning_rate": 0.000856418592050776, "logits/chosen": -12.594331741333008, "logits/rejected": -12.331802368164062, "logps/chosen": -2262.41162109375, "logps/rejected": -2058.181640625, "loss": 5.8242, "rewards/accuracies": 0.5, "rewards/chosen": -55.06853103637695, "rewards/margins": 2.9947009086608887, "rewards/rejected": -58.063232421875, "step": 7570 }, { "epoch": 0.44, "grad_norm": 0.010104164481163025, "learning_rate": 0.0008562250861101436, "logits/chosen": -17.29134750366211, "logits/rejected": -17.124954223632812, "logps/chosen": -2138.873291015625, "logps/rejected": -2063.718505859375, "loss": 9.6866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -123.1226577758789, "rewards/margins": -0.9925479888916016, "rewards/rejected": -122.1301040649414, "step": 7580 }, { "epoch": 0.44, "grad_norm": 0.01651068963110447, "learning_rate": 0.0008560315801695112, "logits/chosen": -15.449792861938477, "logits/rejected": -15.407590866088867, "logps/chosen": -2665.929931640625, "logps/rejected": -2362.27685546875, "loss": 7.1147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -123.834228515625, "rewards/margins": 11.719388961791992, "rewards/rejected": -135.55361938476562, "step": 7590 }, { "epoch": 0.44, "grad_norm": 112.38958740234375, "learning_rate": 0.0008558380742288788, "logits/chosen": -15.060714721679688, "logits/rejected": -14.969882011413574, "logps/chosen": -2593.329345703125, "logps/rejected": -2209.73486328125, "loss": 38.6667, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -226.02767944335938, "rewards/margins": -28.973073959350586, "rewards/rejected": -197.0546112060547, "step": 7600 }, { "epoch": 0.44, "grad_norm": 86.1656494140625, "learning_rate": 0.0008556445682882465, "logits/chosen": -17.636648178100586, "logits/rejected": -17.656320571899414, "logps/chosen": -2013.1341552734375, "logps/rejected": -1683.445556640625, "loss": 34.317, "rewards/accuracies": 0.5, "rewards/chosen": -137.94558715820312, "rewards/margins": -24.111431121826172, "rewards/rejected": -113.83415222167969, "step": 7610 }, { "epoch": 0.44, "grad_norm": 204.59176635742188, "learning_rate": 0.0008554510623476141, "logits/chosen": -11.648767471313477, "logits/rejected": -11.522418975830078, "logps/chosen": -2041.2855224609375, "logps/rejected": -2087.197265625, "loss": 35.7372, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.22689437866211, "rewards/margins": -9.626382827758789, "rewards/rejected": -21.60051727294922, "step": 7620 }, { "epoch": 0.44, "grad_norm": 36.23899841308594, "learning_rate": 0.0008552575564069817, "logits/chosen": -11.205111503601074, "logits/rejected": -11.166365623474121, "logps/chosen": -1908.8277587890625, "logps/rejected": -1532.169677734375, "loss": 27.0476, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.35691261291504, "rewards/margins": -9.77667236328125, "rewards/rejected": -18.58024024963379, "step": 7630 }, { "epoch": 0.44, "grad_norm": 129.73167419433594, "learning_rate": 0.0008550640504663494, "logits/chosen": -8.920096397399902, "logits/rejected": -8.88783073425293, "logps/chosen": -2096.984375, "logps/rejected": -1826.028076171875, "loss": 0.7686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.326141357421875, "rewards/margins": 27.27508544921875, "rewards/rejected": -29.601226806640625, "step": 7640 }, { "epoch": 0.44, "grad_norm": 103.6015625, "learning_rate": 0.000854870544525717, "logits/chosen": -7.016652584075928, "logits/rejected": -6.999429225921631, "logps/chosen": -2583.97705078125, "logps/rejected": -2429.036865234375, "loss": 10.396, "rewards/accuracies": 0.5, "rewards/chosen": -75.07349395751953, "rewards/margins": -1.090182900428772, "rewards/rejected": -73.98331451416016, "step": 7650 }, { "epoch": 0.44, "grad_norm": 1.9930006265640259, "learning_rate": 0.0008546770385850846, "logits/chosen": -7.7720746994018555, "logits/rejected": -7.630414009094238, "logps/chosen": -2438.921142578125, "logps/rejected": -2180.074462890625, "loss": 22.4826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -58.9058723449707, "rewards/margins": -13.977168083190918, "rewards/rejected": -44.9286994934082, "step": 7660 }, { "epoch": 0.44, "grad_norm": 25.264162063598633, "learning_rate": 0.0008544835326444523, "logits/chosen": -10.706464767456055, "logits/rejected": -10.558186531066895, "logps/chosen": -1708.9720458984375, "logps/rejected": -1162.9739990234375, "loss": 47.5712, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.4449462890625, "rewards/margins": -41.368186950683594, "rewards/rejected": -28.076763153076172, "step": 7670 }, { "epoch": 0.44, "grad_norm": 85.95588684082031, "learning_rate": 0.0008542900267038199, "logits/chosen": -12.865801811218262, "logits/rejected": -12.809163093566895, "logps/chosen": -1925.784423828125, "logps/rejected": -2010.3636474609375, "loss": 7.5402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -92.86473083496094, "rewards/margins": 12.195963859558105, "rewards/rejected": -105.0606918334961, "step": 7680 }, { "epoch": 0.45, "grad_norm": 32.45277786254883, "learning_rate": 0.0008540965207631875, "logits/chosen": -13.4279146194458, "logits/rejected": -13.236867904663086, "logps/chosen": -2208.823486328125, "logps/rejected": -2152.209716796875, "loss": 20.5261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.01931762695312, "rewards/margins": -4.026934623718262, "rewards/rejected": -148.99237060546875, "step": 7690 }, { "epoch": 0.45, "grad_norm": 89.14533996582031, "learning_rate": 0.000853903014822555, "logits/chosen": -12.312238693237305, "logits/rejected": -12.288602828979492, "logps/chosen": -2223.75244140625, "logps/rejected": -1885.881591796875, "loss": 20.2414, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -35.92803955078125, "rewards/margins": 2.4051513671875, "rewards/rejected": -38.333187103271484, "step": 7700 }, { "epoch": 0.45, "grad_norm": 95.84602355957031, "learning_rate": 0.0008537095088819226, "logits/chosen": -11.648237228393555, "logits/rejected": -11.662137031555176, "logps/chosen": -2127.9755859375, "logps/rejected": -1775.212646484375, "loss": 41.0621, "rewards/accuracies": 0.5, "rewards/chosen": -136.0567626953125, "rewards/margins": -24.421485900878906, "rewards/rejected": -111.6352767944336, "step": 7710 }, { "epoch": 0.45, "grad_norm": 58.2656364440918, "learning_rate": 0.0008535160029412902, "logits/chosen": -10.806682586669922, "logits/rejected": -10.825296401977539, "logps/chosen": -2708.27880859375, "logps/rejected": -2295.876953125, "loss": 23.6138, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -41.439842224121094, "rewards/margins": -3.124314069747925, "rewards/rejected": -38.315528869628906, "step": 7720 }, { "epoch": 0.45, "grad_norm": 33.40242004394531, "learning_rate": 0.0008533224970006579, "logits/chosen": -12.755836486816406, "logits/rejected": -12.284402847290039, "logps/chosen": -1780.244873046875, "logps/rejected": -1602.46337890625, "loss": 31.4495, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -134.7027130126953, "rewards/margins": -29.198261260986328, "rewards/rejected": -105.50447082519531, "step": 7730 }, { "epoch": 0.45, "grad_norm": 59.18415451049805, "learning_rate": 0.0008531289910600255, "logits/chosen": -10.79655933380127, "logits/rejected": -10.818564414978027, "logps/chosen": -2093.695068359375, "logps/rejected": -2034.786865234375, "loss": 12.5194, "rewards/accuracies": 0.5, "rewards/chosen": -44.02165222167969, "rewards/margins": 3.349499225616455, "rewards/rejected": -47.371150970458984, "step": 7740 }, { "epoch": 0.45, "grad_norm": 78.2341537475586, "learning_rate": 0.0008529354851193932, "logits/chosen": -11.505974769592285, "logits/rejected": -11.463403701782227, "logps/chosen": -2101.4580078125, "logps/rejected": -1619.1748046875, "loss": 21.8571, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -105.03129577636719, "rewards/margins": -9.349931716918945, "rewards/rejected": -95.68136596679688, "step": 7750 }, { "epoch": 0.45, "grad_norm": 27.889381408691406, "learning_rate": 0.0008527419791787608, "logits/chosen": -12.589147567749023, "logits/rejected": -12.451037406921387, "logps/chosen": -1950.156982421875, "logps/rejected": -1636.8897705078125, "loss": 32.5589, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -101.91912841796875, "rewards/margins": -31.644893646240234, "rewards/rejected": -70.27423095703125, "step": 7760 }, { "epoch": 0.45, "grad_norm": 1.5679115676903166e-05, "learning_rate": 0.0008525484732381284, "logits/chosen": -11.872479438781738, "logits/rejected": -11.755937576293945, "logps/chosen": -2496.850341796875, "logps/rejected": -2056.18505859375, "loss": 38.2713, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -96.31331634521484, "rewards/margins": -32.71796417236328, "rewards/rejected": -63.59535598754883, "step": 7770 }, { "epoch": 0.45, "grad_norm": 55.343421936035156, "learning_rate": 0.000852354967297496, "logits/chosen": -12.4028959274292, "logits/rejected": -12.172440528869629, "logps/chosen": -2109.47021484375, "logps/rejected": -1916.0992431640625, "loss": 28.9006, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -145.97158813476562, "rewards/margins": -25.788589477539062, "rewards/rejected": -120.1830062866211, "step": 7780 }, { "epoch": 0.45, "grad_norm": 2.269619618751418e-21, "learning_rate": 0.0008521614613568637, "logits/chosen": -11.470766067504883, "logits/rejected": -11.288606643676758, "logps/chosen": -2055.059814453125, "logps/rejected": -2007.6575927734375, "loss": 8.1926, "rewards/accuracies": 0.5, "rewards/chosen": -46.33222198486328, "rewards/margins": 18.496545791625977, "rewards/rejected": -64.82877349853516, "step": 7790 }, { "epoch": 0.45, "grad_norm": 75.76029968261719, "learning_rate": 0.0008519679554162313, "logits/chosen": -12.246456146240234, "logits/rejected": -11.931106567382812, "logps/chosen": -2136.385009765625, "logps/rejected": -1818.2718505859375, "loss": 17.8877, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -65.51808166503906, "rewards/margins": -12.554842948913574, "rewards/rejected": -52.963233947753906, "step": 7800 }, { "epoch": 0.45, "grad_norm": 1.3982375094201416e-05, "learning_rate": 0.0008517744494755989, "logits/chosen": -14.284814834594727, "logits/rejected": -14.168462753295898, "logps/chosen": -2063.62060546875, "logps/rejected": -1928.785888671875, "loss": 11.3678, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -121.71510314941406, "rewards/margins": -0.3022560179233551, "rewards/rejected": -121.412841796875, "step": 7810 }, { "epoch": 0.45, "grad_norm": 174.43836975097656, "learning_rate": 0.0008515809435349665, "logits/chosen": -15.808034896850586, "logits/rejected": -15.321508407592773, "logps/chosen": -3090.02880859375, "logps/rejected": -2779.741455078125, "loss": 27.2724, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.5367889404297, "rewards/margins": -18.712663650512695, "rewards/rejected": -135.82412719726562, "step": 7820 }, { "epoch": 0.45, "grad_norm": 61.598716735839844, "learning_rate": 0.0008513874375943341, "logits/chosen": -13.191482543945312, "logits/rejected": -13.300302505493164, "logps/chosen": -2347.914794921875, "logps/rejected": -2402.32470703125, "loss": 25.5308, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.02464294433594, "rewards/margins": -8.22734260559082, "rewards/rejected": -97.79731750488281, "step": 7830 }, { "epoch": 0.45, "grad_norm": 73.76385498046875, "learning_rate": 0.0008511939316537018, "logits/chosen": -11.1047945022583, "logits/rejected": -10.79496955871582, "logps/chosen": -1848.3336181640625, "logps/rejected": -1567.7027587890625, "loss": 37.0224, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.39286041259766, "rewards/margins": -20.17038917541504, "rewards/rejected": -96.22246551513672, "step": 7840 }, { "epoch": 0.45, "grad_norm": 85.5385971069336, "learning_rate": 0.0008510004257130695, "logits/chosen": -9.578425407409668, "logits/rejected": -9.61109733581543, "logps/chosen": -2028.943359375, "logps/rejected": -2190.8046875, "loss": 10.859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 42.65761184692383, "rewards/margins": 13.125833511352539, "rewards/rejected": 29.531780242919922, "step": 7850 }, { "epoch": 0.45, "grad_norm": 12.131361961364746, "learning_rate": 0.0008508069197724371, "logits/chosen": -10.643616676330566, "logits/rejected": -10.80659294128418, "logps/chosen": -2128.9140625, "logps/rejected": -1922.0269775390625, "loss": 16.5083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.750722885131836, "rewards/margins": -0.5891166925430298, "rewards/rejected": -28.161602020263672, "step": 7860 }, { "epoch": 0.46, "grad_norm": 72.14212036132812, "learning_rate": 0.0008506134138318047, "logits/chosen": -9.919371604919434, "logits/rejected": -9.910785675048828, "logps/chosen": -1581.3267822265625, "logps/rejected": -1438.288330078125, "loss": 6.605, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -56.7694091796875, "rewards/margins": -0.48773232102394104, "rewards/rejected": -56.28166961669922, "step": 7870 }, { "epoch": 0.46, "grad_norm": 3.558673137919586e-16, "learning_rate": 0.0008504199078911723, "logits/chosen": -9.450790405273438, "logits/rejected": -9.342379570007324, "logps/chosen": -2305.01806640625, "logps/rejected": -1776.437255859375, "loss": 34.9914, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -82.30255126953125, "rewards/margins": -28.137752532958984, "rewards/rejected": -54.164794921875, "step": 7880 }, { "epoch": 0.46, "grad_norm": 2.401040477550964e-10, "learning_rate": 0.00085022640195054, "logits/chosen": -9.542730331420898, "logits/rejected": -9.438782691955566, "logps/chosen": -2354.869873046875, "logps/rejected": -1636.3115234375, "loss": 15.1918, "rewards/accuracies": 0.5, "rewards/chosen": -20.693206787109375, "rewards/margins": 2.3164801597595215, "rewards/rejected": -23.009689331054688, "step": 7890 }, { "epoch": 0.46, "grad_norm": 111.53289794921875, "learning_rate": 0.0008500328960099076, "logits/chosen": -8.982931137084961, "logits/rejected": -8.963179588317871, "logps/chosen": -2167.5048828125, "logps/rejected": -1723.2943115234375, "loss": 27.2762, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -81.33428955078125, "rewards/margins": -16.833683013916016, "rewards/rejected": -64.50060272216797, "step": 7900 }, { "epoch": 0.46, "grad_norm": 36.73378372192383, "learning_rate": 0.0008498393900692751, "logits/chosen": -11.18378734588623, "logits/rejected": -10.669061660766602, "logps/chosen": -2392.798095703125, "logps/rejected": -1994.5667724609375, "loss": 28.2672, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -80.53648376464844, "rewards/margins": -22.77521514892578, "rewards/rejected": -57.761260986328125, "step": 7910 }, { "epoch": 0.46, "grad_norm": 33.61345291137695, "learning_rate": 0.0008496458841286427, "logits/chosen": -11.669434547424316, "logits/rejected": -11.659773826599121, "logps/chosen": -1918.838623046875, "logps/rejected": -1599.888916015625, "loss": 21.2289, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -85.86065673828125, "rewards/margins": -16.151687622070312, "rewards/rejected": -69.70896911621094, "step": 7920 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.0008494523781880103, "logits/chosen": -10.327880859375, "logits/rejected": -10.259561538696289, "logps/chosen": -2171.49072265625, "logps/rejected": -1577.91015625, "loss": 23.4269, "rewards/accuracies": 0.5, "rewards/chosen": -58.73308181762695, "rewards/margins": 23.448123931884766, "rewards/rejected": -82.18120574951172, "step": 7930 }, { "epoch": 0.46, "grad_norm": 1.227417419005089e-13, "learning_rate": 0.0008492588722473779, "logits/chosen": -13.19688606262207, "logits/rejected": -13.110623359680176, "logps/chosen": -2024.623779296875, "logps/rejected": -1792.395263671875, "loss": 22.1613, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -167.2003936767578, "rewards/margins": -11.394377708435059, "rewards/rejected": -155.80601501464844, "step": 7940 }, { "epoch": 0.46, "grad_norm": 0.04607251659035683, "learning_rate": 0.0008490653663067457, "logits/chosen": -12.025535583496094, "logits/rejected": -12.360681533813477, "logps/chosen": -2263.19287109375, "logps/rejected": -2420.89501953125, "loss": 13.1298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -132.03932189941406, "rewards/margins": 10.340827941894531, "rewards/rejected": -142.38015747070312, "step": 7950 }, { "epoch": 0.46, "grad_norm": 0.9125139117240906, "learning_rate": 0.0008488718603661133, "logits/chosen": -11.432101249694824, "logits/rejected": -11.10465145111084, "logps/chosen": -1857.2421875, "logps/rejected": -1929.006591796875, "loss": 23.6786, "rewards/accuracies": 0.5, "rewards/chosen": -75.81059265136719, "rewards/margins": -11.727150917053223, "rewards/rejected": -64.08344268798828, "step": 7960 }, { "epoch": 0.46, "grad_norm": 34.09746170043945, "learning_rate": 0.0008486783544254809, "logits/chosen": -9.504829406738281, "logits/rejected": -10.799922943115234, "logps/chosen": -2221.48193359375, "logps/rejected": -1687.7161865234375, "loss": 18.3733, "rewards/accuracies": 0.5, "rewards/chosen": -54.1668815612793, "rewards/margins": 4.530727386474609, "rewards/rejected": -58.697608947753906, "step": 7970 }, { "epoch": 0.46, "grad_norm": 53.77346420288086, "learning_rate": 0.0008484848484848485, "logits/chosen": -11.140767097473145, "logits/rejected": -11.063794136047363, "logps/chosen": -2320.19287109375, "logps/rejected": -2046.0289306640625, "loss": 23.2567, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.98161315917969, "rewards/margins": -21.87795639038086, "rewards/rejected": -95.10365295410156, "step": 7980 }, { "epoch": 0.46, "grad_norm": 122.67521667480469, "learning_rate": 0.0008482913425442161, "logits/chosen": -13.522825241088867, "logits/rejected": -13.283981323242188, "logps/chosen": -1839.0384521484375, "logps/rejected": -1688.1090087890625, "loss": 19.781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -148.38531494140625, "rewards/margins": -9.087632179260254, "rewards/rejected": -139.29769897460938, "step": 7990 }, { "epoch": 0.46, "grad_norm": 53.39884948730469, "learning_rate": 0.0008480978366035837, "logits/chosen": -11.215818405151367, "logits/rejected": -11.068151473999023, "logps/chosen": -1949.723388671875, "logps/rejected": -1838.3890380859375, "loss": 29.0639, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -136.89024353027344, "rewards/margins": -18.84918975830078, "rewards/rejected": -118.0410385131836, "step": 8000 }, { "epoch": 0.46, "grad_norm": 164.59031677246094, "learning_rate": 0.0008479043306629514, "logits/chosen": -10.836142539978027, "logits/rejected": -10.826489448547363, "logps/chosen": -2140.16455078125, "logps/rejected": -1649.2109375, "loss": 28.8972, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -111.80313873291016, "rewards/margins": -19.1331729888916, "rewards/rejected": -92.66997528076172, "step": 8010 }, { "epoch": 0.46, "grad_norm": 59.950199127197266, "learning_rate": 0.000847710824722319, "logits/chosen": -11.577058792114258, "logits/rejected": -11.527121543884277, "logps/chosen": -1833.629638671875, "logps/rejected": -1716.2152099609375, "loss": 21.106, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -96.60964965820312, "rewards/margins": 3.2478904724121094, "rewards/rejected": -99.85753631591797, "step": 8020 }, { "epoch": 0.46, "grad_norm": 5.623223273687472e-07, "learning_rate": 0.0008475173187816866, "logits/chosen": -14.824604988098145, "logits/rejected": -15.238489151000977, "logps/chosen": -1457.5186767578125, "logps/rejected": -1514.427734375, "loss": 8.2272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -86.37115478515625, "rewards/margins": 8.669894218444824, "rewards/rejected": -95.0410385131836, "step": 8030 }, { "epoch": 0.47, "grad_norm": 246.82176208496094, "learning_rate": 0.0008473238128410542, "logits/chosen": -12.137395858764648, "logits/rejected": -12.136857986450195, "logps/chosen": -2623.232177734375, "logps/rejected": -2244.251220703125, "loss": 34.8879, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -153.53506469726562, "rewards/margins": -27.354883193969727, "rewards/rejected": -126.18017578125, "step": 8040 }, { "epoch": 0.47, "grad_norm": 76.14635467529297, "learning_rate": 0.0008471303069004218, "logits/chosen": -10.005620956420898, "logits/rejected": -9.932570457458496, "logps/chosen": -2421.09423828125, "logps/rejected": -2258.4130859375, "loss": 22.76, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -101.04017639160156, "rewards/margins": -14.626215934753418, "rewards/rejected": -86.4139633178711, "step": 8050 }, { "epoch": 0.47, "grad_norm": 1.905393669178381e-14, "learning_rate": 0.0008469368009597896, "logits/chosen": -9.294036865234375, "logits/rejected": -9.289582252502441, "logps/chosen": -2479.90576171875, "logps/rejected": -2165.013671875, "loss": 17.8846, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.77447509765625, "rewards/margins": -10.655488967895508, "rewards/rejected": -15.118985176086426, "step": 8060 }, { "epoch": 0.47, "grad_norm": 190.00572204589844, "learning_rate": 0.0008467432950191572, "logits/chosen": -11.728143692016602, "logits/rejected": -11.793774604797363, "logps/chosen": -2700.55712890625, "logps/rejected": -2530.964111328125, "loss": 12.1028, "rewards/accuracies": 0.5, "rewards/chosen": -119.81783294677734, "rewards/margins": -8.84725570678711, "rewards/rejected": -110.9705810546875, "step": 8070 }, { "epoch": 0.47, "grad_norm": 151.68380737304688, "learning_rate": 0.0008465497890785248, "logits/chosen": -9.92879581451416, "logits/rejected": -9.654382705688477, "logps/chosen": -2984.638427734375, "logps/rejected": -2481.59423828125, "loss": 46.1502, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -95.94146728515625, "rewards/margins": -30.585430145263672, "rewards/rejected": -65.35603332519531, "step": 8080 }, { "epoch": 0.47, "grad_norm": 2.173075847480402e-18, "learning_rate": 0.0008463562831378924, "logits/chosen": -10.694259643554688, "logits/rejected": -11.15701961517334, "logps/chosen": -2005.349853515625, "logps/rejected": -2210.86083984375, "loss": 8.3382, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -39.57722091674805, "rewards/margins": 21.50415802001953, "rewards/rejected": -61.081382751464844, "step": 8090 }, { "epoch": 0.47, "grad_norm": 4.684049770276033e-07, "learning_rate": 0.00084616277719726, "logits/chosen": -9.759657859802246, "logits/rejected": -9.878532409667969, "logps/chosen": -1894.0201416015625, "logps/rejected": -1974.580078125, "loss": 11.7937, "rewards/accuracies": 0.5, "rewards/chosen": -107.1146240234375, "rewards/margins": -5.541449546813965, "rewards/rejected": -101.57317352294922, "step": 8100 }, { "epoch": 0.47, "grad_norm": 32.46553039550781, "learning_rate": 0.0008459692712566276, "logits/chosen": -9.095251083374023, "logits/rejected": -9.072362899780273, "logps/chosen": -2226.31640625, "logps/rejected": -1768.139404296875, "loss": 13.2156, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -116.78787994384766, "rewards/margins": -9.449020385742188, "rewards/rejected": -107.33885192871094, "step": 8110 }, { "epoch": 0.47, "grad_norm": 52.633968353271484, "learning_rate": 0.0008457757653159953, "logits/chosen": -10.361441612243652, "logits/rejected": -10.132490158081055, "logps/chosen": -1798.9820556640625, "logps/rejected": -1824.8013916015625, "loss": 22.827, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -104.63157653808594, "rewards/margins": -19.318538665771484, "rewards/rejected": -85.31304168701172, "step": 8120 }, { "epoch": 0.47, "grad_norm": 0.018893392756581306, "learning_rate": 0.0008455822593753628, "logits/chosen": -9.904010772705078, "logits/rejected": -9.758195877075195, "logps/chosen": -2011.467041015625, "logps/rejected": -1889.169921875, "loss": 24.9745, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -99.32208251953125, "rewards/margins": -10.256352424621582, "rewards/rejected": -89.06573486328125, "step": 8130 }, { "epoch": 0.47, "grad_norm": 82.18084716796875, "learning_rate": 0.0008453887534347304, "logits/chosen": -10.124241828918457, "logits/rejected": -10.18982982635498, "logps/chosen": -2305.292236328125, "logps/rejected": -2108.49462890625, "loss": 24.6027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.77630615234375, "rewards/margins": -14.239511489868164, "rewards/rejected": -93.53679656982422, "step": 8140 }, { "epoch": 0.47, "grad_norm": 26.987703323364258, "learning_rate": 0.000845195247494098, "logits/chosen": -10.829859733581543, "logits/rejected": -10.65172004699707, "logps/chosen": -2220.873046875, "logps/rejected": -1861.8140869140625, "loss": 29.4934, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.13509368896484, "rewards/margins": -27.271427154541016, "rewards/rejected": -88.86368560791016, "step": 8150 }, { "epoch": 0.47, "grad_norm": 6.463294506072998, "learning_rate": 0.0008450017415534657, "logits/chosen": -8.579316139221191, "logits/rejected": -8.798896789550781, "logps/chosen": -2357.614013671875, "logps/rejected": -1802.0462646484375, "loss": 14.5441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -49.49935531616211, "rewards/margins": 17.250394821166992, "rewards/rejected": -66.74974822998047, "step": 8160 }, { "epoch": 0.47, "grad_norm": 195.1300506591797, "learning_rate": 0.0008448082356128333, "logits/chosen": -12.134222984313965, "logits/rejected": -12.128385543823242, "logps/chosen": -1629.123046875, "logps/rejected": -1617.99951171875, "loss": 15.7629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -131.7943115234375, "rewards/margins": 3.2113075256347656, "rewards/rejected": -135.00559997558594, "step": 8170 }, { "epoch": 0.47, "grad_norm": 55.78606414794922, "learning_rate": 0.000844614729672201, "logits/chosen": -11.9310941696167, "logits/rejected": -11.755595207214355, "logps/chosen": -2169.7158203125, "logps/rejected": -1844.8984375, "loss": 18.4547, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.06243896484375, "rewards/margins": -0.980572521686554, "rewards/rejected": -112.08187103271484, "step": 8180 }, { "epoch": 0.47, "grad_norm": 68.81363677978516, "learning_rate": 0.0008444212237315686, "logits/chosen": -11.805244445800781, "logits/rejected": -11.87159538269043, "logps/chosen": -2424.05224609375, "logps/rejected": -2116.826416015625, "loss": 8.3358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -78.90839385986328, "rewards/margins": 21.778799057006836, "rewards/rejected": -100.68719482421875, "step": 8190 }, { "epoch": 0.47, "grad_norm": 16.27164077758789, "learning_rate": 0.0008442277177909362, "logits/chosen": -10.343965530395508, "logits/rejected": -10.11832332611084, "logps/chosen": -2417.442626953125, "logps/rejected": -2419.223876953125, "loss": 1.0011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -66.60313415527344, "rewards/margins": 10.956657409667969, "rewards/rejected": -77.55979919433594, "step": 8200 }, { "epoch": 0.48, "grad_norm": 5.419545495428772e-14, "learning_rate": 0.0008440342118503038, "logits/chosen": -11.332145690917969, "logits/rejected": -10.860944747924805, "logps/chosen": -2104.8984375, "logps/rejected": -2024.7366943359375, "loss": 26.6495, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -102.62774658203125, "rewards/margins": -22.21798324584961, "rewards/rejected": -80.40975952148438, "step": 8210 }, { "epoch": 0.48, "grad_norm": 55.50010681152344, "learning_rate": 0.0008438407059096714, "logits/chosen": -9.449422836303711, "logits/rejected": -9.287893295288086, "logps/chosen": -2465.54345703125, "logps/rejected": -2418.518310546875, "loss": 5.0309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -63.760894775390625, "rewards/margins": 15.65533447265625, "rewards/rejected": -79.41622924804688, "step": 8220 }, { "epoch": 0.48, "grad_norm": 76.96441650390625, "learning_rate": 0.000843647199969039, "logits/chosen": -13.551446914672852, "logits/rejected": -13.653173446655273, "logps/chosen": -1291.97265625, "logps/rejected": -1319.843017578125, "loss": 5.7743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.82544708251953, "rewards/margins": 6.035165309906006, "rewards/rejected": -97.86061096191406, "step": 8230 }, { "epoch": 0.48, "grad_norm": 1.3468817172679337e-08, "learning_rate": 0.0008434536940284067, "logits/chosen": -12.322429656982422, "logits/rejected": -12.955485343933105, "logps/chosen": -2626.82080078125, "logps/rejected": -2081.065673828125, "loss": 33.9583, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -90.73112487792969, "rewards/margins": -24.18910026550293, "rewards/rejected": -66.54202270507812, "step": 8240 }, { "epoch": 0.48, "grad_norm": 126.15281677246094, "learning_rate": 0.0008432601880877743, "logits/chosen": -15.128094673156738, "logits/rejected": -15.560551643371582, "logps/chosen": -2130.46337890625, "logps/rejected": -2027.793701171875, "loss": 17.9067, "rewards/accuracies": 0.5, "rewards/chosen": -117.32340240478516, "rewards/margins": -9.196980476379395, "rewards/rejected": -108.12642669677734, "step": 8250 }, { "epoch": 0.48, "grad_norm": 197.87710571289062, "learning_rate": 0.0008430666821471419, "logits/chosen": -16.888172149658203, "logits/rejected": -15.646547317504883, "logps/chosen": -2941.6328125, "logps/rejected": -2853.857666015625, "loss": 11.416, "rewards/accuracies": 0.5, "rewards/chosen": -113.99869537353516, "rewards/margins": 1.2827022075653076, "rewards/rejected": -115.2813949584961, "step": 8260 }, { "epoch": 0.48, "grad_norm": 1.267991649456235e-07, "learning_rate": 0.0008428731762065096, "logits/chosen": -21.229656219482422, "logits/rejected": -21.88473892211914, "logps/chosen": -1926.027099609375, "logps/rejected": -2025.5552978515625, "loss": 10.8362, "rewards/accuracies": 0.5, "rewards/chosen": -141.68309020996094, "rewards/margins": 11.948188781738281, "rewards/rejected": -153.6312713623047, "step": 8270 }, { "epoch": 0.48, "grad_norm": 54.55937194824219, "learning_rate": 0.0008426796702658772, "logits/chosen": -16.936878204345703, "logits/rejected": -19.71988868713379, "logps/chosen": -2651.92919921875, "logps/rejected": -1892.8687744140625, "loss": 41.7542, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -178.75897216796875, "rewards/margins": -39.538082122802734, "rewards/rejected": -139.2209014892578, "step": 8280 }, { "epoch": 0.48, "grad_norm": 57.115135192871094, "learning_rate": 0.0008424861643252449, "logits/chosen": -11.812050819396973, "logits/rejected": -12.156229019165039, "logps/chosen": -2221.550537109375, "logps/rejected": -2028.2486572265625, "loss": 8.2338, "rewards/accuracies": 0.5, "rewards/chosen": -54.4315299987793, "rewards/margins": -1.1598904132843018, "rewards/rejected": -53.271629333496094, "step": 8290 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.0008422926583846125, "logits/chosen": -12.164145469665527, "logits/rejected": -12.415181159973145, "logps/chosen": -1927.4564208984375, "logps/rejected": -1937.6865234375, "loss": 18.2688, "rewards/accuracies": 0.5, "rewards/chosen": -118.6424789428711, "rewards/margins": 4.443634033203125, "rewards/rejected": -123.08612060546875, "step": 8300 }, { "epoch": 0.48, "grad_norm": 35.439491271972656, "learning_rate": 0.0008420991524439801, "logits/chosen": -10.769277572631836, "logits/rejected": -11.569595336914062, "logps/chosen": -1897.421875, "logps/rejected": -1528.496337890625, "loss": 32.392, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -66.01297760009766, "rewards/margins": -25.368488311767578, "rewards/rejected": -40.64448165893555, "step": 8310 }, { "epoch": 0.48, "grad_norm": 0.7777907848358154, "learning_rate": 0.0008419056465033477, "logits/chosen": -10.965091705322266, "logits/rejected": -10.89487075805664, "logps/chosen": -2059.223876953125, "logps/rejected": -1599.799560546875, "loss": 20.4123, "rewards/accuracies": 0.5, "rewards/chosen": -110.80357360839844, "rewards/margins": -9.070321083068848, "rewards/rejected": -101.73324584960938, "step": 8320 }, { "epoch": 0.48, "grad_norm": 2.186071115795052e-11, "learning_rate": 0.0008417121405627153, "logits/chosen": -10.098885536193848, "logits/rejected": -10.481282234191895, "logps/chosen": -2343.927734375, "logps/rejected": -1804.0845947265625, "loss": 22.7744, "rewards/accuracies": 0.5, "rewards/chosen": -110.09410095214844, "rewards/margins": -7.3849897384643555, "rewards/rejected": -102.70912170410156, "step": 8330 }, { "epoch": 0.48, "grad_norm": 0.005003906320780516, "learning_rate": 0.0008415186346220829, "logits/chosen": -10.790088653564453, "logits/rejected": -9.892715454101562, "logps/chosen": -1832.3919677734375, "logps/rejected": -2081.65380859375, "loss": 17.6165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.5755615234375, "rewards/margins": -5.13577127456665, "rewards/rejected": -98.4397964477539, "step": 8340 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.0008413251286814505, "logits/chosen": -7.9017815589904785, "logits/rejected": -8.01932430267334, "logps/chosen": -2764.565185546875, "logps/rejected": -2847.2607421875, "loss": 8.6129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.02964782714844, "rewards/margins": 12.192486763000488, "rewards/rejected": -138.22213745117188, "step": 8350 }, { "epoch": 0.48, "grad_norm": 114.2970962524414, "learning_rate": 0.0008411316227408181, "logits/chosen": -9.168791770935059, "logits/rejected": -10.37912368774414, "logps/chosen": -2476.1328125, "logps/rejected": -2092.85400390625, "loss": 15.2328, "rewards/accuracies": 0.5, "rewards/chosen": -103.0724105834961, "rewards/margins": -6.0121965408325195, "rewards/rejected": -97.06021881103516, "step": 8360 }, { "epoch": 0.48, "grad_norm": 2.3430538566071362e-15, "learning_rate": 0.0008409381168001858, "logits/chosen": -11.775053977966309, "logits/rejected": -18.082279205322266, "logps/chosen": -2466.6826171875, "logps/rejected": -1647.877685546875, "loss": 46.9403, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.71279907226562, "rewards/margins": -39.2866325378418, "rewards/rejected": -114.4261703491211, "step": 8370 }, { "epoch": 0.49, "grad_norm": 201.15440368652344, "learning_rate": 0.0008407446108595534, "logits/chosen": -8.81181526184082, "logits/rejected": -10.129087448120117, "logps/chosen": -3067.069091796875, "logps/rejected": -2593.327880859375, "loss": 22.5345, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -44.485939025878906, "rewards/margins": -13.438130378723145, "rewards/rejected": -31.047815322875977, "step": 8380 }, { "epoch": 0.49, "grad_norm": 0.19978487491607666, "learning_rate": 0.000840551104918921, "logits/chosen": -10.145123481750488, "logits/rejected": -10.58099365234375, "logps/chosen": -2211.23486328125, "logps/rejected": -2327.75732421875, "loss": 28.137, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -118.717041015625, "rewards/margins": -24.52126693725586, "rewards/rejected": -94.19576263427734, "step": 8390 }, { "epoch": 0.49, "grad_norm": 1055.5909423828125, "learning_rate": 0.0008403575989782886, "logits/chosen": -10.658533096313477, "logits/rejected": -10.360422134399414, "logps/chosen": -2028.1331787109375, "logps/rejected": -1695.206787109375, "loss": 35.2796, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -136.19993591308594, "rewards/margins": -33.89434051513672, "rewards/rejected": -102.30559539794922, "step": 8400 }, { "epoch": 0.49, "grad_norm": 84.32150268554688, "learning_rate": 0.0008401640930376563, "logits/chosen": -9.620615005493164, "logits/rejected": -9.657184600830078, "logps/chosen": -1758.3499755859375, "logps/rejected": -1372.2476806640625, "loss": 15.4027, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -72.2677993774414, "rewards/margins": -8.203953742980957, "rewards/rejected": -64.06385803222656, "step": 8410 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.0008399705870970239, "logits/chosen": -12.731863021850586, "logits/rejected": -13.029154777526855, "logps/chosen": -1712.0286865234375, "logps/rejected": -1446.7193603515625, "loss": 24.0072, "rewards/accuracies": 0.5, "rewards/chosen": -75.96727752685547, "rewards/margins": -4.805319309234619, "rewards/rejected": -71.1619644165039, "step": 8420 }, { "epoch": 0.49, "grad_norm": 1.425751267806856e-14, "learning_rate": 0.0008397770811563915, "logits/chosen": -10.665189743041992, "logits/rejected": -10.700213432312012, "logps/chosen": -2491.06298828125, "logps/rejected": -2123.582763671875, "loss": 24.8058, "rewards/accuracies": 0.5, "rewards/chosen": -89.42921447753906, "rewards/margins": -14.451799392700195, "rewards/rejected": -74.9774169921875, "step": 8430 }, { "epoch": 0.49, "grad_norm": 45.98768997192383, "learning_rate": 0.0008395835752157591, "logits/chosen": -10.27708625793457, "logits/rejected": -10.341035842895508, "logps/chosen": -2374.47705078125, "logps/rejected": -2093.74072265625, "loss": 4.1201, "rewards/accuracies": 0.5, "rewards/chosen": 8.599682807922363, "rewards/margins": 10.373054504394531, "rewards/rejected": -1.7733714580535889, "step": 8440 }, { "epoch": 0.49, "grad_norm": 31.996688842773438, "learning_rate": 0.0008393900692751267, "logits/chosen": -11.16533374786377, "logits/rejected": -11.771347045898438, "logps/chosen": -2291.773193359375, "logps/rejected": -2206.63818359375, "loss": 32.7245, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -108.71529388427734, "rewards/margins": -29.213146209716797, "rewards/rejected": -79.50215148925781, "step": 8450 }, { "epoch": 0.49, "grad_norm": 83.6252670288086, "learning_rate": 0.0008391965633344943, "logits/chosen": -11.619424819946289, "logits/rejected": -12.637880325317383, "logps/chosen": -2434.246826171875, "logps/rejected": -2015.918701171875, "loss": 27.0406, "rewards/accuracies": 0.5, "rewards/chosen": -99.0856704711914, "rewards/margins": -15.525548934936523, "rewards/rejected": -83.56011962890625, "step": 8460 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.000839003057393862, "logits/chosen": -9.304041862487793, "logits/rejected": -9.261514663696289, "logps/chosen": -2246.88037109375, "logps/rejected": -2311.43408203125, "loss": 18.0344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -58.7180061340332, "rewards/margins": 9.49821949005127, "rewards/rejected": -68.21622467041016, "step": 8470 }, { "epoch": 0.49, "grad_norm": 110.33686828613281, "learning_rate": 0.0008388095514532297, "logits/chosen": -8.931520462036133, "logits/rejected": -9.540136337280273, "logps/chosen": -2228.6962890625, "logps/rejected": -1769.8580322265625, "loss": 32.7996, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -84.82833099365234, "rewards/margins": -23.85268783569336, "rewards/rejected": -60.97563552856445, "step": 8480 }, { "epoch": 0.49, "grad_norm": 57.962772369384766, "learning_rate": 0.0008386160455125973, "logits/chosen": -8.721284866333008, "logits/rejected": -8.851568222045898, "logps/chosen": -1987.2532958984375, "logps/rejected": -1868.7962646484375, "loss": 21.7811, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -90.76643371582031, "rewards/margins": -9.359211921691895, "rewards/rejected": -81.40721893310547, "step": 8490 }, { "epoch": 0.49, "grad_norm": 32.5440788269043, "learning_rate": 0.0008384225395719649, "logits/chosen": -8.457216262817383, "logits/rejected": -8.443089485168457, "logps/chosen": -2129.139892578125, "logps/rejected": -2412.09033203125, "loss": 2.678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -79.34676361083984, "rewards/margins": 17.607746124267578, "rewards/rejected": -96.95450592041016, "step": 8500 }, { "epoch": 0.49, "grad_norm": 164.6028594970703, "learning_rate": 0.0008382290336313325, "logits/chosen": -10.897418975830078, "logits/rejected": -11.076875686645508, "logps/chosen": -2080.12451171875, "logps/rejected": -1394.989501953125, "loss": 47.7541, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -143.5867156982422, "rewards/margins": -42.047969818115234, "rewards/rejected": -101.53874206542969, "step": 8510 }, { "epoch": 0.49, "grad_norm": 63.67047882080078, "learning_rate": 0.0008380355276907002, "logits/chosen": -12.555000305175781, "logits/rejected": -12.725152969360352, "logps/chosen": -1798.983642578125, "logps/rejected": -1658.7093505859375, "loss": 18.5671, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -83.82731628417969, "rewards/margins": -11.034858703613281, "rewards/rejected": -72.79244232177734, "step": 8520 }, { "epoch": 0.49, "grad_norm": 167.66360473632812, "learning_rate": 0.0008378420217500678, "logits/chosen": -15.735153198242188, "logits/rejected": -16.51795768737793, "logps/chosen": -2280.107666015625, "logps/rejected": -2113.99365234375, "loss": 22.6399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -83.37484741210938, "rewards/margins": -4.236047744750977, "rewards/rejected": -79.13878631591797, "step": 8530 }, { "epoch": 0.49, "grad_norm": 211.40501403808594, "learning_rate": 0.0008376485158094354, "logits/chosen": -16.773576736450195, "logits/rejected": -17.238080978393555, "logps/chosen": -2985.765380859375, "logps/rejected": -2402.73193359375, "loss": 35.0874, "rewards/accuracies": 0.5, "rewards/chosen": -132.1049346923828, "rewards/margins": -23.486400604248047, "rewards/rejected": -108.6185302734375, "step": 8540 }, { "epoch": 0.49, "grad_norm": 0.012498087249696255, "learning_rate": 0.000837455009868803, "logits/chosen": -14.4105806350708, "logits/rejected": -14.903825759887695, "logps/chosen": -2556.722412109375, "logps/rejected": -2323.173095703125, "loss": 15.879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -78.79692840576172, "rewards/margins": -11.71911907196045, "rewards/rejected": -67.07780456542969, "step": 8550 }, { "epoch": 0.5, "grad_norm": 100.24597930908203, "learning_rate": 0.0008372615039281706, "logits/chosen": -13.85369873046875, "logits/rejected": -15.272079467773438, "logps/chosen": -1970.7259521484375, "logps/rejected": -2049.757080078125, "loss": 23.096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.181640625, "rewards/margins": -17.544612884521484, "rewards/rejected": -111.63703918457031, "step": 8560 }, { "epoch": 0.5, "grad_norm": 92.67337036132812, "learning_rate": 0.0008370679979875381, "logits/chosen": -14.468609809875488, "logits/rejected": -14.767440795898438, "logps/chosen": -2592.253173828125, "logps/rejected": -2426.17236328125, "loss": 23.0588, "rewards/accuracies": 0.5, "rewards/chosen": -147.9979248046875, "rewards/margins": -9.086984634399414, "rewards/rejected": -138.9109344482422, "step": 8570 }, { "epoch": 0.5, "grad_norm": 86.2777328491211, "learning_rate": 0.0008368744920469059, "logits/chosen": -9.923972129821777, "logits/rejected": -11.06432819366455, "logps/chosen": -2555.32763671875, "logps/rejected": -2435.97802734375, "loss": 14.2719, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -122.39833068847656, "rewards/margins": -9.830674171447754, "rewards/rejected": -112.5676498413086, "step": 8580 }, { "epoch": 0.5, "grad_norm": 129.52195739746094, "learning_rate": 0.0008366809861062735, "logits/chosen": -6.764063835144043, "logits/rejected": -7.365988254547119, "logps/chosen": -2714.314697265625, "logps/rejected": -2654.322509765625, "loss": 13.7158, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -82.22480773925781, "rewards/margins": -0.4986690580844879, "rewards/rejected": -81.72612762451172, "step": 8590 }, { "epoch": 0.5, "grad_norm": 119.84036254882812, "learning_rate": 0.0008364874801656411, "logits/chosen": -10.897699356079102, "logits/rejected": -13.731022834777832, "logps/chosen": -2369.996826171875, "logps/rejected": -1790.9368896484375, "loss": 36.3701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -91.71202087402344, "rewards/margins": -26.904132843017578, "rewards/rejected": -64.8078842163086, "step": 8600 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.0008362939742250087, "logits/chosen": -15.440678596496582, "logits/rejected": -15.865567207336426, "logps/chosen": -1618.2735595703125, "logps/rejected": -1651.752197265625, "loss": 9.9593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -103.08016204833984, "rewards/margins": 25.281734466552734, "rewards/rejected": -128.3618927001953, "step": 8610 }, { "epoch": 0.5, "grad_norm": 42.67727279663086, "learning_rate": 0.0008361004682843763, "logits/chosen": -12.985132217407227, "logits/rejected": -20.85915756225586, "logps/chosen": -1911.113037109375, "logps/rejected": -1385.902099609375, "loss": 43.752, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.79367065429688, "rewards/margins": -41.511680603027344, "rewards/rejected": -115.28199768066406, "step": 8620 }, { "epoch": 0.5, "grad_norm": 17.301694869995117, "learning_rate": 0.000835906962343744, "logits/chosen": -11.002397537231445, "logits/rejected": -12.65334701538086, "logps/chosen": -2149.40185546875, "logps/rejected": -1765.650146484375, "loss": 8.9464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -56.6067008972168, "rewards/margins": 12.493922233581543, "rewards/rejected": -69.10062408447266, "step": 8630 }, { "epoch": 0.5, "grad_norm": 134.56903076171875, "learning_rate": 0.0008357134564031116, "logits/chosen": -14.692767143249512, "logits/rejected": -15.019001960754395, "logps/chosen": -2010.163330078125, "logps/rejected": -2031.676513671875, "loss": 18.6365, "rewards/accuracies": 0.5, "rewards/chosen": -129.99905395507812, "rewards/margins": 3.8068275451660156, "rewards/rejected": -133.80589294433594, "step": 8640 }, { "epoch": 0.5, "grad_norm": 71.99047088623047, "learning_rate": 0.0008355199504624792, "logits/chosen": -12.200343132019043, "logits/rejected": -14.417745590209961, "logps/chosen": -2332.827880859375, "logps/rejected": -1960.860595703125, "loss": 28.1239, "rewards/accuracies": 0.5, "rewards/chosen": -82.43843078613281, "rewards/margins": -7.918875694274902, "rewards/rejected": -74.51956176757812, "step": 8650 }, { "epoch": 0.5, "grad_norm": 96.91634368896484, "learning_rate": 0.0008353264445218468, "logits/chosen": -13.890901565551758, "logits/rejected": -13.073022842407227, "logps/chosen": -2257.52001953125, "logps/rejected": -2032.2825927734375, "loss": 26.2599, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -86.43733215332031, "rewards/margins": -16.157968521118164, "rewards/rejected": -70.27936553955078, "step": 8660 }, { "epoch": 0.5, "grad_norm": 3.242233859168664e-08, "learning_rate": 0.0008351329385812144, "logits/chosen": -10.407221794128418, "logits/rejected": -13.916603088378906, "logps/chosen": -2172.440185546875, "logps/rejected": -2073.61279296875, "loss": 13.2797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -79.78577423095703, "rewards/margins": 0.8656761050224304, "rewards/rejected": -80.65145111083984, "step": 8670 }, { "epoch": 0.5, "grad_norm": 75.07627868652344, "learning_rate": 0.000834939432640582, "logits/chosen": -10.35789966583252, "logits/rejected": -11.150481224060059, "logps/chosen": -2310.406005859375, "logps/rejected": -2009.203125, "loss": 30.2972, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -117.91844177246094, "rewards/margins": -23.699174880981445, "rewards/rejected": -94.21924591064453, "step": 8680 }, { "epoch": 0.5, "grad_norm": 3.675381888953604e-20, "learning_rate": 0.0008347459266999498, "logits/chosen": -12.101452827453613, "logits/rejected": -12.161050796508789, "logps/chosen": -2054.143798828125, "logps/rejected": -1774.36328125, "loss": 26.2887, "rewards/accuracies": 0.5, "rewards/chosen": -95.43323516845703, "rewards/margins": -3.6124510765075684, "rewards/rejected": -91.8207778930664, "step": 8690 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.0008345524207593174, "logits/chosen": -10.472663879394531, "logits/rejected": -10.80270767211914, "logps/chosen": -1861.579833984375, "logps/rejected": -1462.846923828125, "loss": 17.6142, "rewards/accuracies": 0.5, "rewards/chosen": -68.99253845214844, "rewards/margins": -5.356464385986328, "rewards/rejected": -63.636077880859375, "step": 8700 }, { "epoch": 0.5, "grad_norm": 42.144676208496094, "learning_rate": 0.000834358914818685, "logits/chosen": -9.243796348571777, "logits/rejected": -9.491683959960938, "logps/chosen": -2350.65576171875, "logps/rejected": -2039.7034912109375, "loss": 14.2977, "rewards/accuracies": 0.5, "rewards/chosen": -91.54059600830078, "rewards/margins": 3.205195188522339, "rewards/rejected": -94.74578857421875, "step": 8710 }, { "epoch": 0.5, "grad_norm": 54.634986877441406, "learning_rate": 0.0008341654088780526, "logits/chosen": -9.840879440307617, "logits/rejected": -10.626132011413574, "logps/chosen": -2617.442626953125, "logps/rejected": -2023.320068359375, "loss": 25.4256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -81.29325103759766, "rewards/margins": -5.9141387939453125, "rewards/rejected": -75.37910461425781, "step": 8720 }, { "epoch": 0.51, "grad_norm": 80.1915054321289, "learning_rate": 0.0008339719029374202, "logits/chosen": -11.385339736938477, "logits/rejected": -12.413383483886719, "logps/chosen": -2302.2890625, "logps/rejected": -2000.4671630859375, "loss": 24.4953, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -124.73834228515625, "rewards/margins": -23.632816314697266, "rewards/rejected": -101.10552215576172, "step": 8730 }, { "epoch": 0.51, "grad_norm": 0.14379402995109558, "learning_rate": 0.0008337783969967878, "logits/chosen": -11.483896255493164, "logits/rejected": -12.27702808380127, "logps/chosen": -1997.017822265625, "logps/rejected": -1780.813720703125, "loss": 17.5402, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -82.0831298828125, "rewards/margins": -15.489590644836426, "rewards/rejected": -66.59353637695312, "step": 8740 }, { "epoch": 0.51, "grad_norm": 16.86178970336914, "learning_rate": 0.0008335848910561555, "logits/chosen": -12.376623153686523, "logits/rejected": -12.641206741333008, "logps/chosen": -2103.08154296875, "logps/rejected": -1641.4306640625, "loss": 3.8319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -51.46773147583008, "rewards/margins": 12.743520736694336, "rewards/rejected": -64.21125793457031, "step": 8750 }, { "epoch": 0.51, "grad_norm": 60.55537796020508, "learning_rate": 0.0008333913851155231, "logits/chosen": -11.268869400024414, "logits/rejected": -12.432171821594238, "logps/chosen": -2364.931396484375, "logps/rejected": -1695.2952880859375, "loss": 27.9313, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -155.04904174804688, "rewards/margins": -16.26106071472168, "rewards/rejected": -138.78799438476562, "step": 8760 }, { "epoch": 0.51, "grad_norm": 0.002626590197905898, "learning_rate": 0.0008331978791748907, "logits/chosen": -9.881933212280273, "logits/rejected": -10.734254837036133, "logps/chosen": -1986.465087890625, "logps/rejected": -1774.8648681640625, "loss": 22.8902, "rewards/accuracies": 0.5, "rewards/chosen": -40.22833251953125, "rewards/margins": -8.267722129821777, "rewards/rejected": -31.960607528686523, "step": 8770 }, { "epoch": 0.51, "grad_norm": 104.79503631591797, "learning_rate": 0.0008330043732342583, "logits/chosen": -10.665251731872559, "logits/rejected": -11.193120956420898, "logps/chosen": -2416.0302734375, "logps/rejected": -2032.2711181640625, "loss": 17.9628, "rewards/accuracies": 0.5, "rewards/chosen": -79.06703186035156, "rewards/margins": -5.622580528259277, "rewards/rejected": -73.44445037841797, "step": 8780 }, { "epoch": 0.51, "grad_norm": 6.31557035446167, "learning_rate": 0.0008328108672936259, "logits/chosen": -10.988439559936523, "logits/rejected": -10.772183418273926, "logps/chosen": -2577.70947265625, "logps/rejected": -2393.514404296875, "loss": 17.5325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -111.52632904052734, "rewards/margins": -8.236982345581055, "rewards/rejected": -103.28934478759766, "step": 8790 }, { "epoch": 0.51, "grad_norm": 118.4758529663086, "learning_rate": 0.0008326173613529935, "logits/chosen": -10.802563667297363, "logits/rejected": -12.83941650390625, "logps/chosen": -2825.70458984375, "logps/rejected": -2618.272705078125, "loss": 20.6498, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -44.185752868652344, "rewards/margins": -12.935188293457031, "rewards/rejected": -31.250560760498047, "step": 8800 }, { "epoch": 0.51, "grad_norm": 102.6593017578125, "learning_rate": 0.0008324238554123612, "logits/chosen": -13.652326583862305, "logits/rejected": -15.926362991333008, "logps/chosen": -2585.658447265625, "logps/rejected": -2340.5068359375, "loss": 21.5338, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -71.1918716430664, "rewards/margins": -9.319119453430176, "rewards/rejected": -61.87274932861328, "step": 8810 }, { "epoch": 0.51, "grad_norm": 1.2688340378375074e-09, "learning_rate": 0.0008322303494717288, "logits/chosen": -14.893896102905273, "logits/rejected": -13.659103393554688, "logps/chosen": -2258.7568359375, "logps/rejected": -2248.580810546875, "loss": 7.4543, "rewards/accuracies": 0.5, "rewards/chosen": -77.45433807373047, "rewards/margins": 4.771981239318848, "rewards/rejected": -82.226318359375, "step": 8820 }, { "epoch": 0.51, "grad_norm": 0.003566168248653412, "learning_rate": 0.0008320368435310964, "logits/chosen": -18.413280487060547, "logits/rejected": -16.67293357849121, "logps/chosen": -2208.911376953125, "logps/rejected": -2077.30419921875, "loss": 44.9053, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -175.25689697265625, "rewards/margins": -35.69640350341797, "rewards/rejected": -139.56048583984375, "step": 8830 }, { "epoch": 0.51, "grad_norm": 68.14888763427734, "learning_rate": 0.000831843337590464, "logits/chosen": -13.053186416625977, "logits/rejected": -12.305240631103516, "logps/chosen": -2321.607177734375, "logps/rejected": -2360.88232421875, "loss": 8.0633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -124.70591735839844, "rewards/margins": 12.251199722290039, "rewards/rejected": -136.95713806152344, "step": 8840 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.0008316498316498316, "logits/chosen": -10.499658584594727, "logits/rejected": -11.893592834472656, "logps/chosen": -2336.58935546875, "logps/rejected": -1786.211669921875, "loss": 16.1625, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.6756820678711, "rewards/margins": -4.036545276641846, "rewards/rejected": -102.63914489746094, "step": 8850 }, { "epoch": 0.51, "grad_norm": 116.74126434326172, "learning_rate": 0.0008314563257091993, "logits/chosen": -13.93596076965332, "logits/rejected": -14.626673698425293, "logps/chosen": -1986.6507568359375, "logps/rejected": -1704.6920166015625, "loss": 22.3458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.7268524169922, "rewards/margins": -9.050116539001465, "rewards/rejected": -123.6767349243164, "step": 8860 }, { "epoch": 0.51, "grad_norm": 178.73480224609375, "learning_rate": 0.0008312628197685669, "logits/chosen": -10.267219543457031, "logits/rejected": -10.45649242401123, "logps/chosen": -2096.4560546875, "logps/rejected": -2148.703125, "loss": 23.0441, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.819408416748047, "rewards/margins": -14.628385543823242, "rewards/rejected": -11.191020011901855, "step": 8870 }, { "epoch": 0.51, "grad_norm": 405.42755126953125, "learning_rate": 0.0008310693138279345, "logits/chosen": -12.011720657348633, "logits/rejected": -17.788860321044922, "logps/chosen": -1856.583740234375, "logps/rejected": -1554.4124755859375, "loss": 45.0818, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -148.66409301757812, "rewards/margins": -42.536590576171875, "rewards/rejected": -106.12748718261719, "step": 8880 }, { "epoch": 0.51, "grad_norm": 272.3133850097656, "learning_rate": 0.0008308758078873021, "logits/chosen": -8.325084686279297, "logits/rejected": -8.589461326599121, "logps/chosen": -1899.7191162109375, "logps/rejected": -1774.4840087890625, "loss": 21.599, "rewards/accuracies": 0.5, "rewards/chosen": -97.38414001464844, "rewards/margins": 4.519256591796875, "rewards/rejected": -101.90339660644531, "step": 8890 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.0008306823019466698, "logits/chosen": -8.654577255249023, "logits/rejected": -11.6696138381958, "logps/chosen": -2156.40966796875, "logps/rejected": -2068.71923828125, "loss": 13.3938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.38392639160156, "rewards/margins": 11.205782890319824, "rewards/rejected": -80.58970642089844, "step": 8900 }, { "epoch": 0.52, "grad_norm": 3.874604814768645e-08, "learning_rate": 0.0008304887960060374, "logits/chosen": -13.424728393554688, "logits/rejected": -14.594533920288086, "logps/chosen": -1906.845458984375, "logps/rejected": -1459.196044921875, "loss": 40.587, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -153.34503173828125, "rewards/margins": -37.111541748046875, "rewards/rejected": -116.2334976196289, "step": 8910 }, { "epoch": 0.52, "grad_norm": 33.25035858154297, "learning_rate": 0.0008302952900654051, "logits/chosen": -15.123936653137207, "logits/rejected": -16.481746673583984, "logps/chosen": -1933.386962890625, "logps/rejected": -1589.2315673828125, "loss": 17.5503, "rewards/accuracies": 0.5, "rewards/chosen": -119.31233215332031, "rewards/margins": -6.360229969024658, "rewards/rejected": -112.95208740234375, "step": 8920 }, { "epoch": 0.52, "grad_norm": 25.451852798461914, "learning_rate": 0.0008301017841247727, "logits/chosen": -12.194811820983887, "logits/rejected": -12.834663391113281, "logps/chosen": -2508.87548828125, "logps/rejected": -1788.4827880859375, "loss": 18.0909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -77.64529418945312, "rewards/margins": 9.94716739654541, "rewards/rejected": -87.59245300292969, "step": 8930 }, { "epoch": 0.52, "grad_norm": 102.91036987304688, "learning_rate": 0.0008299082781841403, "logits/chosen": -9.542208671569824, "logits/rejected": -11.248008728027344, "logps/chosen": -2263.349609375, "logps/rejected": -2191.68115234375, "loss": 20.0857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -78.76920318603516, "rewards/margins": -4.223578453063965, "rewards/rejected": -74.54562377929688, "step": 8940 }, { "epoch": 0.52, "grad_norm": 0.898160457611084, "learning_rate": 0.0008297147722435079, "logits/chosen": -10.316808700561523, "logits/rejected": -12.855987548828125, "logps/chosen": -2406.214111328125, "logps/rejected": -2196.955810546875, "loss": 24.1377, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -159.3536376953125, "rewards/margins": -9.976371765136719, "rewards/rejected": -149.37725830078125, "step": 8950 }, { "epoch": 0.52, "grad_norm": 155.93467712402344, "learning_rate": 0.0008295212663028755, "logits/chosen": -9.884881019592285, "logits/rejected": -10.904083251953125, "logps/chosen": -2101.21484375, "logps/rejected": -1766.4202880859375, "loss": 19.0848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.28138732910156, "rewards/margins": -12.643484115600586, "rewards/rejected": -126.63790130615234, "step": 8960 }, { "epoch": 0.52, "grad_norm": 455.29205322265625, "learning_rate": 0.0008293277603622431, "logits/chosen": -9.785070419311523, "logits/rejected": -10.942636489868164, "logps/chosen": -2081.48486328125, "logps/rejected": -2075.90966796875, "loss": 15.2713, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -51.15850830078125, "rewards/margins": -6.713853359222412, "rewards/rejected": -44.44465255737305, "step": 8970 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.0008291342544216108, "logits/chosen": -12.282742500305176, "logits/rejected": -12.889829635620117, "logps/chosen": -2230.459716796875, "logps/rejected": -2031.380615234375, "loss": 21.3681, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.30581665039062, "rewards/margins": -13.069925308227539, "rewards/rejected": -146.2358856201172, "step": 8980 }, { "epoch": 0.52, "grad_norm": 1.1754489949610153e-21, "learning_rate": 0.0008289407484809784, "logits/chosen": -13.323480606079102, "logits/rejected": -13.4592924118042, "logps/chosen": -2075.1123046875, "logps/rejected": -1695.7738037109375, "loss": 15.3045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.41594696044922, "rewards/margins": 1.5917861461639404, "rewards/rejected": -71.00773620605469, "step": 8990 }, { "epoch": 0.52, "grad_norm": 63.40122604370117, "learning_rate": 0.0008287472425403461, "logits/chosen": -12.251176834106445, "logits/rejected": -11.198819160461426, "logps/chosen": -2694.69140625, "logps/rejected": -2295.784423828125, "loss": 10.5543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -71.49627685546875, "rewards/margins": 10.63618278503418, "rewards/rejected": -82.13246154785156, "step": 9000 }, { "epoch": 0.52, "grad_norm": 4.397655061825653e-19, "learning_rate": 0.0008285537365997136, "logits/chosen": -11.558141708374023, "logits/rejected": -12.617240905761719, "logps/chosen": -2562.150390625, "logps/rejected": -2328.61962890625, "loss": 17.6277, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -141.85691833496094, "rewards/margins": -10.074060440063477, "rewards/rejected": -131.78285217285156, "step": 9010 }, { "epoch": 0.52, "grad_norm": 87.57040405273438, "learning_rate": 0.0008283602306590812, "logits/chosen": -13.631719589233398, "logits/rejected": -17.063358306884766, "logps/chosen": -1972.6988525390625, "logps/rejected": -1529.208740234375, "loss": 41.386, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -170.14646911621094, "rewards/margins": -37.612281799316406, "rewards/rejected": -132.53421020507812, "step": 9020 }, { "epoch": 0.52, "grad_norm": 63.806678771972656, "learning_rate": 0.0008281667247184489, "logits/chosen": -9.998491287231445, "logits/rejected": -10.632314682006836, "logps/chosen": -2151.078125, "logps/rejected": -2123.09375, "loss": 15.0771, "rewards/accuracies": 0.5, "rewards/chosen": -54.071800231933594, "rewards/margins": -3.135472059249878, "rewards/rejected": -50.93633270263672, "step": 9030 }, { "epoch": 0.52, "grad_norm": 6.277541637420654, "learning_rate": 0.0008279732187778165, "logits/chosen": -10.74350357055664, "logits/rejected": -12.352733612060547, "logps/chosen": -1706.4498291015625, "logps/rejected": -1786.473876953125, "loss": 12.5146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1922500133514404, "rewards/margins": 1.710906982421875, "rewards/rejected": -3.9031569957733154, "step": 9040 }, { "epoch": 0.52, "grad_norm": 1335.204345703125, "learning_rate": 0.0008277797128371841, "logits/chosen": -10.516438484191895, "logits/rejected": -11.214517593383789, "logps/chosen": -2288.966552734375, "logps/rejected": -2689.185546875, "loss": 28.8084, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -76.7777099609375, "rewards/margins": 12.013806343078613, "rewards/rejected": -88.79150390625, "step": 9050 }, { "epoch": 0.52, "grad_norm": 12.139188766479492, "learning_rate": 0.0008275862068965517, "logits/chosen": -8.904858589172363, "logits/rejected": -8.910046577453613, "logps/chosen": -3441.232421875, "logps/rejected": -2276.962158203125, "loss": 80.8171, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -218.2306671142578, "rewards/margins": -70.34574890136719, "rewards/rejected": -147.88491821289062, "step": 9060 }, { "epoch": 0.53, "grad_norm": 7.095157930148491e-15, "learning_rate": 0.0008273927009559193, "logits/chosen": -10.384624481201172, "logits/rejected": -11.571101188659668, "logps/chosen": -2096.615234375, "logps/rejected": -2100.17626953125, "loss": 9.4261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -48.984527587890625, "rewards/margins": 6.778067111968994, "rewards/rejected": -55.7625846862793, "step": 9070 }, { "epoch": 0.53, "grad_norm": 58.066871643066406, "learning_rate": 0.0008271991950152869, "logits/chosen": -11.003621101379395, "logits/rejected": -11.36296272277832, "logps/chosen": -2195.460693359375, "logps/rejected": -2094.3515625, "loss": 13.7208, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -105.8720932006836, "rewards/margins": -2.3301568031311035, "rewards/rejected": -103.54194641113281, "step": 9080 }, { "epoch": 0.53, "grad_norm": 38.515113830566406, "learning_rate": 0.0008270056890746546, "logits/chosen": -10.850716590881348, "logits/rejected": -10.968320846557617, "logps/chosen": -2408.284912109375, "logps/rejected": -2016.962890625, "loss": 11.8649, "rewards/accuracies": 0.5, "rewards/chosen": -127.79984283447266, "rewards/margins": -5.133469581604004, "rewards/rejected": -122.66636657714844, "step": 9090 }, { "epoch": 0.53, "grad_norm": 34.334930419921875, "learning_rate": 0.0008268121831340222, "logits/chosen": -8.835197448730469, "logits/rejected": -9.236567497253418, "logps/chosen": -2241.03369140625, "logps/rejected": -1784.2073974609375, "loss": 29.1697, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.48197555541992, "rewards/margins": -16.284019470214844, "rewards/rejected": -33.197959899902344, "step": 9100 }, { "epoch": 0.53, "grad_norm": 37.45695114135742, "learning_rate": 0.0008266186771933899, "logits/chosen": -10.145214080810547, "logits/rejected": -10.225927352905273, "logps/chosen": -1889.9547119140625, "logps/rejected": -1545.9683837890625, "loss": 10.5994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -56.85972213745117, "rewards/margins": 3.1277832984924316, "rewards/rejected": -59.98749542236328, "step": 9110 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.0008264251712527575, "logits/chosen": -11.845251083374023, "logits/rejected": -12.03662395477295, "logps/chosen": -2002.4183349609375, "logps/rejected": -1817.880615234375, "loss": 9.4699, "rewards/accuracies": 0.5, "rewards/chosen": -101.03169250488281, "rewards/margins": 15.874505996704102, "rewards/rejected": -116.90620422363281, "step": 9120 }, { "epoch": 0.53, "grad_norm": 93.3855972290039, "learning_rate": 0.0008262316653121251, "logits/chosen": -11.097436904907227, "logits/rejected": -11.18120288848877, "logps/chosen": -2187.70263671875, "logps/rejected": -1942.685546875, "loss": 42.4407, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -95.54930877685547, "rewards/margins": -41.91950225830078, "rewards/rejected": -53.62981033325195, "step": 9130 }, { "epoch": 0.53, "grad_norm": 289.0438537597656, "learning_rate": 0.0008260381593714927, "logits/chosen": -12.097888946533203, "logits/rejected": -12.13855266571045, "logps/chosen": -1994.136474609375, "logps/rejected": -1954.688720703125, "loss": 21.833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.7875518798828, "rewards/margins": -4.068692207336426, "rewards/rejected": -125.7188720703125, "step": 9140 }, { "epoch": 0.53, "grad_norm": 931.577880859375, "learning_rate": 0.0008258446534308604, "logits/chosen": -13.139033317565918, "logits/rejected": -14.836530685424805, "logps/chosen": -1851.4820556640625, "logps/rejected": -1842.4888916015625, "loss": 37.921, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -103.44264221191406, "rewards/margins": -23.802227020263672, "rewards/rejected": -79.6404037475586, "step": 9150 }, { "epoch": 0.53, "grad_norm": 32.64853286743164, "learning_rate": 0.000825651147490228, "logits/chosen": -13.355486869812012, "logits/rejected": -14.517115592956543, "logps/chosen": -1790.807861328125, "logps/rejected": -1514.279296875, "loss": 16.4409, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.99940490722656, "rewards/margins": -3.0768818855285645, "rewards/rejected": -110.92252349853516, "step": 9160 }, { "epoch": 0.53, "grad_norm": 9.904084925088661e-23, "learning_rate": 0.0008254576415495956, "logits/chosen": -11.468954086303711, "logits/rejected": -12.835226058959961, "logps/chosen": -2627.63037109375, "logps/rejected": -1857.3492431640625, "loss": 29.3484, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.5005874633789, "rewards/margins": -11.342364311218262, "rewards/rejected": -114.15821838378906, "step": 9170 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.0008252641356089632, "logits/chosen": -14.438798904418945, "logits/rejected": -14.942463874816895, "logps/chosen": -2540.48779296875, "logps/rejected": -2337.59228515625, "loss": 34.105, "rewards/accuracies": 0.5, "rewards/chosen": -140.9961395263672, "rewards/margins": -14.764528274536133, "rewards/rejected": -126.23160552978516, "step": 9180 }, { "epoch": 0.53, "grad_norm": 5.768340471945521e-08, "learning_rate": 0.0008250706296683308, "logits/chosen": -13.667268753051758, "logits/rejected": -16.46430015563965, "logps/chosen": -2151.13427734375, "logps/rejected": -1760.7171630859375, "loss": 19.3243, "rewards/accuracies": 0.5, "rewards/chosen": -59.7153434753418, "rewards/margins": -5.393930912017822, "rewards/rejected": -54.3214111328125, "step": 9190 }, { "epoch": 0.53, "grad_norm": 9.689595792394812e-10, "learning_rate": 0.0008248771237276985, "logits/chosen": -10.279787063598633, "logits/rejected": -11.078729629516602, "logps/chosen": -2658.763427734375, "logps/rejected": -2341.802490234375, "loss": 19.3646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -114.14299011230469, "rewards/margins": 3.5692267417907715, "rewards/rejected": -117.71221923828125, "step": 9200 }, { "epoch": 0.53, "grad_norm": 2.687352180480957, "learning_rate": 0.0008246836177870662, "logits/chosen": -11.846173286437988, "logits/rejected": -11.595508575439453, "logps/chosen": -2489.6318359375, "logps/rejected": -2415.840087890625, "loss": 14.6525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -101.1736068725586, "rewards/margins": -6.315918922424316, "rewards/rejected": -94.8576889038086, "step": 9210 }, { "epoch": 0.53, "grad_norm": 158.8249969482422, "learning_rate": 0.0008244901118464338, "logits/chosen": -12.773447036743164, "logits/rejected": -15.0693998336792, "logps/chosen": -1924.984619140625, "logps/rejected": -1690.483154296875, "loss": 23.6071, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -94.05915069580078, "rewards/margins": -16.23952293395996, "rewards/rejected": -77.81962585449219, "step": 9220 }, { "epoch": 0.53, "grad_norm": 20.87021827697754, "learning_rate": 0.0008242966059058013, "logits/chosen": -14.007112503051758, "logits/rejected": -14.23765754699707, "logps/chosen": -2180.34619140625, "logps/rejected": -1925.5406494140625, "loss": 26.1363, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -119.72090148925781, "rewards/margins": -24.955204010009766, "rewards/rejected": -94.76568603515625, "step": 9230 }, { "epoch": 0.53, "grad_norm": 0.013048894703388214, "learning_rate": 0.0008241030999651689, "logits/chosen": -12.918479919433594, "logits/rejected": -11.968243598937988, "logps/chosen": -2468.30810546875, "logps/rejected": -2481.009033203125, "loss": 10.1932, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -98.40420532226562, "rewards/margins": -3.4973456859588623, "rewards/rejected": -94.90686798095703, "step": 9240 }, { "epoch": 0.54, "grad_norm": 0.00012203202641103417, "learning_rate": 0.0008239095940245365, "logits/chosen": -10.072267532348633, "logits/rejected": -11.66235065460205, "logps/chosen": -2860.320556640625, "logps/rejected": -2520.76220703125, "loss": 21.033, "rewards/accuracies": 0.5, "rewards/chosen": -98.27503967285156, "rewards/margins": -17.533266067504883, "rewards/rejected": -80.74177551269531, "step": 9250 }, { "epoch": 0.54, "grad_norm": 1.9695301034516888e-06, "learning_rate": 0.0008237160880839042, "logits/chosen": -11.27623176574707, "logits/rejected": -14.224752426147461, "logps/chosen": -2860.755126953125, "logps/rejected": -2420.46337890625, "loss": 12.066, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -68.60883331298828, "rewards/margins": -1.6656593084335327, "rewards/rejected": -66.94317626953125, "step": 9260 }, { "epoch": 0.54, "grad_norm": 202.675537109375, "learning_rate": 0.0008235225821432718, "logits/chosen": -12.886335372924805, "logits/rejected": -15.506538391113281, "logps/chosen": -2296.033935546875, "logps/rejected": -2105.817138671875, "loss": 18.2568, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -130.1437530517578, "rewards/margins": -13.418130874633789, "rewards/rejected": -116.72562408447266, "step": 9270 }, { "epoch": 0.54, "grad_norm": 11.793557167053223, "learning_rate": 0.0008233290762026394, "logits/chosen": -13.178936958312988, "logits/rejected": -13.380398750305176, "logps/chosen": -1655.115966796875, "logps/rejected": -1573.714599609375, "loss": 6.9525, "rewards/accuracies": 0.5, "rewards/chosen": -94.19901275634766, "rewards/margins": -0.3781314790248871, "rewards/rejected": -93.82087707519531, "step": 9280 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.000823135570262007, "logits/chosen": -10.931343078613281, "logits/rejected": -10.949737548828125, "logps/chosen": -2485.64404296875, "logps/rejected": -2006.208984375, "loss": 27.5601, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -104.06172180175781, "rewards/margins": -18.609174728393555, "rewards/rejected": -85.45256042480469, "step": 9290 }, { "epoch": 0.54, "grad_norm": 42.85221862792969, "learning_rate": 0.0008229420643213746, "logits/chosen": -11.57166862487793, "logits/rejected": -11.48002815246582, "logps/chosen": -2328.886962890625, "logps/rejected": -1900.1810302734375, "loss": 39.9992, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -158.37008666992188, "rewards/margins": -34.206581115722656, "rewards/rejected": -124.16349792480469, "step": 9300 }, { "epoch": 0.54, "grad_norm": 0.0022866094950586557, "learning_rate": 0.0008227485583807422, "logits/chosen": -12.623444557189941, "logits/rejected": -16.205917358398438, "logps/chosen": -1847.2138671875, "logps/rejected": -1321.2369384765625, "loss": 42.9973, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -153.86770629882812, "rewards/margins": -41.85033416748047, "rewards/rejected": -112.0173568725586, "step": 9310 }, { "epoch": 0.54, "grad_norm": 8.560800779378042e-08, "learning_rate": 0.00082255505244011, "logits/chosen": -9.599631309509277, "logits/rejected": -10.024946212768555, "logps/chosen": -2073.232421875, "logps/rejected": -2086.015625, "loss": 38.3388, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -103.964111328125, "rewards/margins": -29.858911514282227, "rewards/rejected": -74.1052017211914, "step": 9320 }, { "epoch": 0.54, "grad_norm": 9.356824104894912e-19, "learning_rate": 0.0008223615464994776, "logits/chosen": -10.536792755126953, "logits/rejected": -12.97068977355957, "logps/chosen": -1983.870361328125, "logps/rejected": -1297.036865234375, "loss": 43.6158, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -128.53939819335938, "rewards/margins": -26.82571029663086, "rewards/rejected": -101.71366882324219, "step": 9330 }, { "epoch": 0.54, "grad_norm": 45.94268035888672, "learning_rate": 0.0008221680405588452, "logits/chosen": -14.764120101928711, "logits/rejected": -15.033167839050293, "logps/chosen": -1795.800048828125, "logps/rejected": -1702.24609375, "loss": 14.8691, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -150.46121215820312, "rewards/margins": -6.221024513244629, "rewards/rejected": -144.24017333984375, "step": 9340 }, { "epoch": 0.54, "grad_norm": 123.841796875, "learning_rate": 0.0008219745346182128, "logits/chosen": -11.259889602661133, "logits/rejected": -13.9977445602417, "logps/chosen": -2015.2281494140625, "logps/rejected": -1758.702880859375, "loss": 27.8736, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -105.080322265625, "rewards/margins": -24.69887924194336, "rewards/rejected": -80.38145446777344, "step": 9350 }, { "epoch": 0.54, "grad_norm": 112.91950225830078, "learning_rate": 0.0008217810286775804, "logits/chosen": -13.084157943725586, "logits/rejected": -15.057512283325195, "logps/chosen": -2171.442138671875, "logps/rejected": -1972.863525390625, "loss": 28.0144, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.2085723876953, "rewards/margins": -14.636126518249512, "rewards/rejected": -163.5724639892578, "step": 9360 }, { "epoch": 0.54, "grad_norm": 52.70233917236328, "learning_rate": 0.000821587522736948, "logits/chosen": -12.8746337890625, "logits/rejected": -15.628274917602539, "logps/chosen": -2299.9013671875, "logps/rejected": -2093.0576171875, "loss": 31.708, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -127.45903015136719, "rewards/margins": -20.28015899658203, "rewards/rejected": -107.1788558959961, "step": 9370 }, { "epoch": 0.54, "grad_norm": 0.0030086925253272057, "learning_rate": 0.0008213940167963157, "logits/chosen": -10.73341178894043, "logits/rejected": -9.305550575256348, "logps/chosen": -1959.4779052734375, "logps/rejected": -2351.35107421875, "loss": 3.6097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -78.10276794433594, "rewards/margins": 22.89084815979004, "rewards/rejected": -100.99360656738281, "step": 9380 }, { "epoch": 0.54, "grad_norm": 1.7670190334320068, "learning_rate": 0.0008212005108556833, "logits/chosen": -4.709872722625732, "logits/rejected": -5.606122016906738, "logps/chosen": -2643.8642578125, "logps/rejected": -2390.008544921875, "loss": 18.7341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.289052963256836, "rewards/margins": -7.165470123291016, "rewards/rejected": -17.123584747314453, "step": 9390 }, { "epoch": 0.54, "grad_norm": 123.60616302490234, "learning_rate": 0.0008210070049150509, "logits/chosen": -12.605390548706055, "logits/rejected": -12.652070999145508, "logps/chosen": -1554.0621337890625, "logps/rejected": -1497.7220458984375, "loss": 17.4244, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -126.89418029785156, "rewards/margins": -3.009294033050537, "rewards/rejected": -123.8848876953125, "step": 9400 }, { "epoch": 0.54, "grad_norm": 1.870528480168326e-15, "learning_rate": 0.0008208134989744185, "logits/chosen": -12.689577102661133, "logits/rejected": -12.570475578308105, "logps/chosen": -2236.90771484375, "logps/rejected": -2008.886474609375, "loss": 11.999, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.15023040771484, "rewards/margins": 2.990631103515625, "rewards/rejected": -121.14085388183594, "step": 9410 }, { "epoch": 0.55, "grad_norm": 31.167707443237305, "learning_rate": 0.0008206199930337862, "logits/chosen": -14.73560619354248, "logits/rejected": -17.897262573242188, "logps/chosen": -1879.652587890625, "logps/rejected": -1888.34375, "loss": 7.1945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.34805297851562, "rewards/margins": 4.543313026428223, "rewards/rejected": -157.891357421875, "step": 9420 }, { "epoch": 0.55, "grad_norm": 65.51554107666016, "learning_rate": 0.0008204264870931539, "logits/chosen": -11.544507026672363, "logits/rejected": -13.22834587097168, "logps/chosen": -2308.16845703125, "logps/rejected": -1931.599853515625, "loss": 34.8912, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -115.28236389160156, "rewards/margins": -28.174907684326172, "rewards/rejected": -87.10746002197266, "step": 9430 }, { "epoch": 0.55, "grad_norm": 5.448008550956729e-07, "learning_rate": 0.0008202329811525215, "logits/chosen": -10.847806930541992, "logits/rejected": -11.602781295776367, "logps/chosen": -2157.336669921875, "logps/rejected": -2113.345947265625, "loss": 5.7947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -64.8131103515625, "rewards/margins": 0.3827091157436371, "rewards/rejected": -65.19581604003906, "step": 9440 }, { "epoch": 0.55, "grad_norm": 65.6622085571289, "learning_rate": 0.000820039475211889, "logits/chosen": -13.604113578796387, "logits/rejected": -13.83837604522705, "logps/chosen": -2147.584228515625, "logps/rejected": -1747.028076171875, "loss": 34.9639, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.47789001464844, "rewards/margins": -25.51144790649414, "rewards/rejected": -91.9664306640625, "step": 9450 }, { "epoch": 0.55, "grad_norm": 0.0015886790351942182, "learning_rate": 0.0008198459692712566, "logits/chosen": -12.68348503112793, "logits/rejected": -13.408116340637207, "logps/chosen": -2243.40283203125, "logps/rejected": -1513.94189453125, "loss": 21.8027, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.46076965332031, "rewards/margins": -10.282439231872559, "rewards/rejected": -113.1783218383789, "step": 9460 }, { "epoch": 0.55, "grad_norm": 9.347408294677734, "learning_rate": 0.0008196524633306242, "logits/chosen": -13.87774658203125, "logits/rejected": -14.019277572631836, "logps/chosen": -2032.9925537109375, "logps/rejected": -1926.488525390625, "loss": 12.2859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -129.9187469482422, "rewards/margins": -2.854235887527466, "rewards/rejected": -127.06449890136719, "step": 9470 }, { "epoch": 0.55, "grad_norm": 137.6370391845703, "learning_rate": 0.0008194589573899918, "logits/chosen": -13.2901029586792, "logits/rejected": -14.030738830566406, "logps/chosen": -1983.8828125, "logps/rejected": -1676.4547119140625, "loss": 22.0688, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -128.95118713378906, "rewards/margins": -21.6484375, "rewards/rejected": -107.302734375, "step": 9480 }, { "epoch": 0.55, "grad_norm": 157.30047607421875, "learning_rate": 0.0008192654514493595, "logits/chosen": -13.29333782196045, "logits/rejected": -14.197527885437012, "logps/chosen": -2551.377685546875, "logps/rejected": -2224.91845703125, "loss": 26.6288, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -141.33987426757812, "rewards/margins": -24.389026641845703, "rewards/rejected": -116.95084381103516, "step": 9490 }, { "epoch": 0.55, "grad_norm": 153.16653442382812, "learning_rate": 0.0008190719455087271, "logits/chosen": -15.329946517944336, "logits/rejected": -17.89394760131836, "logps/chosen": -2618.682861328125, "logps/rejected": -2108.26416015625, "loss": 46.1504, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -202.24929809570312, "rewards/margins": -44.67238235473633, "rewards/rejected": -157.57693481445312, "step": 9500 }, { "epoch": 0.55, "grad_norm": 97.2415771484375, "learning_rate": 0.0008188784395680947, "logits/chosen": -13.069684028625488, "logits/rejected": -14.118746757507324, "logps/chosen": -1869.7421875, "logps/rejected": -1867.3372802734375, "loss": 10.5184, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.36351013183594, "rewards/margins": 1.3165992498397827, "rewards/rejected": -119.68011474609375, "step": 9510 }, { "epoch": 0.55, "grad_norm": 216.78829956054688, "learning_rate": 0.0008186849336274623, "logits/chosen": -14.086462020874023, "logits/rejected": -12.693609237670898, "logps/chosen": -1925.612060546875, "logps/rejected": -1901.5015869140625, "loss": 4.5926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -127.94816589355469, "rewards/margins": 1.0508416891098022, "rewards/rejected": -128.99900817871094, "step": 9520 }, { "epoch": 0.55, "grad_norm": 111.29039764404297, "learning_rate": 0.00081849142768683, "logits/chosen": -12.428119659423828, "logits/rejected": -15.0492525100708, "logps/chosen": -2089.381591796875, "logps/rejected": -2043.737060546875, "loss": 21.1497, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -89.95316314697266, "rewards/margins": -5.621490478515625, "rewards/rejected": -84.33168029785156, "step": 9530 }, { "epoch": 0.55, "grad_norm": 1.054912335263225e-08, "learning_rate": 0.0008182979217461977, "logits/chosen": -13.438592910766602, "logits/rejected": -15.182182312011719, "logps/chosen": -1721.1396484375, "logps/rejected": -1491.5618896484375, "loss": 19.9846, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -141.54745483398438, "rewards/margins": -14.136856079101562, "rewards/rejected": -127.41060638427734, "step": 9540 }, { "epoch": 0.55, "grad_norm": 68.04295349121094, "learning_rate": 0.0008181044158055653, "logits/chosen": -13.105778694152832, "logits/rejected": -17.412616729736328, "logps/chosen": -2056.965087890625, "logps/rejected": -1892.180419921875, "loss": 12.2438, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -169.26235961914062, "rewards/margins": -3.914142608642578, "rewards/rejected": -165.34820556640625, "step": 9550 }, { "epoch": 0.55, "grad_norm": 2.03547824639827e-06, "learning_rate": 0.0008179109098649329, "logits/chosen": -10.61835765838623, "logits/rejected": -10.968077659606934, "logps/chosen": -2330.36328125, "logps/rejected": -1774.272216796875, "loss": 22.9013, "rewards/accuracies": 0.5, "rewards/chosen": -131.39315795898438, "rewards/margins": -6.209153652191162, "rewards/rejected": -125.18399810791016, "step": 9560 }, { "epoch": 0.55, "grad_norm": 8.493576331147779e-08, "learning_rate": 0.0008177174039243005, "logits/chosen": -10.946969985961914, "logits/rejected": -10.362699508666992, "logps/chosen": -2109.11083984375, "logps/rejected": -2043.341064453125, "loss": 17.0163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -100.10649108886719, "rewards/margins": 0.3516899049282074, "rewards/rejected": -100.45818328857422, "step": 9570 }, { "epoch": 0.55, "grad_norm": 59.422325134277344, "learning_rate": 0.0008175238979836681, "logits/chosen": -9.663399696350098, "logits/rejected": -10.45376968383789, "logps/chosen": -2240.4140625, "logps/rejected": -2161.5986328125, "loss": 6.9429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -61.351966857910156, "rewards/margins": 4.958788871765137, "rewards/rejected": -66.31076049804688, "step": 9580 }, { "epoch": 0.56, "grad_norm": 93.50257110595703, "learning_rate": 0.0008173303920430357, "logits/chosen": -13.68664264678955, "logits/rejected": -16.67251968383789, "logps/chosen": -2221.343017578125, "logps/rejected": -1827.4000244140625, "loss": 31.5426, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -184.47821044921875, "rewards/margins": -29.061471939086914, "rewards/rejected": -155.416748046875, "step": 9590 }, { "epoch": 0.56, "grad_norm": 1.010399273582152e-06, "learning_rate": 0.0008171368861024034, "logits/chosen": -11.032011032104492, "logits/rejected": -12.911949157714844, "logps/chosen": -2404.41259765625, "logps/rejected": -1849.030517578125, "loss": 15.8506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -89.66154479980469, "rewards/margins": 17.390544891357422, "rewards/rejected": -107.05207824707031, "step": 9600 }, { "epoch": 0.56, "grad_norm": 6.609341198782204e-07, "learning_rate": 0.000816943380161771, "logits/chosen": -13.20196533203125, "logits/rejected": -13.947955131530762, "logps/chosen": -2022.3291015625, "logps/rejected": -1695.206787109375, "loss": 17.1719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -134.63084411621094, "rewards/margins": -5.593421936035156, "rewards/rejected": -129.0374298095703, "step": 9610 }, { "epoch": 0.56, "grad_norm": 10.77879524230957, "learning_rate": 0.0008167498742211386, "logits/chosen": -11.752837181091309, "logits/rejected": -11.610013961791992, "logps/chosen": -2312.151123046875, "logps/rejected": -2225.67626953125, "loss": 6.6763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.0223388671875, "rewards/margins": -1.221543550491333, "rewards/rejected": -130.80078125, "step": 9620 }, { "epoch": 0.56, "grad_norm": 114.01460266113281, "learning_rate": 0.0008165563682805063, "logits/chosen": -10.968283653259277, "logits/rejected": -11.204283714294434, "logps/chosen": -2475.881591796875, "logps/rejected": -2379.093017578125, "loss": 2.694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.19474029541016, "rewards/margins": 13.685453414916992, "rewards/rejected": -111.88018798828125, "step": 9630 }, { "epoch": 0.56, "grad_norm": 1.4145904970064294e-05, "learning_rate": 0.0008163628623398739, "logits/chosen": -16.662626266479492, "logits/rejected": -17.580074310302734, "logps/chosen": -2598.02734375, "logps/rejected": -2524.24462890625, "loss": 18.9901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -148.58554077148438, "rewards/margins": -3.2575364112854004, "rewards/rejected": -145.3280029296875, "step": 9640 }, { "epoch": 0.56, "grad_norm": 342.2525634765625, "learning_rate": 0.0008161693563992416, "logits/chosen": -16.459646224975586, "logits/rejected": -20.533031463623047, "logps/chosen": -2162.137939453125, "logps/rejected": -2011.792236328125, "loss": 9.1095, "rewards/accuracies": 0.5, "rewards/chosen": -150.43199157714844, "rewards/margins": -3.6015846729278564, "rewards/rejected": -146.83041381835938, "step": 9650 }, { "epoch": 0.56, "grad_norm": 142.2134552001953, "learning_rate": 0.0008159758504586092, "logits/chosen": -12.118268966674805, "logits/rejected": -13.48572063446045, "logps/chosen": -2611.421875, "logps/rejected": -2186.361083984375, "loss": 28.9856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -114.92079162597656, "rewards/margins": -20.452194213867188, "rewards/rejected": -94.46859741210938, "step": 9660 }, { "epoch": 0.56, "grad_norm": 6.151932047045099e-21, "learning_rate": 0.0008157823445179767, "logits/chosen": -11.532811164855957, "logits/rejected": -12.66594409942627, "logps/chosen": -2160.42626953125, "logps/rejected": -1693.835205078125, "loss": 12.6022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -74.81261444091797, "rewards/margins": 14.179158210754395, "rewards/rejected": -88.99177551269531, "step": 9670 }, { "epoch": 0.56, "grad_norm": 153.7386932373047, "learning_rate": 0.0008155888385773443, "logits/chosen": -14.817545890808105, "logits/rejected": -15.042956352233887, "logps/chosen": -2094.99462890625, "logps/rejected": -2021.8978271484375, "loss": 13.1437, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -142.20547485351562, "rewards/margins": -7.843331813812256, "rewards/rejected": -134.3621368408203, "step": 9680 }, { "epoch": 0.56, "grad_norm": 152.37193298339844, "learning_rate": 0.0008153953326367119, "logits/chosen": -11.687958717346191, "logits/rejected": -11.724588394165039, "logps/chosen": -2099.8193359375, "logps/rejected": -2280.37451171875, "loss": 18.7454, "rewards/accuracies": 0.5, "rewards/chosen": -105.5866928100586, "rewards/margins": -1.5652450323104858, "rewards/rejected": -104.02144622802734, "step": 9690 }, { "epoch": 0.56, "grad_norm": 69.84230041503906, "learning_rate": 0.0008152018266960795, "logits/chosen": -11.68011474609375, "logits/rejected": -12.39799690246582, "logps/chosen": -2425.177734375, "logps/rejected": -1980.1177978515625, "loss": 19.4294, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -97.80769348144531, "rewards/margins": -12.199910163879395, "rewards/rejected": -85.60777282714844, "step": 9700 }, { "epoch": 0.56, "grad_norm": 129.51907348632812, "learning_rate": 0.0008150083207554471, "logits/chosen": -10.654571533203125, "logits/rejected": -10.546390533447266, "logps/chosen": -2122.279052734375, "logps/rejected": -1983.3258056640625, "loss": 4.2853, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.0965576171875, "rewards/margins": 5.391696929931641, "rewards/rejected": -54.488258361816406, "step": 9710 }, { "epoch": 0.56, "grad_norm": 5.042804828126965e-16, "learning_rate": 0.0008148148148148148, "logits/chosen": -10.549596786499023, "logits/rejected": -11.287389755249023, "logps/chosen": -2276.50146484375, "logps/rejected": -2069.882568359375, "loss": 20.1043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.74592590332031, "rewards/margins": -10.48997688293457, "rewards/rejected": -110.25593566894531, "step": 9720 }, { "epoch": 0.56, "grad_norm": 0.009204215370118618, "learning_rate": 0.0008146213088741825, "logits/chosen": -11.742853164672852, "logits/rejected": -12.819241523742676, "logps/chosen": -2300.08740234375, "logps/rejected": -1992.831787109375, "loss": 25.5611, "rewards/accuracies": 0.5, "rewards/chosen": -147.17379760742188, "rewards/margins": -14.3836088180542, "rewards/rejected": -132.79019165039062, "step": 9730 }, { "epoch": 0.56, "grad_norm": 5.970980884750267e-15, "learning_rate": 0.0008144278029335501, "logits/chosen": -12.094931602478027, "logits/rejected": -13.305315971374512, "logps/chosen": -1921.061279296875, "logps/rejected": -1870.3092041015625, "loss": 6.6695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -97.10111999511719, "rewards/margins": 16.423782348632812, "rewards/rejected": -113.52490234375, "step": 9740 }, { "epoch": 0.56, "grad_norm": 2.4252139349084352e-15, "learning_rate": 0.0008142342969929177, "logits/chosen": -11.505024909973145, "logits/rejected": -12.117867469787598, "logps/chosen": -1978.196044921875, "logps/rejected": -1857.2359619140625, "loss": 21.407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -94.6145248413086, "rewards/margins": -9.345014572143555, "rewards/rejected": -85.26951599121094, "step": 9750 }, { "epoch": 0.56, "grad_norm": 38.198429107666016, "learning_rate": 0.0008140407910522853, "logits/chosen": -10.870759963989258, "logits/rejected": -10.592488288879395, "logps/chosen": -2245.942138671875, "logps/rejected": -2301.28369140625, "loss": 12.4907, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.25923156738281, "rewards/margins": 1.9756702184677124, "rewards/rejected": -90.23490142822266, "step": 9760 }, { "epoch": 0.57, "grad_norm": 58.076717376708984, "learning_rate": 0.000813847285111653, "logits/chosen": -12.209421157836914, "logits/rejected": -14.326794624328613, "logps/chosen": -2134.402099609375, "logps/rejected": -1951.305908203125, "loss": 18.9411, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.73756408691406, "rewards/margins": -5.9144697189331055, "rewards/rejected": -131.82308959960938, "step": 9770 }, { "epoch": 0.57, "grad_norm": 96.45757293701172, "learning_rate": 0.0008136537791710206, "logits/chosen": -13.606592178344727, "logits/rejected": -14.936346054077148, "logps/chosen": -2547.07763671875, "logps/rejected": -2278.72021484375, "loss": 31.1011, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -149.38551330566406, "rewards/margins": -26.008047103881836, "rewards/rejected": -123.3774642944336, "step": 9780 }, { "epoch": 0.57, "grad_norm": 43.43875503540039, "learning_rate": 0.0008134602732303882, "logits/chosen": -11.674620628356934, "logits/rejected": -12.225374221801758, "logps/chosen": -2559.059814453125, "logps/rejected": -2181.497314453125, "loss": 24.383, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -110.96610260009766, "rewards/margins": -18.244354248046875, "rewards/rejected": -92.72175598144531, "step": 9790 }, { "epoch": 0.57, "grad_norm": 77.05213165283203, "learning_rate": 0.0008132667672897558, "logits/chosen": -11.7524995803833, "logits/rejected": -11.944887161254883, "logps/chosen": -2083.619873046875, "logps/rejected": -1996.3531494140625, "loss": 9.6483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -139.1990203857422, "rewards/margins": 2.7757046222686768, "rewards/rejected": -141.97471618652344, "step": 9800 }, { "epoch": 0.57, "grad_norm": 5.871984958648682, "learning_rate": 0.0008130732613491234, "logits/chosen": -11.36268424987793, "logits/rejected": -10.99272346496582, "logps/chosen": -1756.9521484375, "logps/rejected": -1689.1416015625, "loss": 8.0768, "rewards/accuracies": 0.5, "rewards/chosen": -90.03778839111328, "rewards/margins": 5.755528926849365, "rewards/rejected": -95.79331970214844, "step": 9810 }, { "epoch": 0.57, "grad_norm": 5.083691172018401e-14, "learning_rate": 0.000812879755408491, "logits/chosen": -10.554896354675293, "logits/rejected": -10.38363265991211, "logps/chosen": -2271.612548828125, "logps/rejected": -2324.105224609375, "loss": 7.5172, "rewards/accuracies": 0.5, "rewards/chosen": -79.17463684082031, "rewards/margins": 3.9587721824645996, "rewards/rejected": -83.13340759277344, "step": 9820 }, { "epoch": 0.57, "grad_norm": 0.3029027581214905, "learning_rate": 0.0008126862494678587, "logits/chosen": -12.355805397033691, "logits/rejected": -12.233280181884766, "logps/chosen": -2074.97412109375, "logps/rejected": -2114.395263671875, "loss": 8.5608, "rewards/accuracies": 0.5, "rewards/chosen": -123.88581848144531, "rewards/margins": 2.0759761333465576, "rewards/rejected": -125.9617919921875, "step": 9830 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 0.0008124927435272264, "logits/chosen": -12.295319557189941, "logits/rejected": -12.533897399902344, "logps/chosen": -2403.55322265625, "logps/rejected": -2252.057373046875, "loss": 1.1229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -96.18389129638672, "rewards/margins": 16.810285568237305, "rewards/rejected": -112.9941635131836, "step": 9840 }, { "epoch": 0.57, "grad_norm": 51.383644104003906, "learning_rate": 0.000812299237586594, "logits/chosen": -12.749072074890137, "logits/rejected": -12.801861763000488, "logps/chosen": -2085.2109375, "logps/rejected": -2171.98828125, "loss": 16.9021, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -102.09233856201172, "rewards/margins": -16.525897979736328, "rewards/rejected": -85.56644439697266, "step": 9850 }, { "epoch": 0.57, "grad_norm": 1.563256859779358, "learning_rate": 0.0008121057316459616, "logits/chosen": -12.138263702392578, "logits/rejected": -12.339334487915039, "logps/chosen": -2080.177734375, "logps/rejected": -1960.3369140625, "loss": 33.0766, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.42698669433594, "rewards/margins": -26.563003540039062, "rewards/rejected": -91.8639907836914, "step": 9860 }, { "epoch": 0.57, "grad_norm": 13.901864051818848, "learning_rate": 0.0008119122257053292, "logits/chosen": -11.932510375976562, "logits/rejected": -11.819125175476074, "logps/chosen": -2353.06201171875, "logps/rejected": -2329.762939453125, "loss": 10.4809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.240234375, "rewards/margins": 6.6669158935546875, "rewards/rejected": -108.90715026855469, "step": 9870 }, { "epoch": 0.57, "grad_norm": 62.5307731628418, "learning_rate": 0.0008117187197646967, "logits/chosen": -10.117940902709961, "logits/rejected": -11.37667179107666, "logps/chosen": -2727.2177734375, "logps/rejected": -2398.58447265625, "loss": 12.4602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -57.074012756347656, "rewards/margins": -6.9993391036987305, "rewards/rejected": -50.074668884277344, "step": 9880 }, { "epoch": 0.57, "grad_norm": 2.571602653045346e-10, "learning_rate": 0.0008115252138240644, "logits/chosen": -10.288829803466797, "logits/rejected": -10.213254928588867, "logps/chosen": -2946.0361328125, "logps/rejected": -2835.12939453125, "loss": 10.0361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.60501098632812, "rewards/margins": 4.984689712524414, "rewards/rejected": -96.58970642089844, "step": 9890 }, { "epoch": 0.57, "grad_norm": 129.60333251953125, "learning_rate": 0.000811331707883432, "logits/chosen": -11.163722038269043, "logits/rejected": -13.9684419631958, "logps/chosen": -2475.223876953125, "logps/rejected": -2551.076171875, "loss": 23.013, "rewards/accuracies": 0.5, "rewards/chosen": -98.42587280273438, "rewards/margins": -6.523746490478516, "rewards/rejected": -91.9021224975586, "step": 9900 }, { "epoch": 0.57, "grad_norm": 2.9755231025774265e-06, "learning_rate": 0.0008111382019427996, "logits/chosen": -12.232996940612793, "logits/rejected": -14.090143203735352, "logps/chosen": -2774.530029296875, "logps/rejected": -2527.002685546875, "loss": 19.6009, "rewards/accuracies": 0.5, "rewards/chosen": -88.79605102539062, "rewards/margins": -4.650814056396484, "rewards/rejected": -84.14522552490234, "step": 9910 }, { "epoch": 0.57, "grad_norm": 47.623043060302734, "learning_rate": 0.0008109446960021672, "logits/chosen": -12.901286125183105, "logits/rejected": -14.769037246704102, "logps/chosen": -2010.156494140625, "logps/rejected": -1739.495361328125, "loss": 8.9352, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -138.6757049560547, "rewards/margins": 2.1522769927978516, "rewards/rejected": -140.82797241210938, "step": 9920 }, { "epoch": 0.57, "grad_norm": 0.03816075995564461, "learning_rate": 0.0008107511900615348, "logits/chosen": -11.541097640991211, "logits/rejected": -11.885915756225586, "logps/chosen": -2233.47216796875, "logps/rejected": -2184.30029296875, "loss": 8.0009, "rewards/accuracies": 0.5, "rewards/chosen": -110.6299819946289, "rewards/margins": 2.881948947906494, "rewards/rejected": -113.51192474365234, "step": 9930 }, { "epoch": 0.58, "grad_norm": 56.23078536987305, "learning_rate": 0.0008105576841209026, "logits/chosen": -10.855475425720215, "logits/rejected": -10.970120429992676, "logps/chosen": -2614.212646484375, "logps/rejected": -2538.869140625, "loss": 14.6141, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -49.29011917114258, "rewards/margins": -4.5595197677612305, "rewards/rejected": -44.73059844970703, "step": 9940 }, { "epoch": 0.58, "grad_norm": 89.28201293945312, "learning_rate": 0.0008103641781802702, "logits/chosen": -13.973526000976562, "logits/rejected": -14.771026611328125, "logps/chosen": -2288.79638671875, "logps/rejected": -2074.807861328125, "loss": 18.1398, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -165.72650146484375, "rewards/margins": -10.081003189086914, "rewards/rejected": -155.6455078125, "step": 9950 }, { "epoch": 0.58, "grad_norm": 80.27143096923828, "learning_rate": 0.0008101706722396378, "logits/chosen": -11.260457038879395, "logits/rejected": -12.736414909362793, "logps/chosen": -2432.70751953125, "logps/rejected": -2086.272705078125, "loss": 17.7016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.34976196289062, "rewards/margins": -11.75238037109375, "rewards/rejected": -90.59737396240234, "step": 9960 }, { "epoch": 0.58, "grad_norm": 4.7326003855421056e-17, "learning_rate": 0.0008099771662990054, "logits/chosen": -12.420536994934082, "logits/rejected": -14.486486434936523, "logps/chosen": -2407.95947265625, "logps/rejected": -1871.821044921875, "loss": 28.6512, "rewards/accuracies": 0.5, "rewards/chosen": -127.78287506103516, "rewards/margins": -18.320636749267578, "rewards/rejected": -109.46223449707031, "step": 9970 }, { "epoch": 0.58, "grad_norm": 5.137393177534477e-09, "learning_rate": 0.000809783660358373, "logits/chosen": -13.009714126586914, "logits/rejected": -12.783025741577148, "logps/chosen": -2109.97607421875, "logps/rejected": -1760.408203125, "loss": 19.9107, "rewards/accuracies": 0.5, "rewards/chosen": -140.8122100830078, "rewards/margins": -5.3908843994140625, "rewards/rejected": -135.4213104248047, "step": 9980 }, { "epoch": 0.58, "grad_norm": 99.3053207397461, "learning_rate": 0.0008095901544177406, "logits/chosen": -13.14976978302002, "logits/rejected": -14.143655776977539, "logps/chosen": -2643.01025390625, "logps/rejected": -2353.74951171875, "loss": 15.9978, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -183.95950317382812, "rewards/margins": -12.696640968322754, "rewards/rejected": -171.26287841796875, "step": 9990 }, { "epoch": 0.58, "grad_norm": 5.5997464443180434e-08, "learning_rate": 0.0008093966484771083, "logits/chosen": -14.131141662597656, "logits/rejected": -13.994781494140625, "logps/chosen": -2438.180908203125, "logps/rejected": -2068.4111328125, "loss": 32.4372, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -168.11001586914062, "rewards/margins": -23.992712020874023, "rewards/rejected": -144.11729431152344, "step": 10000 }, { "epoch": 0.58, "grad_norm": 3.420417560295874e-10, "learning_rate": 0.0008092031425364759, "logits/chosen": -13.879406929016113, "logits/rejected": -13.716636657714844, "logps/chosen": -2104.41650390625, "logps/rejected": -2431.9599609375, "loss": 12.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.90599060058594, "rewards/margins": 0.6070358157157898, "rewards/rejected": -142.51303100585938, "step": 10010 }, { "epoch": 0.58, "grad_norm": 131.8125, "learning_rate": 0.0008090096365958435, "logits/chosen": -12.695362091064453, "logits/rejected": -12.925582885742188, "logps/chosen": -2575.844482421875, "logps/rejected": -2234.220458984375, "loss": 19.0464, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -171.5371551513672, "rewards/margins": -11.203272819519043, "rewards/rejected": -160.33389282226562, "step": 10020 }, { "epoch": 0.58, "grad_norm": 118.51277160644531, "learning_rate": 0.0008088161306552111, "logits/chosen": -14.206738471984863, "logits/rejected": -21.22146987915039, "logps/chosen": -2140.228271484375, "logps/rejected": -1934.8583984375, "loss": 23.0436, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -152.990478515625, "rewards/margins": -12.66595458984375, "rewards/rejected": -140.32452392578125, "step": 10030 }, { "epoch": 0.58, "grad_norm": 58.60726547241211, "learning_rate": 0.0008086226247145787, "logits/chosen": -12.987271308898926, "logits/rejected": -13.512300491333008, "logps/chosen": -2252.151611328125, "logps/rejected": -2126.00537109375, "loss": 12.1294, "rewards/accuracies": 0.5, "rewards/chosen": -151.60348510742188, "rewards/margins": -3.066319227218628, "rewards/rejected": -148.5371856689453, "step": 10040 }, { "epoch": 0.58, "grad_norm": 75.7551498413086, "learning_rate": 0.0008084291187739465, "logits/chosen": -10.485158920288086, "logits/rejected": -11.594807624816895, "logps/chosen": -2401.589111328125, "logps/rejected": -2358.310302734375, "loss": 22.1757, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -69.80144500732422, "rewards/margins": -17.608224868774414, "rewards/rejected": -52.19321823120117, "step": 10050 }, { "epoch": 0.58, "grad_norm": 4.361804293402313e-14, "learning_rate": 0.0008082356128333141, "logits/chosen": -10.365533828735352, "logits/rejected": -10.440832138061523, "logps/chosen": -1970.095703125, "logps/rejected": -1805.238525390625, "loss": 7.9144, "rewards/accuracies": 0.5, "rewards/chosen": -19.975418090820312, "rewards/margins": 7.901106357574463, "rewards/rejected": -27.876529693603516, "step": 10060 }, { "epoch": 0.58, "grad_norm": 159.6342315673828, "learning_rate": 0.0008080421068926817, "logits/chosen": -12.526859283447266, "logits/rejected": -12.78009033203125, "logps/chosen": -1986.3480224609375, "logps/rejected": -1788.820068359375, "loss": 20.1475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -112.99720764160156, "rewards/margins": -18.67940902709961, "rewards/rejected": -94.31778717041016, "step": 10070 }, { "epoch": 0.58, "grad_norm": 0.041198164224624634, "learning_rate": 0.0008078486009520493, "logits/chosen": -9.862557411193848, "logits/rejected": -10.343873977661133, "logps/chosen": -2528.88232421875, "logps/rejected": -1685.7672119140625, "loss": 16.887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -57.9545783996582, "rewards/margins": -0.7655437588691711, "rewards/rejected": -57.18903732299805, "step": 10080 }, { "epoch": 0.58, "grad_norm": 0.02883530780673027, "learning_rate": 0.0008076550950114169, "logits/chosen": -14.381139755249023, "logits/rejected": -13.844215393066406, "logps/chosen": -1885.3629150390625, "logps/rejected": -1861.2611083984375, "loss": 9.6931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.843017578125, "rewards/margins": 0.6425994634628296, "rewards/rejected": -160.48562622070312, "step": 10090 }, { "epoch": 0.58, "grad_norm": 159.7011260986328, "learning_rate": 0.0008074615890707844, "logits/chosen": -12.77823543548584, "logits/rejected": -13.5620698928833, "logps/chosen": -1958.9586181640625, "logps/rejected": -1709.574951171875, "loss": 11.4056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -106.13945007324219, "rewards/margins": -1.1357860565185547, "rewards/rejected": -105.00367736816406, "step": 10100 }, { "epoch": 0.59, "grad_norm": 210.13998413085938, "learning_rate": 0.000807268083130152, "logits/chosen": -8.75949478149414, "logits/rejected": -8.718608856201172, "logps/chosen": -2760.040771484375, "logps/rejected": -2398.6044921875, "loss": 22.8172, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -119.51164245605469, "rewards/margins": -13.37114429473877, "rewards/rejected": -106.1405029296875, "step": 10110 }, { "epoch": 0.59, "grad_norm": 210.57958984375, "learning_rate": 0.0008070745771895197, "logits/chosen": -14.396418571472168, "logits/rejected": -15.155987739562988, "logps/chosen": -2399.3876953125, "logps/rejected": -1872.856201171875, "loss": 41.269, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -203.7978057861328, "rewards/margins": -36.154075622558594, "rewards/rejected": -167.6437225341797, "step": 10120 }, { "epoch": 0.59, "grad_norm": 78.9296875, "learning_rate": 0.0008068810712488873, "logits/chosen": -10.483707427978516, "logits/rejected": -10.223383903503418, "logps/chosen": -2614.268798828125, "logps/rejected": -2686.657958984375, "loss": 5.1509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.33894348144531, "rewards/margins": 11.501343727111816, "rewards/rejected": -127.84028625488281, "step": 10130 }, { "epoch": 0.59, "grad_norm": 288.24505615234375, "learning_rate": 0.0008066875653082549, "logits/chosen": -9.59766960144043, "logits/rejected": -9.607951164245605, "logps/chosen": -2448.000244140625, "logps/rejected": -2285.1953125, "loss": 26.2675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -112.60009765625, "rewards/margins": -15.220464706420898, "rewards/rejected": -97.379638671875, "step": 10140 }, { "epoch": 0.59, "grad_norm": 126.83595275878906, "learning_rate": 0.0008064940593676226, "logits/chosen": -9.668432235717773, "logits/rejected": -10.212553977966309, "logps/chosen": -2378.347900390625, "logps/rejected": -2142.68701171875, "loss": 30.7402, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -132.30538940429688, "rewards/margins": -22.169218063354492, "rewards/rejected": -110.13618469238281, "step": 10150 }, { "epoch": 0.59, "grad_norm": 2.0461788177490234, "learning_rate": 0.0008063005534269902, "logits/chosen": -8.962995529174805, "logits/rejected": -8.885539054870605, "logps/chosen": -2932.78076171875, "logps/rejected": -2684.18115234375, "loss": 8.1913, "rewards/accuracies": 0.5, "rewards/chosen": -79.72918701171875, "rewards/margins": 0.613189697265625, "rewards/rejected": -80.34236907958984, "step": 10160 }, { "epoch": 0.59, "grad_norm": 481.7521057128906, "learning_rate": 0.0008061070474863579, "logits/chosen": -11.65300178527832, "logits/rejected": -11.519678115844727, "logps/chosen": -2899.568359375, "logps/rejected": -2195.948974609375, "loss": 14.6594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.70953369140625, "rewards/margins": 9.450156211853027, "rewards/rejected": -144.15968322753906, "step": 10170 }, { "epoch": 0.59, "grad_norm": 6.866490709356299e-10, "learning_rate": 0.0008059135415457255, "logits/chosen": -15.440653800964355, "logits/rejected": -15.634721755981445, "logps/chosen": -1984.2135009765625, "logps/rejected": -1916.6695556640625, "loss": 7.1398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.465087890625, "rewards/margins": -2.135547161102295, "rewards/rejected": -173.3295440673828, "step": 10180 }, { "epoch": 0.59, "grad_norm": 35.132320404052734, "learning_rate": 0.0008057200356050931, "logits/chosen": -10.76207160949707, "logits/rejected": -10.279067039489746, "logps/chosen": -2077.56982421875, "logps/rejected": -1863.096923828125, "loss": 18.4221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -99.22354888916016, "rewards/margins": 3.935307264328003, "rewards/rejected": -103.15885162353516, "step": 10190 }, { "epoch": 0.59, "grad_norm": 48.719154357910156, "learning_rate": 0.0008055265296644607, "logits/chosen": -9.75456428527832, "logits/rejected": -9.793771743774414, "logps/chosen": -2363.661376953125, "logps/rejected": -2390.253173828125, "loss": 10.0195, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -45.39698028564453, "rewards/margins": -1.7757911682128906, "rewards/rejected": -43.62118911743164, "step": 10200 }, { "epoch": 0.59, "grad_norm": 0.0011246463982388377, "learning_rate": 0.0008053330237238283, "logits/chosen": -12.629924774169922, "logits/rejected": -12.74519157409668, "logps/chosen": -2368.33544921875, "logps/rejected": -2217.760498046875, "loss": 17.8575, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -136.43792724609375, "rewards/margins": -16.013530731201172, "rewards/rejected": -120.42439270019531, "step": 10210 }, { "epoch": 0.59, "grad_norm": 45.87723922729492, "learning_rate": 0.000805139517783196, "logits/chosen": -12.503347396850586, "logits/rejected": -13.400065422058105, "logps/chosen": -2631.598876953125, "logps/rejected": -2280.42822265625, "loss": 25.952, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -117.59112548828125, "rewards/margins": -22.327144622802734, "rewards/rejected": -95.26396942138672, "step": 10220 }, { "epoch": 0.59, "grad_norm": 50.60591506958008, "learning_rate": 0.0008049460118425636, "logits/chosen": -10.715167999267578, "logits/rejected": -10.775490760803223, "logps/chosen": -2224.1201171875, "logps/rejected": -1891.5869140625, "loss": 7.7702, "rewards/accuracies": 0.5, "rewards/chosen": -80.5379409790039, "rewards/margins": 4.936086654663086, "rewards/rejected": -85.4740219116211, "step": 10230 }, { "epoch": 0.59, "grad_norm": 0.00024346048303414136, "learning_rate": 0.0008047525059019312, "logits/chosen": -9.833361625671387, "logits/rejected": -9.880647659301758, "logps/chosen": -2079.823974609375, "logps/rejected": -1861.644287109375, "loss": 17.9884, "rewards/accuracies": 0.5, "rewards/chosen": -166.5228729248047, "rewards/margins": -10.570991516113281, "rewards/rejected": -155.95188903808594, "step": 10240 }, { "epoch": 0.59, "grad_norm": 82.70262908935547, "learning_rate": 0.0008045589999612988, "logits/chosen": -8.489569664001465, "logits/rejected": -8.368816375732422, "logps/chosen": -2328.11328125, "logps/rejected": -1872.5445556640625, "loss": 20.7704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -57.751441955566406, "rewards/margins": -9.054336547851562, "rewards/rejected": -48.697105407714844, "step": 10250 }, { "epoch": 0.59, "grad_norm": 1.8213118124776884e-08, "learning_rate": 0.0008043654940206665, "logits/chosen": -10.09893798828125, "logits/rejected": -10.129434585571289, "logps/chosen": -2154.93505859375, "logps/rejected": -1740.3922119140625, "loss": 19.3363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.99684143066406, "rewards/margins": 4.512127876281738, "rewards/rejected": -112.50897216796875, "step": 10260 }, { "epoch": 0.59, "grad_norm": 46.519168853759766, "learning_rate": 0.0008041719880800341, "logits/chosen": -11.252659797668457, "logits/rejected": -11.191152572631836, "logps/chosen": -2185.15576171875, "logps/rejected": -1984.29296875, "loss": 11.5856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.31887817382812, "rewards/margins": -0.3092994689941406, "rewards/rejected": -145.00958251953125, "step": 10270 }, { "epoch": 0.6, "grad_norm": 35.130130767822266, "learning_rate": 0.0008039784821394018, "logits/chosen": -11.905051231384277, "logits/rejected": -11.918854713439941, "logps/chosen": -2243.390380859375, "logps/rejected": -2003.035888671875, "loss": 6.7194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -52.184722900390625, "rewards/margins": 8.007227897644043, "rewards/rejected": -60.19195556640625, "step": 10280 }, { "epoch": 0.6, "grad_norm": 147.47454833984375, "learning_rate": 0.0008037849761987694, "logits/chosen": -11.110368728637695, "logits/rejected": -11.306907653808594, "logps/chosen": -2325.33251953125, "logps/rejected": -2197.703857421875, "loss": 6.5582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -52.8930549621582, "rewards/margins": 24.63019561767578, "rewards/rejected": -77.52325439453125, "step": 10290 }, { "epoch": 0.6, "grad_norm": 33.18565368652344, "learning_rate": 0.000803591470258137, "logits/chosen": -13.363093376159668, "logits/rejected": -15.509785652160645, "logps/chosen": -2352.45068359375, "logps/rejected": -2103.90576171875, "loss": 21.997, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.09420776367188, "rewards/margins": -12.603940963745117, "rewards/rejected": -151.49026489257812, "step": 10300 }, { "epoch": 0.6, "grad_norm": 88.24436950683594, "learning_rate": 0.0008033979643175046, "logits/chosen": -14.614659309387207, "logits/rejected": -14.643617630004883, "logps/chosen": -2203.143798828125, "logps/rejected": -1859.466064453125, "loss": 31.8743, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -186.88265991210938, "rewards/margins": -26.545513153076172, "rewards/rejected": -160.33714294433594, "step": 10310 }, { "epoch": 0.6, "grad_norm": 0.02008446678519249, "learning_rate": 0.0008032044583768721, "logits/chosen": -12.383871078491211, "logits/rejected": -12.587305068969727, "logps/chosen": -2316.299072265625, "logps/rejected": -2104.09912109375, "loss": 4.5393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -77.28001403808594, "rewards/margins": 19.693622589111328, "rewards/rejected": -96.97364807128906, "step": 10320 }, { "epoch": 0.6, "grad_norm": 1.4723736119215403e-11, "learning_rate": 0.0008030109524362397, "logits/chosen": -11.316909790039062, "logits/rejected": -11.396977424621582, "logps/chosen": -2447.033935546875, "logps/rejected": -2157.18896484375, "loss": 16.6047, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -111.29716491699219, "rewards/margins": -12.159326553344727, "rewards/rejected": -99.13785552978516, "step": 10330 }, { "epoch": 0.6, "grad_norm": 81.16638946533203, "learning_rate": 0.0008028174464956074, "logits/chosen": -10.078409194946289, "logits/rejected": -9.868169784545898, "logps/chosen": -2304.9013671875, "logps/rejected": -2228.4990234375, "loss": 10.384, "rewards/accuracies": 0.5, "rewards/chosen": -116.77591705322266, "rewards/margins": 2.159170627593994, "rewards/rejected": -118.93509674072266, "step": 10340 }, { "epoch": 0.6, "grad_norm": 29.861909866333008, "learning_rate": 0.000802623940554975, "logits/chosen": -9.794785499572754, "logits/rejected": -9.667593002319336, "logps/chosen": -2428.004150390625, "logps/rejected": -2174.01806640625, "loss": 11.4926, "rewards/accuracies": 0.5, "rewards/chosen": -4.960661888122559, "rewards/margins": 3.935663938522339, "rewards/rejected": -8.896327018737793, "step": 10350 }, { "epoch": 0.6, "grad_norm": 39.64450454711914, "learning_rate": 0.0008024304346143427, "logits/chosen": -12.597955703735352, "logits/rejected": -13.198512077331543, "logps/chosen": -1989.3131103515625, "logps/rejected": -1781.032470703125, "loss": 13.8098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -92.93756866455078, "rewards/margins": -8.780976295471191, "rewards/rejected": -84.1565933227539, "step": 10360 }, { "epoch": 0.6, "grad_norm": 58.0825309753418, "learning_rate": 0.0008022369286737103, "logits/chosen": -10.357958793640137, "logits/rejected": -10.335670471191406, "logps/chosen": -2214.537841796875, "logps/rejected": -2474.373291015625, "loss": 7.1781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.6513900756836, "rewards/margins": 8.375565528869629, "rewards/rejected": -111.0269546508789, "step": 10370 }, { "epoch": 0.6, "grad_norm": 126.11138153076172, "learning_rate": 0.0008020434227330779, "logits/chosen": -9.872658729553223, "logits/rejected": -10.246195793151855, "logps/chosen": -2837.211669921875, "logps/rejected": -2167.907470703125, "loss": 30.6213, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -100.84508514404297, "rewards/margins": -24.76749038696289, "rewards/rejected": -76.07759857177734, "step": 10380 }, { "epoch": 0.6, "grad_norm": 9.685565325863033e-18, "learning_rate": 0.0008018499167924455, "logits/chosen": -11.893294334411621, "logits/rejected": -11.659246444702148, "logps/chosen": -2245.75634765625, "logps/rejected": -1930.228515625, "loss": 19.5158, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -108.2598876953125, "rewards/margins": -8.022780418395996, "rewards/rejected": -100.23709869384766, "step": 10390 }, { "epoch": 0.6, "grad_norm": 70.98374938964844, "learning_rate": 0.0008016564108518132, "logits/chosen": -13.205965042114258, "logits/rejected": -13.358053207397461, "logps/chosen": -2226.09326171875, "logps/rejected": -2075.53564453125, "loss": 18.6492, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -122.77980041503906, "rewards/margins": -17.74750518798828, "rewards/rejected": -105.03230285644531, "step": 10400 }, { "epoch": 0.6, "grad_norm": 28.095359802246094, "learning_rate": 0.0008014629049111808, "logits/chosen": -13.154704093933105, "logits/rejected": -13.040496826171875, "logps/chosen": -2091.001953125, "logps/rejected": -2003.102783203125, "loss": 30.8653, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.13710021972656, "rewards/margins": -26.4466609954834, "rewards/rejected": -102.6904296875, "step": 10410 }, { "epoch": 0.6, "grad_norm": 48.8814697265625, "learning_rate": 0.0008012693989705484, "logits/chosen": -10.369891166687012, "logits/rejected": -10.194555282592773, "logps/chosen": -2572.023681640625, "logps/rejected": -1726.7515869140625, "loss": 38.557, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -113.86358642578125, "rewards/margins": -30.83270835876465, "rewards/rejected": -83.03086853027344, "step": 10420 }, { "epoch": 0.6, "grad_norm": 39.77910232543945, "learning_rate": 0.000801075893029916, "logits/chosen": -10.984376907348633, "logits/rejected": -10.921833038330078, "logps/chosen": -2428.40283203125, "logps/rejected": -1835.4146728515625, "loss": 5.6668, "rewards/accuracies": 0.5, "rewards/chosen": -53.41284942626953, "rewards/margins": 9.162126541137695, "rewards/rejected": -62.574974060058594, "step": 10430 }, { "epoch": 0.6, "grad_norm": 62.573631286621094, "learning_rate": 0.0008008823870892836, "logits/chosen": -12.567705154418945, "logits/rejected": -12.612750053405762, "logps/chosen": -1981.867919921875, "logps/rejected": -1894.586181640625, "loss": 9.143, "rewards/accuracies": 0.5, "rewards/chosen": -81.76395416259766, "rewards/margins": -2.4656405448913574, "rewards/rejected": -79.29830932617188, "step": 10440 }, { "epoch": 0.6, "grad_norm": 9.573378128104916e-16, "learning_rate": 0.0008006888811486513, "logits/chosen": -12.42189884185791, "logits/rejected": -12.498483657836914, "logps/chosen": -2104.318603515625, "logps/rejected": -2044.9163818359375, "loss": 29.2079, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.4970932006836, "rewards/margins": -16.703901290893555, "rewards/rejected": -96.7931900024414, "step": 10450 }, { "epoch": 0.61, "grad_norm": 0.49635905027389526, "learning_rate": 0.0008004953752080189, "logits/chosen": -11.099702835083008, "logits/rejected": -11.322182655334473, "logps/chosen": -2430.90087890625, "logps/rejected": -1963.694580078125, "loss": 26.3824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -93.80220031738281, "rewards/margins": -19.376502990722656, "rewards/rejected": -74.42570495605469, "step": 10460 }, { "epoch": 0.61, "grad_norm": 47.56906509399414, "learning_rate": 0.0008003018692673866, "logits/chosen": -11.544645309448242, "logits/rejected": -11.190885543823242, "logps/chosen": -2292.85986328125, "logps/rejected": -2158.81201171875, "loss": 8.1831, "rewards/accuracies": 0.5, "rewards/chosen": -152.74429321289062, "rewards/margins": 1.0031036138534546, "rewards/rejected": -153.74740600585938, "step": 10470 }, { "epoch": 0.61, "grad_norm": 0.002741122618317604, "learning_rate": 0.0008001083633267542, "logits/chosen": -14.364686965942383, "logits/rejected": -14.527743339538574, "logps/chosen": -2126.17333984375, "logps/rejected": -2001.1435546875, "loss": 13.1094, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -149.88424682617188, "rewards/margins": -6.383072853088379, "rewards/rejected": -143.5011749267578, "step": 10480 }, { "epoch": 0.61, "grad_norm": 92.33341979980469, "learning_rate": 0.0007999148573861218, "logits/chosen": -12.234684944152832, "logits/rejected": -12.33569622039795, "logps/chosen": -2155.77197265625, "logps/rejected": -1889.797607421875, "loss": 20.4592, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -150.6809844970703, "rewards/margins": -20.296249389648438, "rewards/rejected": -130.3847198486328, "step": 10490 }, { "epoch": 0.61, "grad_norm": 91.60292053222656, "learning_rate": 0.0007997213514454894, "logits/chosen": -8.679628372192383, "logits/rejected": -8.653093338012695, "logps/chosen": -2571.476806640625, "logps/rejected": -2509.18212890625, "loss": 9.1435, "rewards/accuracies": 0.5, "rewards/chosen": -69.15353393554688, "rewards/margins": 4.119935035705566, "rewards/rejected": -73.27346801757812, "step": 10500 }, { "epoch": 0.61, "grad_norm": 102.74906158447266, "learning_rate": 0.0007995278455048571, "logits/chosen": -12.046102523803711, "logits/rejected": -12.21019172668457, "logps/chosen": -1994.1705322265625, "logps/rejected": -1902.6744384765625, "loss": 8.7444, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -171.1548309326172, "rewards/margins": -5.141657829284668, "rewards/rejected": -166.01318359375, "step": 10510 }, { "epoch": 0.61, "grad_norm": 114.71734619140625, "learning_rate": 0.0007993343395642247, "logits/chosen": -11.958335876464844, "logits/rejected": -11.920541763305664, "logps/chosen": -2143.26904296875, "logps/rejected": -1960.738037109375, "loss": 21.0895, "rewards/accuracies": 0.5, "rewards/chosen": -173.89186096191406, "rewards/margins": -8.673538208007812, "rewards/rejected": -165.21829223632812, "step": 10520 }, { "epoch": 0.61, "grad_norm": 25.989408493041992, "learning_rate": 0.0007991408336235923, "logits/chosen": -12.046186447143555, "logits/rejected": -12.056671142578125, "logps/chosen": -2364.946533203125, "logps/rejected": -2041.03125, "loss": 5.9276, "rewards/accuracies": 0.5, "rewards/chosen": -97.4203872680664, "rewards/margins": -1.5377254486083984, "rewards/rejected": -95.88265228271484, "step": 10530 }, { "epoch": 0.61, "grad_norm": 1.0782798528671265, "learning_rate": 0.0007989473276829598, "logits/chosen": -10.950288772583008, "logits/rejected": -10.917869567871094, "logps/chosen": -2492.273193359375, "logps/rejected": -2653.37548828125, "loss": 3.7561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -107.8947982788086, "rewards/margins": 17.15729522705078, "rewards/rejected": -125.05208587646484, "step": 10540 }, { "epoch": 0.61, "grad_norm": 87.23564147949219, "learning_rate": 0.0007987538217423274, "logits/chosen": -9.947726249694824, "logits/rejected": -9.992485046386719, "logps/chosen": -2845.219482421875, "logps/rejected": -2665.586181640625, "loss": 6.37, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.33319091796875, "rewards/margins": 6.162361145019531, "rewards/rejected": -139.49554443359375, "step": 10550 }, { "epoch": 0.61, "grad_norm": 0.01555819995701313, "learning_rate": 0.000798560315801695, "logits/chosen": -10.250639915466309, "logits/rejected": -9.94650936126709, "logps/chosen": -1969.626220703125, "logps/rejected": -1920.145751953125, "loss": 13.0621, "rewards/accuracies": 0.5, "rewards/chosen": -97.7879867553711, "rewards/margins": -4.824465274810791, "rewards/rejected": -92.9635238647461, "step": 10560 }, { "epoch": 0.61, "grad_norm": 3.426906314274447e-09, "learning_rate": 0.0007983668098610628, "logits/chosen": -12.65910816192627, "logits/rejected": -12.564810752868652, "logps/chosen": -2346.850341796875, "logps/rejected": -2356.06298828125, "loss": 4.4919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.87033081054688, "rewards/margins": 7.047934055328369, "rewards/rejected": -169.91827392578125, "step": 10570 }, { "epoch": 0.61, "grad_norm": 8.343945274669965e-16, "learning_rate": 0.0007981733039204304, "logits/chosen": -14.138201713562012, "logits/rejected": -14.165733337402344, "logps/chosen": -2289.581787109375, "logps/rejected": -2205.71875, "loss": 4.471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.96824645996094, "rewards/margins": 5.344101428985596, "rewards/rejected": -166.31234741210938, "step": 10580 }, { "epoch": 0.61, "grad_norm": 12.94461441040039, "learning_rate": 0.000797979797979798, "logits/chosen": -13.886457443237305, "logits/rejected": -13.782341003417969, "logps/chosen": -2440.731689453125, "logps/rejected": -2083.051513671875, "loss": 30.3407, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -202.59048461914062, "rewards/margins": -25.929073333740234, "rewards/rejected": -176.6614227294922, "step": 10590 }, { "epoch": 0.61, "grad_norm": 0.015185854397714138, "learning_rate": 0.0007977862920391656, "logits/chosen": -12.600724220275879, "logits/rejected": -12.6262845993042, "logps/chosen": -2104.73583984375, "logps/rejected": -2167.193115234375, "loss": 14.6362, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -161.19473266601562, "rewards/margins": -3.976337432861328, "rewards/rejected": -157.21839904785156, "step": 10600 }, { "epoch": 0.61, "grad_norm": 0.10739434510469437, "learning_rate": 0.0007975927860985332, "logits/chosen": -7.4730329513549805, "logits/rejected": -7.498738765716553, "logps/chosen": -3060.09130859375, "logps/rejected": -2638.081787109375, "loss": 17.4802, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -136.4165496826172, "rewards/margins": -16.56290054321289, "rewards/rejected": -119.8536376953125, "step": 10610 }, { "epoch": 0.61, "grad_norm": 0.00467284768819809, "learning_rate": 0.0007973992801579009, "logits/chosen": -9.545419692993164, "logits/rejected": -9.698561668395996, "logps/chosen": -2352.17431640625, "logps/rejected": -2360.01123046875, "loss": 19.678, "rewards/accuracies": 0.5, "rewards/chosen": -96.60550689697266, "rewards/margins": -1.1141388416290283, "rewards/rejected": -95.49136352539062, "step": 10620 }, { "epoch": 0.62, "grad_norm": 139.84242248535156, "learning_rate": 0.0007972057742172685, "logits/chosen": -11.284002304077148, "logits/rejected": -11.152708053588867, "logps/chosen": -2290.340576171875, "logps/rejected": -2144.829345703125, "loss": 18.7694, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.47470092773438, "rewards/margins": -9.684846878051758, "rewards/rejected": -118.78985595703125, "step": 10630 }, { "epoch": 0.62, "grad_norm": 973.7020263671875, "learning_rate": 0.0007970122682766361, "logits/chosen": -10.470682144165039, "logits/rejected": -10.522659301757812, "logps/chosen": -2691.89013671875, "logps/rejected": -2402.92236328125, "loss": 17.3537, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.2594757080078, "rewards/margins": -12.776440620422363, "rewards/rejected": -141.48304748535156, "step": 10640 }, { "epoch": 0.62, "grad_norm": 119.37989807128906, "learning_rate": 0.0007968187623360037, "logits/chosen": -11.399009704589844, "logits/rejected": -11.374500274658203, "logps/chosen": -2263.86474609375, "logps/rejected": -1663.438720703125, "loss": 28.0894, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -127.96085357666016, "rewards/margins": -23.835060119628906, "rewards/rejected": -104.12579345703125, "step": 10650 }, { "epoch": 0.62, "grad_norm": 57.90945053100586, "learning_rate": 0.0007966252563953713, "logits/chosen": -11.720869064331055, "logits/rejected": -11.862150192260742, "logps/chosen": -2221.879150390625, "logps/rejected": -1564.0946044921875, "loss": 31.6122, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -127.2541275024414, "rewards/margins": -19.55833625793457, "rewards/rejected": -107.69578552246094, "step": 10660 }, { "epoch": 0.62, "grad_norm": 9.602599249092236e-08, "learning_rate": 0.0007964317504547389, "logits/chosen": -11.856185913085938, "logits/rejected": -11.943275451660156, "logps/chosen": -2255.11181640625, "logps/rejected": -1905.475341796875, "loss": 25.8345, "rewards/accuracies": 0.5, "rewards/chosen": -126.0628890991211, "rewards/margins": -15.429308891296387, "rewards/rejected": -110.63359069824219, "step": 10670 }, { "epoch": 0.62, "grad_norm": 63.43304443359375, "learning_rate": 0.0007962382445141067, "logits/chosen": -12.7504243850708, "logits/rejected": -12.740872383117676, "logps/chosen": -2508.56640625, "logps/rejected": -2145.46435546875, "loss": 24.4087, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -133.5681915283203, "rewards/margins": -16.944629669189453, "rewards/rejected": -116.6235580444336, "step": 10680 }, { "epoch": 0.62, "grad_norm": 134.612060546875, "learning_rate": 0.0007960447385734743, "logits/chosen": -12.854217529296875, "logits/rejected": -12.7291898727417, "logps/chosen": -2223.45556640625, "logps/rejected": -2092.39404296875, "loss": 17.6175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.66273498535156, "rewards/margins": -13.744587898254395, "rewards/rejected": -139.91815185546875, "step": 10690 }, { "epoch": 0.62, "grad_norm": 66.13236236572266, "learning_rate": 0.0007958512326328419, "logits/chosen": -9.573163986206055, "logits/rejected": -9.506889343261719, "logps/chosen": -2534.99853515625, "logps/rejected": -2055.16552734375, "loss": 22.4926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -118.48686218261719, "rewards/margins": -14.900856018066406, "rewards/rejected": -103.58601379394531, "step": 10700 }, { "epoch": 0.62, "grad_norm": 73.66090393066406, "learning_rate": 0.0007956577266922095, "logits/chosen": -9.51713752746582, "logits/rejected": -9.448319435119629, "logps/chosen": -2726.50732421875, "logps/rejected": -2407.12548828125, "loss": 14.2957, "rewards/accuracies": 0.5, "rewards/chosen": -124.5330581665039, "rewards/margins": -5.9005537033081055, "rewards/rejected": -118.63250732421875, "step": 10710 }, { "epoch": 0.62, "grad_norm": 2.010101079940796, "learning_rate": 0.0007954642207515771, "logits/chosen": -12.988494873046875, "logits/rejected": -13.155029296875, "logps/chosen": -2333.576171875, "logps/rejected": -2254.23046875, "loss": 14.3486, "rewards/accuracies": 0.5, "rewards/chosen": -166.6046142578125, "rewards/margins": -4.256861686706543, "rewards/rejected": -162.34776306152344, "step": 10720 }, { "epoch": 0.62, "grad_norm": 238.69549560546875, "learning_rate": 0.0007952707148109447, "logits/chosen": -13.504606246948242, "logits/rejected": -13.541143417358398, "logps/chosen": -1957.2991943359375, "logps/rejected": -2138.6240234375, "loss": 21.6115, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -82.13652038574219, "rewards/margins": -10.165426254272461, "rewards/rejected": -71.97108459472656, "step": 10730 }, { "epoch": 0.62, "grad_norm": 58.99864196777344, "learning_rate": 0.0007950772088703124, "logits/chosen": -12.738786697387695, "logits/rejected": -12.747932434082031, "logps/chosen": -2092.067626953125, "logps/rejected": -1887.6373291015625, "loss": 17.2148, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.23619079589844, "rewards/margins": -11.378755569458008, "rewards/rejected": -125.85743713378906, "step": 10740 }, { "epoch": 0.62, "grad_norm": 62.761112213134766, "learning_rate": 0.00079488370292968, "logits/chosen": -9.822239875793457, "logits/rejected": -9.782532691955566, "logps/chosen": -2267.45849609375, "logps/rejected": -2061.27197265625, "loss": 5.5729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -52.89630889892578, "rewards/margins": 7.570639133453369, "rewards/rejected": -60.466941833496094, "step": 10750 }, { "epoch": 0.62, "grad_norm": 5.406978173469479e-16, "learning_rate": 0.0007946901969890475, "logits/chosen": -10.261648178100586, "logits/rejected": -10.277280807495117, "logps/chosen": -2310.52734375, "logps/rejected": -2677.594970703125, "loss": 9.615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -83.97233581542969, "rewards/margins": 2.7824673652648926, "rewards/rejected": -86.75481414794922, "step": 10760 }, { "epoch": 0.62, "grad_norm": 59.331722259521484, "learning_rate": 0.0007944966910484151, "logits/chosen": -11.54155158996582, "logits/rejected": -11.495894432067871, "logps/chosen": -2724.01171875, "logps/rejected": -2657.30029296875, "loss": 12.8992, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -161.09854125976562, "rewards/margins": -9.137763977050781, "rewards/rejected": -151.96078491210938, "step": 10770 }, { "epoch": 0.62, "grad_norm": 76.49432373046875, "learning_rate": 0.0007943031851077828, "logits/chosen": -12.628133773803711, "logits/rejected": -12.855707168579102, "logps/chosen": -2414.175048828125, "logps/rejected": -2304.435791015625, "loss": 5.7744, "rewards/accuracies": 0.5, "rewards/chosen": -114.32734680175781, "rewards/margins": 2.78178071975708, "rewards/rejected": -117.109130859375, "step": 10780 }, { "epoch": 0.62, "grad_norm": 9.699882498777346e-15, "learning_rate": 0.0007941096791671505, "logits/chosen": -10.339503288269043, "logits/rejected": -10.305935859680176, "logps/chosen": -2861.3642578125, "logps/rejected": -2701.996826171875, "loss": 32.8568, "rewards/accuracies": 0.5, "rewards/chosen": -71.59477233886719, "rewards/margins": -16.439395904541016, "rewards/rejected": -55.15537643432617, "step": 10790 }, { "epoch": 0.63, "grad_norm": 45.909053802490234, "learning_rate": 0.0007939161732265181, "logits/chosen": -14.191022872924805, "logits/rejected": -14.157562255859375, "logps/chosen": -2083.61083984375, "logps/rejected": -2005.3984375, "loss": 12.1091, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -135.53009033203125, "rewards/margins": 4.607076168060303, "rewards/rejected": -140.1371612548828, "step": 10800 }, { "epoch": 0.63, "grad_norm": 71.09223175048828, "learning_rate": 0.0007937226672858857, "logits/chosen": -9.984423637390137, "logits/rejected": -9.862208366394043, "logps/chosen": -2676.77197265625, "logps/rejected": -2013.8043212890625, "loss": 29.1988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.15030670166016, "rewards/margins": -25.39154624938965, "rewards/rejected": -73.75877380371094, "step": 10810 }, { "epoch": 0.63, "grad_norm": 3.0228943824768066, "learning_rate": 0.0007935291613452533, "logits/chosen": -9.950400352478027, "logits/rejected": -9.735240936279297, "logps/chosen": -2724.381103515625, "logps/rejected": -2332.300537109375, "loss": 18.9904, "rewards/accuracies": 0.5, "rewards/chosen": -108.08735656738281, "rewards/margins": -11.527959823608398, "rewards/rejected": -96.55937957763672, "step": 10820 }, { "epoch": 0.63, "grad_norm": 142.33404541015625, "learning_rate": 0.0007933356554046209, "logits/chosen": -8.926565170288086, "logits/rejected": -9.040987014770508, "logps/chosen": -2599.66259765625, "logps/rejected": -2497.268798828125, "loss": 15.6243, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -114.13700103759766, "rewards/margins": -13.137455940246582, "rewards/rejected": -100.99954223632812, "step": 10830 }, { "epoch": 0.63, "grad_norm": 0.01947464980185032, "learning_rate": 0.0007931421494639885, "logits/chosen": -10.003586769104004, "logits/rejected": -9.940374374389648, "logps/chosen": -2777.508544921875, "logps/rejected": -2439.87939453125, "loss": 15.5677, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -161.32237243652344, "rewards/margins": -2.011962890625, "rewards/rejected": -159.31039428710938, "step": 10840 }, { "epoch": 0.63, "grad_norm": 4.119944267207153e-18, "learning_rate": 0.0007929486435233562, "logits/chosen": -11.244297981262207, "logits/rejected": -11.135394096374512, "logps/chosen": -2723.75341796875, "logps/rejected": -2588.48291015625, "loss": 13.178, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -150.34671020507812, "rewards/margins": -3.3974597454071045, "rewards/rejected": -146.94924926757812, "step": 10850 }, { "epoch": 0.63, "grad_norm": 11.428117752075195, "learning_rate": 0.0007927551375827238, "logits/chosen": -11.633871078491211, "logits/rejected": -11.68630599975586, "logps/chosen": -2475.4755859375, "logps/rejected": -2239.661376953125, "loss": 20.804, "rewards/accuracies": 0.5, "rewards/chosen": -144.27134704589844, "rewards/margins": -12.384902954101562, "rewards/rejected": -131.88644409179688, "step": 10860 }, { "epoch": 0.63, "grad_norm": 7.907293365860824e-06, "learning_rate": 0.0007925616316420914, "logits/chosen": -11.839479446411133, "logits/rejected": -11.596576690673828, "logps/chosen": -2630.368896484375, "logps/rejected": -2653.33837890625, "loss": 8.5295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.24488830566406, "rewards/margins": 6.442358493804932, "rewards/rejected": -121.68724060058594, "step": 10870 }, { "epoch": 0.63, "grad_norm": 0.12348641455173492, "learning_rate": 0.000792368125701459, "logits/chosen": -12.037630081176758, "logits/rejected": -11.828702926635742, "logps/chosen": -2614.841796875, "logps/rejected": -2148.124267578125, "loss": 30.8064, "rewards/accuracies": 0.5, "rewards/chosen": -154.79183959960938, "rewards/margins": -23.2553768157959, "rewards/rejected": -131.5364532470703, "step": 10880 }, { "epoch": 0.63, "grad_norm": 53.27492141723633, "learning_rate": 0.0007921746197608267, "logits/chosen": -11.19366455078125, "logits/rejected": -11.194620132446289, "logps/chosen": -2529.57470703125, "logps/rejected": -2334.041748046875, "loss": 10.1135, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.02218627929688, "rewards/margins": -2.8543457984924316, "rewards/rejected": -133.1678466796875, "step": 10890 }, { "epoch": 0.63, "grad_norm": 109.14934539794922, "learning_rate": 0.0007919811138201944, "logits/chosen": -12.31143856048584, "logits/rejected": -12.409936904907227, "logps/chosen": -1967.486328125, "logps/rejected": -1893.4964599609375, "loss": 17.9424, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -89.13699340820312, "rewards/margins": -2.4656975269317627, "rewards/rejected": -86.67129516601562, "step": 10900 }, { "epoch": 0.63, "grad_norm": 70.49114990234375, "learning_rate": 0.000791787607879562, "logits/chosen": -13.296978950500488, "logits/rejected": -13.17475700378418, "logps/chosen": -2333.10009765625, "logps/rejected": -2259.3974609375, "loss": 9.373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.0530242919922, "rewards/margins": -4.5631422996521, "rewards/rejected": -199.4898681640625, "step": 10910 }, { "epoch": 0.63, "grad_norm": 41.37921905517578, "learning_rate": 0.0007915941019389296, "logits/chosen": -10.462733268737793, "logits/rejected": -10.421148300170898, "logps/chosen": -2166.309326171875, "logps/rejected": -1538.447998046875, "loss": 20.39, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -122.66877746582031, "rewards/margins": -2.7918121814727783, "rewards/rejected": -119.87696838378906, "step": 10920 }, { "epoch": 0.63, "grad_norm": 66.16036987304688, "learning_rate": 0.0007914005959982972, "logits/chosen": -8.829947471618652, "logits/rejected": -8.860213279724121, "logps/chosen": -2137.8681640625, "logps/rejected": -2033.8154296875, "loss": 5.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.02774047851562, "rewards/margins": 10.223063468933105, "rewards/rejected": -159.25082397460938, "step": 10930 }, { "epoch": 0.63, "grad_norm": 1.7416792377477655e-18, "learning_rate": 0.0007912070900576648, "logits/chosen": -6.424986362457275, "logits/rejected": -6.245006084442139, "logps/chosen": -2802.73828125, "logps/rejected": -2187.19482421875, "loss": 23.6131, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -105.8819808959961, "rewards/margins": -13.977907180786133, "rewards/rejected": -91.90406799316406, "step": 10940 }, { "epoch": 0.63, "grad_norm": 2.116082214342896e-05, "learning_rate": 0.0007910135841170324, "logits/chosen": -11.090739250183105, "logits/rejected": -11.053985595703125, "logps/chosen": -2346.817626953125, "logps/rejected": -1692.155029296875, "loss": 46.969, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -159.24765014648438, "rewards/margins": -32.54484558105469, "rewards/rejected": -126.70279693603516, "step": 10950 }, { "epoch": 0.63, "grad_norm": 2.9474629659707874e-13, "learning_rate": 0.0007908200781764, "logits/chosen": -11.739689826965332, "logits/rejected": -11.671862602233887, "logps/chosen": -2318.378662109375, "logps/rejected": -2271.43896484375, "loss": 16.6678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.089298248291016, "rewards/margins": -2.2279582023620605, "rewards/rejected": -23.86134147644043, "step": 10960 }, { "epoch": 0.63, "grad_norm": 59.52195739746094, "learning_rate": 0.0007906265722357676, "logits/chosen": -12.431737899780273, "logits/rejected": -12.449895858764648, "logps/chosen": -2704.45361328125, "logps/rejected": -2399.115966796875, "loss": 31.9607, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -154.1359405517578, "rewards/margins": -27.085186004638672, "rewards/rejected": -127.0507583618164, "step": 10970 }, { "epoch": 0.64, "grad_norm": 48.37668228149414, "learning_rate": 0.0007904330662951352, "logits/chosen": -9.987796783447266, "logits/rejected": -9.92723560333252, "logps/chosen": -2330.2431640625, "logps/rejected": -1844.994873046875, "loss": 10.5326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -83.5976333618164, "rewards/margins": 15.668493270874023, "rewards/rejected": -99.26612854003906, "step": 10980 }, { "epoch": 0.64, "grad_norm": 65.5294418334961, "learning_rate": 0.0007902395603545029, "logits/chosen": -6.88291072845459, "logits/rejected": -6.834323883056641, "logps/chosen": -2579.292236328125, "logps/rejected": -2097.37255859375, "loss": 9.7245, "rewards/accuracies": 0.5, "rewards/chosen": -33.343875885009766, "rewards/margins": 7.585638523101807, "rewards/rejected": -40.92951202392578, "step": 10990 }, { "epoch": 0.64, "grad_norm": 83.79450988769531, "learning_rate": 0.0007900460544138705, "logits/chosen": -8.16398811340332, "logits/rejected": -8.141798973083496, "logps/chosen": -2019.6259765625, "logps/rejected": -2047.930908203125, "loss": 7.7006, "rewards/accuracies": 0.5, "rewards/chosen": -99.17345428466797, "rewards/margins": 1.2246389389038086, "rewards/rejected": -100.39810180664062, "step": 11000 }, { "epoch": 0.64, "grad_norm": 0.04992400109767914, "learning_rate": 0.0007898525484732381, "logits/chosen": -10.11825180053711, "logits/rejected": -10.162053108215332, "logps/chosen": -1749.314208984375, "logps/rejected": -1582.1044921875, "loss": 22.0358, "rewards/accuracies": 0.5, "rewards/chosen": -74.8621826171875, "rewards/margins": -9.931760787963867, "rewards/rejected": -64.93041229248047, "step": 11010 }, { "epoch": 0.64, "grad_norm": 65.19100952148438, "learning_rate": 0.0007896590425326058, "logits/chosen": -10.791997909545898, "logits/rejected": -10.463907241821289, "logps/chosen": -2236.1826171875, "logps/rejected": -1637.1744384765625, "loss": 12.2545, "rewards/accuracies": 0.5, "rewards/chosen": -76.83140563964844, "rewards/margins": 9.309412002563477, "rewards/rejected": -86.14082336425781, "step": 11020 }, { "epoch": 0.64, "grad_norm": 0.0028178479988127947, "learning_rate": 0.0007894655365919734, "logits/chosen": -11.644638061523438, "logits/rejected": -11.685638427734375, "logps/chosen": -2158.7548828125, "logps/rejected": -1901.9622802734375, "loss": 20.7787, "rewards/accuracies": 0.5, "rewards/chosen": -111.26841735839844, "rewards/margins": -6.297377109527588, "rewards/rejected": -104.9710464477539, "step": 11030 }, { "epoch": 0.64, "grad_norm": 8.984395692690494e-12, "learning_rate": 0.000789272030651341, "logits/chosen": -11.38669204711914, "logits/rejected": -11.187992095947266, "logps/chosen": -2496.902099609375, "logps/rejected": -1768.491943359375, "loss": 18.06, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -91.326416015625, "rewards/margins": -5.339208126068115, "rewards/rejected": -85.9872055053711, "step": 11040 }, { "epoch": 0.64, "grad_norm": 0.0009576261509209871, "learning_rate": 0.0007890785247107086, "logits/chosen": -12.377175331115723, "logits/rejected": -12.768465995788574, "logps/chosen": -2289.59765625, "logps/rejected": -2157.161865234375, "loss": 27.1351, "rewards/accuracies": 0.5, "rewards/chosen": -170.92837524414062, "rewards/margins": -19.461145401000977, "rewards/rejected": -151.4672393798828, "step": 11050 }, { "epoch": 0.64, "grad_norm": 179.56068420410156, "learning_rate": 0.0007888850187700762, "logits/chosen": -9.178522109985352, "logits/rejected": -9.30396842956543, "logps/chosen": -2147.604736328125, "logps/rejected": -1649.2388916015625, "loss": 15.7974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -125.09205627441406, "rewards/margins": -2.8090407848358154, "rewards/rejected": -122.28300476074219, "step": 11060 }, { "epoch": 0.64, "grad_norm": 103.46265411376953, "learning_rate": 0.0007886915128294438, "logits/chosen": -9.579426765441895, "logits/rejected": -9.592744827270508, "logps/chosen": -2342.74365234375, "logps/rejected": -2084.90087890625, "loss": 21.0129, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -96.24494934082031, "rewards/margins": -3.7219772338867188, "rewards/rejected": -92.5229721069336, "step": 11070 }, { "epoch": 0.64, "grad_norm": 0.004963921383023262, "learning_rate": 0.0007884980068888115, "logits/chosen": -10.496163368225098, "logits/rejected": -10.317971229553223, "logps/chosen": -2461.662109375, "logps/rejected": -2171.58447265625, "loss": 32.8768, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -131.9690704345703, "rewards/margins": -22.309917449951172, "rewards/rejected": -109.65914154052734, "step": 11080 }, { "epoch": 0.64, "grad_norm": 464.6502685546875, "learning_rate": 0.0007883045009481791, "logits/chosen": -11.782760620117188, "logits/rejected": -11.773866653442383, "logps/chosen": -1892.599365234375, "logps/rejected": -1842.3363037109375, "loss": 21.2519, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -135.9146728515625, "rewards/margins": -16.57529640197754, "rewards/rejected": -119.3393783569336, "step": 11090 }, { "epoch": 0.64, "grad_norm": 66.856689453125, "learning_rate": 0.0007881109950075468, "logits/chosen": -11.956808090209961, "logits/rejected": -11.733660697937012, "logps/chosen": -1998.939453125, "logps/rejected": -1875.0751953125, "loss": 16.3114, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -126.89408874511719, "rewards/margins": -10.081868171691895, "rewards/rejected": -116.8122329711914, "step": 11100 }, { "epoch": 0.64, "grad_norm": 105.88270568847656, "learning_rate": 0.0007879174890669144, "logits/chosen": -10.267005920410156, "logits/rejected": -9.898686408996582, "logps/chosen": -2469.78515625, "logps/rejected": -1732.8424072265625, "loss": 18.8092, "rewards/accuracies": 0.5, "rewards/chosen": -59.76709747314453, "rewards/margins": 0.7880832552909851, "rewards/rejected": -60.55517578125, "step": 11110 }, { "epoch": 0.64, "grad_norm": 73.62271118164062, "learning_rate": 0.000787723983126282, "logits/chosen": -10.308664321899414, "logits/rejected": -10.121755599975586, "logps/chosen": -2547.189697265625, "logps/rejected": -2077.208251953125, "loss": 15.2576, "rewards/accuracies": 0.5, "rewards/chosen": -98.16958618164062, "rewards/margins": 4.335218906402588, "rewards/rejected": -102.50480651855469, "step": 11120 }, { "epoch": 0.64, "grad_norm": 49.70130920410156, "learning_rate": 0.0007875304771856497, "logits/chosen": -12.113349914550781, "logits/rejected": -11.738763809204102, "logps/chosen": -2050.761474609375, "logps/rejected": -1914.581787109375, "loss": 21.9786, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -143.67684936523438, "rewards/margins": -16.26826286315918, "rewards/rejected": -127.4085922241211, "step": 11130 }, { "epoch": 0.64, "grad_norm": 110.67045593261719, "learning_rate": 0.0007873369712450173, "logits/chosen": -10.784849166870117, "logits/rejected": -10.778677940368652, "logps/chosen": -2199.485107421875, "logps/rejected": -1885.0169677734375, "loss": 30.9179, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.31297302246094, "rewards/margins": -21.425888061523438, "rewards/rejected": -95.8870849609375, "step": 11140 }, { "epoch": 0.65, "grad_norm": 72.78254699707031, "learning_rate": 0.0007871434653043849, "logits/chosen": -11.292553901672363, "logits/rejected": -11.081062316894531, "logps/chosen": -2430.77880859375, "logps/rejected": -2156.66796875, "loss": 22.7742, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -157.9486083984375, "rewards/margins": -16.98550796508789, "rewards/rejected": -140.96310424804688, "step": 11150 }, { "epoch": 0.65, "grad_norm": 1.3434360027313232, "learning_rate": 0.0007869499593637525, "logits/chosen": -13.471735954284668, "logits/rejected": -13.438270568847656, "logps/chosen": -2109.695068359375, "logps/rejected": -2054.51171875, "loss": 7.9324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -144.86471557617188, "rewards/margins": -5.4581146240234375, "rewards/rejected": -139.4066162109375, "step": 11160 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 0.0007867564534231201, "logits/chosen": -12.369389533996582, "logits/rejected": -12.217212677001953, "logps/chosen": -2069.77587890625, "logps/rejected": -1839.3385009765625, "loss": 21.5938, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -141.43716430664062, "rewards/margins": -9.435003280639648, "rewards/rejected": -132.00216674804688, "step": 11170 }, { "epoch": 0.65, "grad_norm": 4.0014696696130203e-16, "learning_rate": 0.0007865629474824877, "logits/chosen": -9.932882308959961, "logits/rejected": -10.079790115356445, "logps/chosen": -2477.07666015625, "logps/rejected": -2234.83203125, "loss": 20.3688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -105.4830093383789, "rewards/margins": -12.659141540527344, "rewards/rejected": -92.82386779785156, "step": 11180 }, { "epoch": 0.65, "grad_norm": 89.12382507324219, "learning_rate": 0.0007863694415418552, "logits/chosen": -12.878942489624023, "logits/rejected": -12.931800842285156, "logps/chosen": -2130.3427734375, "logps/rejected": -2080.131103515625, "loss": 27.816, "rewards/accuracies": 0.5, "rewards/chosen": -177.92098999023438, "rewards/margins": 4.405543327331543, "rewards/rejected": -182.32655334472656, "step": 11190 }, { "epoch": 0.65, "grad_norm": 2.383563756942749, "learning_rate": 0.000786175935601223, "logits/chosen": -8.63416576385498, "logits/rejected": -8.595245361328125, "logps/chosen": -2798.18212890625, "logps/rejected": -2429.31640625, "loss": 10.5119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -60.68745040893555, "rewards/margins": -0.42141932249069214, "rewards/rejected": -60.26603317260742, "step": 11200 }, { "epoch": 0.65, "grad_norm": 76.70443725585938, "learning_rate": 0.0007859824296605906, "logits/chosen": -11.500944137573242, "logits/rejected": -11.698984146118164, "logps/chosen": -2581.89501953125, "logps/rejected": -2303.962646484375, "loss": 12.9106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.44447326660156, "rewards/margins": -5.551095962524414, "rewards/rejected": -143.89337158203125, "step": 11210 }, { "epoch": 0.65, "grad_norm": 48.67179870605469, "learning_rate": 0.0007857889237199582, "logits/chosen": -11.288705825805664, "logits/rejected": -11.4315824508667, "logps/chosen": -2641.37939453125, "logps/rejected": -2525.27734375, "loss": 13.5949, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -103.4339828491211, "rewards/margins": -1.6042331457138062, "rewards/rejected": -101.82975769042969, "step": 11220 }, { "epoch": 0.65, "grad_norm": 16.18996238708496, "learning_rate": 0.0007855954177793258, "logits/chosen": -11.549413681030273, "logits/rejected": -11.335824966430664, "logps/chosen": -2619.19091796875, "logps/rejected": -2488.709228515625, "loss": 23.3412, "rewards/accuracies": 0.5, "rewards/chosen": -135.66433715820312, "rewards/margins": -18.471664428710938, "rewards/rejected": -117.19266510009766, "step": 11230 }, { "epoch": 0.65, "grad_norm": 33.54993438720703, "learning_rate": 0.0007854019118386934, "logits/chosen": -9.33427619934082, "logits/rejected": -9.353006362915039, "logps/chosen": -2515.541015625, "logps/rejected": -2490.989990234375, "loss": 12.8676, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -75.86090850830078, "rewards/margins": -2.114835023880005, "rewards/rejected": -73.74607849121094, "step": 11240 }, { "epoch": 0.65, "grad_norm": 12.237160682678223, "learning_rate": 0.0007852084058980611, "logits/chosen": -11.213188171386719, "logits/rejected": -11.31629753112793, "logps/chosen": -2255.165771484375, "logps/rejected": -2244.6220703125, "loss": 18.209, "rewards/accuracies": 0.5, "rewards/chosen": -148.10435485839844, "rewards/margins": 4.366209983825684, "rewards/rejected": -152.4705810546875, "step": 11250 }, { "epoch": 0.65, "grad_norm": 3162.02294921875, "learning_rate": 0.0007850148999574287, "logits/chosen": -11.925457000732422, "logits/rejected": -11.932879447937012, "logps/chosen": -2410.764892578125, "logps/rejected": -2362.518798828125, "loss": 14.4586, "rewards/accuracies": 0.5, "rewards/chosen": -139.94955444335938, "rewards/margins": 4.915223121643066, "rewards/rejected": -144.86477661132812, "step": 11260 }, { "epoch": 0.65, "grad_norm": 115.18543243408203, "learning_rate": 0.0007848213940167963, "logits/chosen": -12.409706115722656, "logits/rejected": -12.414652824401855, "logps/chosen": -2220.884765625, "logps/rejected": -2171.0322265625, "loss": 23.2031, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -88.4093017578125, "rewards/margins": -16.9780216217041, "rewards/rejected": -71.43128967285156, "step": 11270 }, { "epoch": 0.65, "grad_norm": 0.5939205884933472, "learning_rate": 0.0007846278880761639, "logits/chosen": -11.641190528869629, "logits/rejected": -11.607399940490723, "logps/chosen": -2185.839111328125, "logps/rejected": -1544.605712890625, "loss": 14.8401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -122.90858459472656, "rewards/margins": 3.038579225540161, "rewards/rejected": -125.94718170166016, "step": 11280 }, { "epoch": 0.65, "grad_norm": 159.3882293701172, "learning_rate": 0.0007844343821355315, "logits/chosen": -11.826231002807617, "logits/rejected": -11.492509841918945, "logps/chosen": -2132.479248046875, "logps/rejected": -1843.2291259765625, "loss": 37.7321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -84.66653442382812, "rewards/margins": -24.80128288269043, "rewards/rejected": -59.8652458190918, "step": 11290 }, { "epoch": 0.65, "grad_norm": 72.18359375, "learning_rate": 0.0007842408761948991, "logits/chosen": -13.496482849121094, "logits/rejected": -13.536924362182617, "logps/chosen": -2170.81298828125, "logps/rejected": -1881.4840087890625, "loss": 23.3576, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.2124481201172, "rewards/margins": -17.276803970336914, "rewards/rejected": -160.93565368652344, "step": 11300 }, { "epoch": 0.65, "grad_norm": 4.9107866287231445, "learning_rate": 0.0007840473702542669, "logits/chosen": -10.76008415222168, "logits/rejected": -11.2245512008667, "logps/chosen": -2444.57763671875, "logps/rejected": -2395.447265625, "loss": 12.328, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -81.59264373779297, "rewards/margins": -5.672621726989746, "rewards/rejected": -75.9200210571289, "step": 11310 }, { "epoch": 0.66, "grad_norm": 994.9684448242188, "learning_rate": 0.0007838538643136345, "logits/chosen": -12.740494728088379, "logits/rejected": -12.693170547485352, "logps/chosen": -2288.820556640625, "logps/rejected": -2116.608154296875, "loss": 13.4017, "rewards/accuracies": 0.5, "rewards/chosen": -117.66158294677734, "rewards/margins": 5.248988151550293, "rewards/rejected": -122.91056823730469, "step": 11320 }, { "epoch": 0.66, "grad_norm": 1.443599200682405e-13, "learning_rate": 0.0007836603583730021, "logits/chosen": -13.868746757507324, "logits/rejected": -14.044950485229492, "logps/chosen": -2288.1064453125, "logps/rejected": -2103.32666015625, "loss": 20.32, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -80.65902709960938, "rewards/margins": -8.350770950317383, "rewards/rejected": -72.3082504272461, "step": 11330 }, { "epoch": 0.66, "grad_norm": 153.77084350585938, "learning_rate": 0.0007834668524323697, "logits/chosen": -12.918432235717773, "logits/rejected": -13.021949768066406, "logps/chosen": -2337.97802734375, "logps/rejected": -2104.06787109375, "loss": 10.5004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -173.0931854248047, "rewards/margins": 3.2529969215393066, "rewards/rejected": -176.34616088867188, "step": 11340 }, { "epoch": 0.66, "grad_norm": 0.05317741259932518, "learning_rate": 0.0007832733464917373, "logits/chosen": -10.503227233886719, "logits/rejected": -10.341835021972656, "logps/chosen": -2420.62890625, "logps/rejected": -2165.40087890625, "loss": 2.2836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -43.093502044677734, "rewards/margins": 14.772003173828125, "rewards/rejected": -57.865509033203125, "step": 11350 }, { "epoch": 0.66, "grad_norm": 93.60961151123047, "learning_rate": 0.000783079840551105, "logits/chosen": -15.445759773254395, "logits/rejected": -15.004297256469727, "logps/chosen": -2355.271240234375, "logps/rejected": -2036.3990478515625, "loss": 31.126, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -130.86868286132812, "rewards/margins": -29.435791015625, "rewards/rejected": -101.43289947509766, "step": 11360 }, { "epoch": 0.66, "grad_norm": 3.3742205963475296e-17, "learning_rate": 0.0007828863346104726, "logits/chosen": -16.37249755859375, "logits/rejected": -16.642187118530273, "logps/chosen": -2157.336669921875, "logps/rejected": -2201.592041015625, "loss": 19.4716, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.15359497070312, "rewards/margins": -8.435194969177246, "rewards/rejected": -155.7183837890625, "step": 11370 }, { "epoch": 0.66, "grad_norm": 64.35269165039062, "learning_rate": 0.0007826928286698402, "logits/chosen": -17.86587905883789, "logits/rejected": -18.99755859375, "logps/chosen": -2472.18994140625, "logps/rejected": -2406.4609375, "loss": 27.1226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.9864273071289, "rewards/margins": -7.774473667144775, "rewards/rejected": -112.21195983886719, "step": 11380 }, { "epoch": 0.66, "grad_norm": 5.464587360393125e-08, "learning_rate": 0.0007824993227292078, "logits/chosen": -15.759710311889648, "logits/rejected": -16.369417190551758, "logps/chosen": -2756.219482421875, "logps/rejected": -2401.390380859375, "loss": 16.7206, "rewards/accuracies": 0.5, "rewards/chosen": -165.7053680419922, "rewards/margins": -5.130258083343506, "rewards/rejected": -160.57513427734375, "step": 11390 }, { "epoch": 0.66, "grad_norm": 97.7955551147461, "learning_rate": 0.0007823058167885754, "logits/chosen": -13.551173210144043, "logits/rejected": -13.471590042114258, "logps/chosen": -2293.824951171875, "logps/rejected": -2208.033447265625, "loss": 26.9231, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -170.73568725585938, "rewards/margins": -22.921218872070312, "rewards/rejected": -147.81443786621094, "step": 11400 }, { "epoch": 0.66, "grad_norm": 0.0018775133648887277, "learning_rate": 0.0007821123108479432, "logits/chosen": -14.331838607788086, "logits/rejected": -14.052412033081055, "logps/chosen": -2286.727294921875, "logps/rejected": -2197.696044921875, "loss": 16.4511, "rewards/accuracies": 0.5, "rewards/chosen": -127.33058166503906, "rewards/margins": -9.369282722473145, "rewards/rejected": -117.9613037109375, "step": 11410 }, { "epoch": 0.66, "grad_norm": 52.14789962768555, "learning_rate": 0.0007819188049073107, "logits/chosen": -12.957417488098145, "logits/rejected": -13.074361801147461, "logps/chosen": -2381.901123046875, "logps/rejected": -2277.13134765625, "loss": 7.476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.1018295288086, "rewards/margins": 4.2054243087768555, "rewards/rejected": -113.3072509765625, "step": 11420 }, { "epoch": 0.66, "grad_norm": 113.59964752197266, "learning_rate": 0.0007817252989666783, "logits/chosen": -13.896388053894043, "logits/rejected": -14.073979377746582, "logps/chosen": -2361.69677734375, "logps/rejected": -2361.89306640625, "loss": 10.3741, "rewards/accuracies": 0.5, "rewards/chosen": -154.4905548095703, "rewards/margins": -4.743355751037598, "rewards/rejected": -149.7471923828125, "step": 11430 }, { "epoch": 0.66, "grad_norm": 1.2926891684703737e-19, "learning_rate": 0.0007815317930260459, "logits/chosen": -9.197977066040039, "logits/rejected": -9.104055404663086, "logps/chosen": -2657.113037109375, "logps/rejected": -2599.40283203125, "loss": 7.4851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.152135848999023, "rewards/margins": 16.619226455688477, "rewards/rejected": -32.7713623046875, "step": 11440 }, { "epoch": 0.66, "grad_norm": 84.59940338134766, "learning_rate": 0.0007813382870854135, "logits/chosen": -11.942037582397461, "logits/rejected": -11.981197357177734, "logps/chosen": -2494.52001953125, "logps/rejected": -2463.39697265625, "loss": 6.5709, "rewards/accuracies": 0.5, "rewards/chosen": -80.27537536621094, "rewards/margins": 6.001765251159668, "rewards/rejected": -86.27715301513672, "step": 11450 }, { "epoch": 0.66, "grad_norm": 97.55162811279297, "learning_rate": 0.0007811447811447811, "logits/chosen": -12.3815336227417, "logits/rejected": -12.348369598388672, "logps/chosen": -2578.336669921875, "logps/rejected": -2499.97802734375, "loss": 17.5157, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.39497375488281, "rewards/margins": -9.57048511505127, "rewards/rejected": -96.8244857788086, "step": 11460 }, { "epoch": 0.66, "grad_norm": 1.967869684449397e-05, "learning_rate": 0.0007809512752041487, "logits/chosen": -12.663164138793945, "logits/rejected": -12.716025352478027, "logps/chosen": -2338.888671875, "logps/rejected": -1878.1585693359375, "loss": 10.4701, "rewards/accuracies": 0.5, "rewards/chosen": -132.33547973632812, "rewards/margins": 14.610173225402832, "rewards/rejected": -146.94564819335938, "step": 11470 }, { "epoch": 0.66, "grad_norm": 69.64265441894531, "learning_rate": 0.0007807577692635164, "logits/chosen": -13.321565628051758, "logits/rejected": -13.881990432739258, "logps/chosen": -2428.841064453125, "logps/rejected": -1954.234375, "loss": 40.622, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -165.42300415039062, "rewards/margins": -34.07966995239258, "rewards/rejected": -131.34335327148438, "step": 11480 }, { "epoch": 0.67, "grad_norm": 95.95516204833984, "learning_rate": 0.000780564263322884, "logits/chosen": -11.81951904296875, "logits/rejected": -11.674778938293457, "logps/chosen": -2350.596923828125, "logps/rejected": -2215.114013671875, "loss": 7.7275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -99.18494415283203, "rewards/margins": 11.095965385437012, "rewards/rejected": -110.2809066772461, "step": 11490 }, { "epoch": 0.67, "grad_norm": 1.4716687246618676e-06, "learning_rate": 0.0007803707573822516, "logits/chosen": -14.13843059539795, "logits/rejected": -14.215336799621582, "logps/chosen": -2804.75048828125, "logps/rejected": -2428.8984375, "loss": 4.266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.0382843017578, "rewards/margins": 8.909355163574219, "rewards/rejected": -142.9476318359375, "step": 11500 }, { "epoch": 0.67, "grad_norm": 22.119131088256836, "learning_rate": 0.0007801772514416193, "logits/chosen": -13.98303508758545, "logits/rejected": -13.938326835632324, "logps/chosen": -2805.753173828125, "logps/rejected": -2802.70751953125, "loss": 15.384, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -147.89422607421875, "rewards/margins": -8.621057510375977, "rewards/rejected": -139.27316284179688, "step": 11510 }, { "epoch": 0.67, "grad_norm": 43.264068603515625, "learning_rate": 0.0007799837455009869, "logits/chosen": -15.5145902633667, "logits/rejected": -15.658650398254395, "logps/chosen": -2125.34228515625, "logps/rejected": -1860.3642578125, "loss": 17.0769, "rewards/accuracies": 0.5, "rewards/chosen": -149.9554443359375, "rewards/margins": -7.255765438079834, "rewards/rejected": -142.69967651367188, "step": 11520 }, { "epoch": 0.67, "grad_norm": 0.0002165173355024308, "learning_rate": 0.0007797902395603546, "logits/chosen": -12.282548904418945, "logits/rejected": -12.267457008361816, "logps/chosen": -2712.04150390625, "logps/rejected": -2212.43310546875, "loss": 13.3203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -81.32926177978516, "rewards/margins": 7.321459770202637, "rewards/rejected": -88.65071105957031, "step": 11530 }, { "epoch": 0.67, "grad_norm": 68.44863891601562, "learning_rate": 0.0007795967336197222, "logits/chosen": -11.107137680053711, "logits/rejected": -11.05681324005127, "logps/chosen": -2779.89404296875, "logps/rejected": -2537.113037109375, "loss": 7.4603, "rewards/accuracies": 0.5, "rewards/chosen": -37.714454650878906, "rewards/margins": -0.3076358735561371, "rewards/rejected": -37.40681838989258, "step": 11540 }, { "epoch": 0.67, "grad_norm": 90.97479248046875, "learning_rate": 0.0007794032276790898, "logits/chosen": -16.381677627563477, "logits/rejected": -16.435983657836914, "logps/chosen": -2248.72998046875, "logps/rejected": -2113.566162109375, "loss": 14.9448, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -164.3554229736328, "rewards/margins": -7.247048854827881, "rewards/rejected": -157.10836791992188, "step": 11550 }, { "epoch": 0.67, "grad_norm": 74.05748748779297, "learning_rate": 0.0007792097217384574, "logits/chosen": -14.965551376342773, "logits/rejected": -15.395756721496582, "logps/chosen": -2214.11669921875, "logps/rejected": -2202.56689453125, "loss": 5.8196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.25340270996094, "rewards/margins": 2.9757583141326904, "rewards/rejected": -155.22915649414062, "step": 11560 }, { "epoch": 0.67, "grad_norm": 7.0275977352425405e-15, "learning_rate": 0.000779016215797825, "logits/chosen": -13.09142780303955, "logits/rejected": -12.945157051086426, "logps/chosen": -2375.97314453125, "logps/rejected": -2140.130859375, "loss": 8.1208, "rewards/accuracies": 0.5, "rewards/chosen": -98.06729125976562, "rewards/margins": 5.817930698394775, "rewards/rejected": -103.88521575927734, "step": 11570 }, { "epoch": 0.67, "grad_norm": 97.19711303710938, "learning_rate": 0.0007788227098571926, "logits/chosen": -14.836830139160156, "logits/rejected": -15.06151008605957, "logps/chosen": -1954.2349853515625, "logps/rejected": -1729.7115478515625, "loss": 4.7407, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -134.92822265625, "rewards/margins": 11.858372688293457, "rewards/rejected": -146.78659057617188, "step": 11580 }, { "epoch": 0.67, "grad_norm": 67.27342987060547, "learning_rate": 0.0007786292039165603, "logits/chosen": -10.141678810119629, "logits/rejected": -9.980104446411133, "logps/chosen": -2605.40673828125, "logps/rejected": -2257.246826171875, "loss": 24.1298, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -38.76155471801758, "rewards/margins": -13.812250137329102, "rewards/rejected": -24.949298858642578, "step": 11590 }, { "epoch": 0.67, "grad_norm": 52.87922286987305, "learning_rate": 0.0007784356979759279, "logits/chosen": -9.170857429504395, "logits/rejected": -8.953627586364746, "logps/chosen": -2413.04052734375, "logps/rejected": -2147.617431640625, "loss": 15.121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -97.86447143554688, "rewards/margins": 4.141026496887207, "rewards/rejected": -102.00550842285156, "step": 11600 }, { "epoch": 0.67, "grad_norm": 3.2691318402333636e-11, "learning_rate": 0.0007782421920352955, "logits/chosen": -9.683409690856934, "logits/rejected": -9.530526161193848, "logps/chosen": -1885.2222900390625, "logps/rejected": -1750.347412109375, "loss": 20.2476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.24098205566406, "rewards/margins": 2.8550314903259277, "rewards/rejected": -121.09600830078125, "step": 11610 }, { "epoch": 0.67, "grad_norm": 1.473938852913914e-11, "learning_rate": 0.0007780486860946632, "logits/chosen": -10.522153854370117, "logits/rejected": -10.392573356628418, "logps/chosen": -2520.597900390625, "logps/rejected": -2458.615966796875, "loss": 14.1853, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -110.35284423828125, "rewards/margins": -6.305046081542969, "rewards/rejected": -104.04780578613281, "step": 11620 }, { "epoch": 0.67, "grad_norm": 0.07290655374526978, "learning_rate": 0.0007778551801540307, "logits/chosen": -13.222822189331055, "logits/rejected": -13.359628677368164, "logps/chosen": -2451.097412109375, "logps/rejected": -2188.801025390625, "loss": 2.7499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.63426971435547, "rewards/margins": 16.084989547729492, "rewards/rejected": -114.7192611694336, "step": 11630 }, { "epoch": 0.67, "grad_norm": 1.5553770065307617, "learning_rate": 0.0007776616742133983, "logits/chosen": -11.165060043334961, "logits/rejected": -11.046072006225586, "logps/chosen": -2456.14697265625, "logps/rejected": -2331.97216796875, "loss": 1.6951, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -69.23480987548828, "rewards/margins": 18.766469955444336, "rewards/rejected": -88.00127410888672, "step": 11640 }, { "epoch": 0.67, "grad_norm": 50.890499114990234, "learning_rate": 0.000777468168272766, "logits/chosen": -13.655654907226562, "logits/rejected": -13.406614303588867, "logps/chosen": -2095.6845703125, "logps/rejected": -1700.5562744140625, "loss": 28.1003, "rewards/accuracies": 0.5, "rewards/chosen": -155.9793243408203, "rewards/margins": -18.392078399658203, "rewards/rejected": -137.58724975585938, "step": 11650 }, { "epoch": 0.67, "grad_norm": 34.46733093261719, "learning_rate": 0.0007772746623321336, "logits/chosen": -13.994918823242188, "logits/rejected": -13.781506538391113, "logps/chosen": -1998.5048828125, "logps/rejected": -1879.1865234375, "loss": 16.5008, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.25762939453125, "rewards/margins": -5.5759758949279785, "rewards/rejected": -122.68165588378906, "step": 11660 }, { "epoch": 0.68, "grad_norm": 904.0211181640625, "learning_rate": 0.0007770811563915012, "logits/chosen": -14.158900260925293, "logits/rejected": -13.915867805480957, "logps/chosen": -2577.94775390625, "logps/rejected": -2239.62060546875, "loss": 14.1223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -181.35194396972656, "rewards/margins": -6.084205627441406, "rewards/rejected": -175.26773071289062, "step": 11670 }, { "epoch": 0.68, "grad_norm": 3.950308084487915, "learning_rate": 0.0007768876504508688, "logits/chosen": -15.892590522766113, "logits/rejected": -16.276294708251953, "logps/chosen": -1894.900634765625, "logps/rejected": -2049.48828125, "loss": 4.5529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -123.1832275390625, "rewards/margins": 23.398357391357422, "rewards/rejected": -146.58157348632812, "step": 11680 }, { "epoch": 0.68, "grad_norm": 55.05390930175781, "learning_rate": 0.0007766941445102364, "logits/chosen": -11.816423416137695, "logits/rejected": -11.741178512573242, "logps/chosen": -1943.578125, "logps/rejected": -1554.458251953125, "loss": 18.5306, "rewards/accuracies": 0.5, "rewards/chosen": -93.11775207519531, "rewards/margins": -10.996294975280762, "rewards/rejected": -82.12145233154297, "step": 11690 }, { "epoch": 0.68, "grad_norm": 2.158797932557961e-16, "learning_rate": 0.000776500638569604, "logits/chosen": -9.5835542678833, "logits/rejected": -9.47449779510498, "logps/chosen": -2856.95361328125, "logps/rejected": -2697.880126953125, "loss": 4.6022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -100.98478698730469, "rewards/margins": 17.8679256439209, "rewards/rejected": -118.85270690917969, "step": 11700 }, { "epoch": 0.68, "grad_norm": 1.683957140397041e-11, "learning_rate": 0.0007763071326289717, "logits/chosen": -12.448288917541504, "logits/rejected": -12.180303573608398, "logps/chosen": -2507.583984375, "logps/rejected": -2287.35888671875, "loss": 13.4728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -99.05714416503906, "rewards/margins": 1.9341789484024048, "rewards/rejected": -100.99131774902344, "step": 11710 }, { "epoch": 0.68, "grad_norm": 65.30632781982422, "learning_rate": 0.0007761136266883394, "logits/chosen": -11.804254531860352, "logits/rejected": -11.608135223388672, "logps/chosen": -2484.993408203125, "logps/rejected": -2474.52392578125, "loss": 5.1854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -105.69974517822266, "rewards/margins": 28.543987274169922, "rewards/rejected": -134.24374389648438, "step": 11720 }, { "epoch": 0.68, "grad_norm": 44.50529861450195, "learning_rate": 0.000775920120747707, "logits/chosen": -11.834230422973633, "logits/rejected": -11.24193000793457, "logps/chosen": -2580.23193359375, "logps/rejected": -2436.71337890625, "loss": 7.1258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.90348052978516, "rewards/margins": 18.116382598876953, "rewards/rejected": -128.01986694335938, "step": 11730 }, { "epoch": 0.68, "grad_norm": 1.1336616277694702, "learning_rate": 0.0007757266148070746, "logits/chosen": -13.4053955078125, "logits/rejected": -13.692212104797363, "logps/chosen": -2220.56298828125, "logps/rejected": -2463.88720703125, "loss": 4.9724, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.2338104248047, "rewards/margins": 19.799835205078125, "rewards/rejected": -204.03366088867188, "step": 11740 }, { "epoch": 0.68, "grad_norm": 40.176361083984375, "learning_rate": 0.0007755331088664422, "logits/chosen": -9.48358154296875, "logits/rejected": -9.244400978088379, "logps/chosen": -2997.48779296875, "logps/rejected": -2719.7509765625, "loss": 5.9666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -70.13885498046875, "rewards/margins": 3.3751747608184814, "rewards/rejected": -73.5140380859375, "step": 11750 }, { "epoch": 0.68, "grad_norm": 4.427189196576364e-06, "learning_rate": 0.0007753396029258099, "logits/chosen": -12.247812271118164, "logits/rejected": -12.143692016601562, "logps/chosen": -1754.772705078125, "logps/rejected": -1697.289794921875, "loss": 20.2782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.15975189208984, "rewards/margins": 18.46657371520996, "rewards/rejected": -139.62631225585938, "step": 11760 }, { "epoch": 0.68, "grad_norm": 1.031499330395036e-08, "learning_rate": 0.0007751460969851775, "logits/chosen": -11.451128959655762, "logits/rejected": -11.388829231262207, "logps/chosen": -2380.35693359375, "logps/rejected": -2160.349853515625, "loss": 9.759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.01417541503906, "rewards/margins": -1.0964664220809937, "rewards/rejected": -158.91769409179688, "step": 11770 }, { "epoch": 0.68, "grad_norm": 3.433854103088379, "learning_rate": 0.0007749525910445451, "logits/chosen": -12.346430778503418, "logits/rejected": -12.455699920654297, "logps/chosen": -2103.62744140625, "logps/rejected": -2064.91455078125, "loss": 19.1441, "rewards/accuracies": 0.5, "rewards/chosen": -114.81742858886719, "rewards/margins": 2.0203781127929688, "rewards/rejected": -116.8377914428711, "step": 11780 }, { "epoch": 0.68, "grad_norm": 0.00025609013391658664, "learning_rate": 0.0007747590851039127, "logits/chosen": -10.807019233703613, "logits/rejected": -10.914587020874023, "logps/chosen": -2523.885009765625, "logps/rejected": -2187.39306640625, "loss": 23.0142, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -64.48519897460938, "rewards/margins": -13.139312744140625, "rewards/rejected": -51.34588623046875, "step": 11790 }, { "epoch": 0.68, "grad_norm": 0.0006981877377256751, "learning_rate": 0.0007745655791632803, "logits/chosen": -10.917263984680176, "logits/rejected": -10.946675300598145, "logps/chosen": -2344.02978515625, "logps/rejected": -2116.14501953125, "loss": 13.998, "rewards/accuracies": 0.5, "rewards/chosen": -119.98478698730469, "rewards/margins": -8.081205368041992, "rewards/rejected": -111.9035873413086, "step": 11800 }, { "epoch": 0.68, "grad_norm": 142.67408752441406, "learning_rate": 0.000774372073222648, "logits/chosen": -10.815011024475098, "logits/rejected": -10.784801483154297, "logps/chosen": -2622.212158203125, "logps/rejected": -2423.6171875, "loss": 27.9383, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.27247619628906, "rewards/margins": -12.35999584197998, "rewards/rejected": -104.91249084472656, "step": 11810 }, { "epoch": 0.68, "grad_norm": 0.023978829383850098, "learning_rate": 0.0007741785672820156, "logits/chosen": -11.719093322753906, "logits/rejected": -11.923178672790527, "logps/chosen": -2068.4833984375, "logps/rejected": -2047.9935302734375, "loss": 10.3609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -97.72103881835938, "rewards/margins": -7.469079494476318, "rewards/rejected": -90.251953125, "step": 11820 }, { "epoch": 0.68, "grad_norm": 0.002403888152912259, "learning_rate": 0.0007739850613413833, "logits/chosen": -13.659894943237305, "logits/rejected": -13.888799667358398, "logps/chosen": -2489.670166015625, "logps/rejected": -2394.69677734375, "loss": 21.4115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.2463836669922, "rewards/margins": -9.505952835083008, "rewards/rejected": -141.74044799804688, "step": 11830 }, { "epoch": 0.69, "grad_norm": 19.296558380126953, "learning_rate": 0.0007737915554007509, "logits/chosen": -14.807649612426758, "logits/rejected": -15.445831298828125, "logps/chosen": -2344.03271484375, "logps/rejected": -2083.145263671875, "loss": 20.2035, "rewards/accuracies": 0.5, "rewards/chosen": -123.56889343261719, "rewards/margins": -10.427874565124512, "rewards/rejected": -113.1410140991211, "step": 11840 }, { "epoch": 0.69, "grad_norm": 1.252944969998282e-11, "learning_rate": 0.0007735980494601184, "logits/chosen": -20.09015655517578, "logits/rejected": -20.187780380249023, "logps/chosen": -2450.51806640625, "logps/rejected": -2380.82568359375, "loss": 4.7597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.4594268798828, "rewards/margins": 12.419217109680176, "rewards/rejected": -144.87863159179688, "step": 11850 }, { "epoch": 0.69, "grad_norm": 60.13103103637695, "learning_rate": 0.000773404543519486, "logits/chosen": -21.191837310791016, "logits/rejected": -22.32037925720215, "logps/chosen": -2742.399658203125, "logps/rejected": -2538.04833984375, "loss": 15.9407, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -200.42005920410156, "rewards/margins": -8.461688041687012, "rewards/rejected": -191.9583740234375, "step": 11860 }, { "epoch": 0.69, "grad_norm": 0.00018338944937568158, "learning_rate": 0.0007732110375788537, "logits/chosen": -18.603151321411133, "logits/rejected": -18.885488510131836, "logps/chosen": -2089.4111328125, "logps/rejected": -2122.9384765625, "loss": 20.0546, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -171.48837280273438, "rewards/margins": -18.062210083007812, "rewards/rejected": -153.4261932373047, "step": 11870 }, { "epoch": 0.69, "grad_norm": 5.6329358955053976e-08, "learning_rate": 0.0007730175316382213, "logits/chosen": -15.462626457214355, "logits/rejected": -15.446986198425293, "logps/chosen": -2515.20263671875, "logps/rejected": -2323.031005859375, "loss": 18.7086, "rewards/accuracies": 0.5, "rewards/chosen": -167.2799835205078, "rewards/margins": -8.264955520629883, "rewards/rejected": -159.01502990722656, "step": 11880 }, { "epoch": 0.69, "grad_norm": 100.35936737060547, "learning_rate": 0.0007728240256975889, "logits/chosen": -12.892595291137695, "logits/rejected": -12.827921867370605, "logps/chosen": -2694.994873046875, "logps/rejected": -2776.185302734375, "loss": 7.9615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -84.52790832519531, "rewards/margins": 13.6854248046875, "rewards/rejected": -98.21333312988281, "step": 11890 }, { "epoch": 0.69, "grad_norm": 453.5437316894531, "learning_rate": 0.0007726305197569565, "logits/chosen": -15.632708549499512, "logits/rejected": -15.636932373046875, "logps/chosen": -2434.673095703125, "logps/rejected": -2344.678955078125, "loss": 8.7428, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.58587646484375, "rewards/margins": -1.0858383178710938, "rewards/rejected": -112.50003814697266, "step": 11900 }, { "epoch": 0.69, "grad_norm": 10.121369361877441, "learning_rate": 0.0007724370138163241, "logits/chosen": -16.83792495727539, "logits/rejected": -16.36052703857422, "logps/chosen": -2532.2568359375, "logps/rejected": -2577.50537109375, "loss": 1.9733, "rewards/accuracies": 0.5, "rewards/chosen": -66.89315032958984, "rewards/margins": 13.41871452331543, "rewards/rejected": -80.3118667602539, "step": 11910 }, { "epoch": 0.69, "grad_norm": 79.09244537353516, "learning_rate": 0.0007722435078756917, "logits/chosen": -16.75870132446289, "logits/rejected": -16.53815460205078, "logps/chosen": -2800.015625, "logps/rejected": -2774.455078125, "loss": 29.7226, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -169.471435546875, "rewards/margins": -11.61294174194336, "rewards/rejected": -157.85850524902344, "step": 11920 }, { "epoch": 0.69, "grad_norm": 67.74852752685547, "learning_rate": 0.0007720500019350595, "logits/chosen": -19.19378089904785, "logits/rejected": -19.81354331970215, "logps/chosen": -2988.09912109375, "logps/rejected": -2704.453857421875, "loss": 13.5693, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -183.40225219726562, "rewards/margins": -3.598689317703247, "rewards/rejected": -179.80355834960938, "step": 11930 }, { "epoch": 0.69, "grad_norm": 45.07322311401367, "learning_rate": 0.0007718564959944271, "logits/chosen": -13.149374008178711, "logits/rejected": -13.264167785644531, "logps/chosen": -3291.86279296875, "logps/rejected": -3142.24267578125, "loss": 11.8333, "rewards/accuracies": 0.5, "rewards/chosen": -84.66849517822266, "rewards/margins": 0.5840805172920227, "rewards/rejected": -85.25257873535156, "step": 11940 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 0.0007716629900537947, "logits/chosen": -21.170928955078125, "logits/rejected": -22.204830169677734, "logps/chosen": -2809.296875, "logps/rejected": -2776.268310546875, "loss": 11.4709, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -198.14004516601562, "rewards/margins": 4.238908290863037, "rewards/rejected": -202.37896728515625, "step": 11950 }, { "epoch": 0.69, "grad_norm": 175.25355529785156, "learning_rate": 0.0007714694841131623, "logits/chosen": -18.874692916870117, "logits/rejected": -20.239295959472656, "logps/chosen": -2795.66064453125, "logps/rejected": -2655.189453125, "loss": 16.5555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.73922729492188, "rewards/margins": -3.0090103149414062, "rewards/rejected": -164.73020935058594, "step": 11960 }, { "epoch": 0.69, "grad_norm": 0.015652762725949287, "learning_rate": 0.0007712759781725299, "logits/chosen": -15.489164352416992, "logits/rejected": -15.258962631225586, "logps/chosen": -2621.591796875, "logps/rejected": -2293.300048828125, "loss": 10.243, "rewards/accuracies": 0.5, "rewards/chosen": -129.58059692382812, "rewards/margins": 5.50009822845459, "rewards/rejected": -135.08070373535156, "step": 11970 }, { "epoch": 0.69, "grad_norm": 1.467750888276692e-13, "learning_rate": 0.0007710824722318975, "logits/chosen": -17.57381248474121, "logits/rejected": -17.3376407623291, "logps/chosen": -2396.7294921875, "logps/rejected": -2412.931396484375, "loss": 13.7034, "rewards/accuracies": 0.5, "rewards/chosen": -142.00741577148438, "rewards/margins": 2.811234951019287, "rewards/rejected": -144.81866455078125, "step": 11980 }, { "epoch": 0.69, "grad_norm": 2.796792364837511e-21, "learning_rate": 0.0007708889662912652, "logits/chosen": -17.784000396728516, "logits/rejected": -17.755508422851562, "logps/chosen": -2332.845947265625, "logps/rejected": -2314.322021484375, "loss": 2.7869, "rewards/accuracies": 0.5, "rewards/chosen": -134.4120330810547, "rewards/margins": 19.78708839416504, "rewards/rejected": -154.19912719726562, "step": 11990 }, { "epoch": 0.69, "grad_norm": 132.74169921875, "learning_rate": 0.0007706954603506328, "logits/chosen": -17.446245193481445, "logits/rejected": -17.425371170043945, "logps/chosen": -2534.83837890625, "logps/rejected": -2122.135009765625, "loss": 10.2572, "rewards/accuracies": 0.5, "rewards/chosen": -150.92214965820312, "rewards/margins": 2.51488995552063, "rewards/rejected": -153.4370574951172, "step": 12000 }, { "epoch": 0.7, "grad_norm": 204.91842651367188, "learning_rate": 0.0007705019544100004, "logits/chosen": -17.25234031677246, "logits/rejected": -17.42220687866211, "logps/chosen": -2690.32568359375, "logps/rejected": -2783.903076171875, "loss": 18.6756, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -116.0504379272461, "rewards/margins": -13.248705863952637, "rewards/rejected": -102.80171966552734, "step": 12010 }, { "epoch": 0.7, "grad_norm": 116.31256866455078, "learning_rate": 0.000770308448469368, "logits/chosen": -12.414462089538574, "logits/rejected": -12.18144416809082, "logps/chosen": -2527.61279296875, "logps/rejected": -2107.154296875, "loss": 8.0434, "rewards/accuracies": 0.5, "rewards/chosen": -138.97238159179688, "rewards/margins": 1.7436416149139404, "rewards/rejected": -140.71603393554688, "step": 12020 }, { "epoch": 0.7, "grad_norm": 7.544584274291992, "learning_rate": 0.0007701149425287356, "logits/chosen": -17.44808578491211, "logits/rejected": -17.303377151489258, "logps/chosen": -2298.9326171875, "logps/rejected": -2187.59326171875, "loss": 12.6265, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.0299835205078, "rewards/margins": -6.861051082611084, "rewards/rejected": -165.16893005371094, "step": 12030 }, { "epoch": 0.7, "grad_norm": 147.8988037109375, "learning_rate": 0.0007699214365881034, "logits/chosen": -15.958963394165039, "logits/rejected": -15.196220397949219, "logps/chosen": -2644.343994140625, "logps/rejected": -2452.718505859375, "loss": 10.7933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.28363037109375, "rewards/margins": 2.9321258068084717, "rewards/rejected": -118.21574401855469, "step": 12040 }, { "epoch": 0.7, "grad_norm": 182.1521453857422, "learning_rate": 0.000769727930647471, "logits/chosen": -13.354243278503418, "logits/rejected": -13.241111755371094, "logps/chosen": -2656.378662109375, "logps/rejected": -2483.15087890625, "loss": 3.1576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -49.818359375, "rewards/margins": 22.56158447265625, "rewards/rejected": -72.37994384765625, "step": 12050 }, { "epoch": 0.7, "grad_norm": 24.849971771240234, "learning_rate": 0.0007695344247068386, "logits/chosen": -19.651432037353516, "logits/rejected": -19.865480422973633, "logps/chosen": -2409.102783203125, "logps/rejected": -2342.234130859375, "loss": 2.2687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.96096801757812, "rewards/margins": 15.051460266113281, "rewards/rejected": -186.01242065429688, "step": 12060 }, { "epoch": 0.7, "grad_norm": 154.7920379638672, "learning_rate": 0.0007693409187662061, "logits/chosen": -14.519145011901855, "logits/rejected": -14.461385726928711, "logps/chosen": -2735.9501953125, "logps/rejected": -2701.23828125, "loss": 9.9926, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -92.22553253173828, "rewards/margins": -3.621720790863037, "rewards/rejected": -88.60381317138672, "step": 12070 }, { "epoch": 0.7, "grad_norm": 134.73912048339844, "learning_rate": 0.0007691474128255737, "logits/chosen": -17.20237159729004, "logits/rejected": -17.358898162841797, "logps/chosen": -3133.950439453125, "logps/rejected": -2893.509033203125, "loss": 22.61, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -205.3536834716797, "rewards/margins": -10.81641960144043, "rewards/rejected": -194.5372772216797, "step": 12080 }, { "epoch": 0.7, "grad_norm": 109.19534301757812, "learning_rate": 0.0007689539068849413, "logits/chosen": -12.04612922668457, "logits/rejected": -11.831181526184082, "logps/chosen": -2950.263671875, "logps/rejected": -2445.5224609375, "loss": 9.859, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -75.53945922851562, "rewards/margins": -4.965743064880371, "rewards/rejected": -70.57372283935547, "step": 12090 }, { "epoch": 0.7, "grad_norm": 67.68376922607422, "learning_rate": 0.000768760400944309, "logits/chosen": -14.152643203735352, "logits/rejected": -13.838525772094727, "logps/chosen": -2290.808349609375, "logps/rejected": -2347.917724609375, "loss": 3.725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -159.3685760498047, "rewards/margins": 23.916248321533203, "rewards/rejected": -183.28482055664062, "step": 12100 }, { "epoch": 0.7, "grad_norm": 118.98746490478516, "learning_rate": 0.0007685668950036766, "logits/chosen": -13.285781860351562, "logits/rejected": -13.158940315246582, "logps/chosen": -2446.78564453125, "logps/rejected": -2296.181640625, "loss": 13.2045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.66062927246094, "rewards/margins": -2.343061923980713, "rewards/rejected": -200.31756591796875, "step": 12110 }, { "epoch": 0.7, "grad_norm": 102.17890930175781, "learning_rate": 0.0007683733890630442, "logits/chosen": -11.31608772277832, "logits/rejected": -11.301546096801758, "logps/chosen": -2277.81787109375, "logps/rejected": -2591.7490234375, "loss": 8.5296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -136.74472045898438, "rewards/margins": 5.877696514129639, "rewards/rejected": -142.62240600585938, "step": 12120 }, { "epoch": 0.7, "grad_norm": 67.55280303955078, "learning_rate": 0.0007681798831224118, "logits/chosen": -12.10985279083252, "logits/rejected": -12.19272518157959, "logps/chosen": -2460.88037109375, "logps/rejected": -1914.6787109375, "loss": 34.3771, "rewards/accuracies": 0.5, "rewards/chosen": -154.1935272216797, "rewards/margins": -25.622478485107422, "rewards/rejected": -128.571044921875, "step": 12130 }, { "epoch": 0.7, "grad_norm": 164.544677734375, "learning_rate": 0.0007679863771817795, "logits/chosen": -12.764161109924316, "logits/rejected": -12.906747817993164, "logps/chosen": -2397.50927734375, "logps/rejected": -2352.179443359375, "loss": 9.3212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -104.9313735961914, "rewards/margins": 1.926089882850647, "rewards/rejected": -106.85746765136719, "step": 12140 }, { "epoch": 0.7, "grad_norm": 98.47219848632812, "learning_rate": 0.0007677928712411471, "logits/chosen": -15.38288402557373, "logits/rejected": -15.051838874816895, "logps/chosen": -2238.348388671875, "logps/rejected": -2104.531982421875, "loss": 8.6781, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -136.51312255859375, "rewards/margins": -6.6939287185668945, "rewards/rejected": -129.81918334960938, "step": 12150 }, { "epoch": 0.7, "grad_norm": 99.22476959228516, "learning_rate": 0.0007675993653005148, "logits/chosen": -12.539979934692383, "logits/rejected": -12.785414695739746, "logps/chosen": -2429.85107421875, "logps/rejected": -1894.175537109375, "loss": 42.8946, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -142.7002410888672, "rewards/margins": -36.71912384033203, "rewards/rejected": -105.98111724853516, "step": 12160 }, { "epoch": 0.7, "grad_norm": 83.26119232177734, "learning_rate": 0.0007674058593598824, "logits/chosen": -11.279745101928711, "logits/rejected": -11.020284652709961, "logps/chosen": -2630.990478515625, "logps/rejected": -2289.424560546875, "loss": 31.8225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -123.9076919555664, "rewards/margins": -16.662382125854492, "rewards/rejected": -107.24531555175781, "step": 12170 }, { "epoch": 0.71, "grad_norm": 79.55905151367188, "learning_rate": 0.00076721235341925, "logits/chosen": -11.637983322143555, "logits/rejected": -11.834595680236816, "logps/chosen": -2139.096435546875, "logps/rejected": -2131.7744140625, "loss": 20.1584, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -184.85877990722656, "rewards/margins": -3.678187608718872, "rewards/rejected": -181.18060302734375, "step": 12180 }, { "epoch": 0.71, "grad_norm": 47.533023834228516, "learning_rate": 0.0007670188474786176, "logits/chosen": -10.055475234985352, "logits/rejected": -9.887397766113281, "logps/chosen": -2399.81103515625, "logps/rejected": -1897.3011474609375, "loss": 24.8562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.94635009765625, "rewards/margins": -12.543184280395508, "rewards/rejected": -113.4031753540039, "step": 12190 }, { "epoch": 0.71, "grad_norm": 2.8130104468004902e-08, "learning_rate": 0.0007668253415379852, "logits/chosen": -10.502551078796387, "logits/rejected": -10.450728416442871, "logps/chosen": -1969.036376953125, "logps/rejected": -2078.93115234375, "loss": 35.9302, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.12643432617188, "rewards/margins": -29.87029457092285, "rewards/rejected": -69.25613403320312, "step": 12200 }, { "epoch": 0.71, "grad_norm": 6.356816584229819e-07, "learning_rate": 0.0007666318355973529, "logits/chosen": -10.647619247436523, "logits/rejected": -10.61534309387207, "logps/chosen": -2054.144287109375, "logps/rejected": -2139.91357421875, "loss": 2.4967, "rewards/accuracies": 0.5, "rewards/chosen": -67.70043182373047, "rewards/margins": 3.5564308166503906, "rewards/rejected": -71.25686645507812, "step": 12210 }, { "epoch": 0.71, "grad_norm": 51.49312973022461, "learning_rate": 0.0007664383296567205, "logits/chosen": -11.049944877624512, "logits/rejected": -11.02486515045166, "logps/chosen": -1733.253173828125, "logps/rejected": -1600.147216796875, "loss": 9.6732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -104.06890869140625, "rewards/margins": -2.5421385765075684, "rewards/rejected": -101.52677154541016, "step": 12220 }, { "epoch": 0.71, "grad_norm": 42.27461624145508, "learning_rate": 0.0007662448237160881, "logits/chosen": -12.343162536621094, "logits/rejected": -12.246036529541016, "logps/chosen": -2044.0950927734375, "logps/rejected": -1695.6429443359375, "loss": 15.142, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -98.62760925292969, "rewards/margins": -6.223785400390625, "rewards/rejected": -92.40382385253906, "step": 12230 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 0.0007660513177754557, "logits/chosen": -10.662484169006348, "logits/rejected": -10.990682601928711, "logps/chosen": -2249.060546875, "logps/rejected": -1975.071044921875, "loss": 7.5403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -55.19646453857422, "rewards/margins": 5.054214000701904, "rewards/rejected": -60.25067901611328, "step": 12240 }, { "epoch": 0.71, "grad_norm": 131.7966766357422, "learning_rate": 0.0007658578118348234, "logits/chosen": -11.637654304504395, "logits/rejected": -12.156807899475098, "logps/chosen": -2287.95556640625, "logps/rejected": -1960.157958984375, "loss": 25.5382, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -127.9017562866211, "rewards/margins": -15.066357612609863, "rewards/rejected": -112.83538818359375, "step": 12250 }, { "epoch": 0.71, "grad_norm": 0.6304950714111328, "learning_rate": 0.000765664305894191, "logits/chosen": -11.716168403625488, "logits/rejected": -12.813885688781738, "logps/chosen": -2402.02099609375, "logps/rejected": -2072.64013671875, "loss": 23.0174, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -63.27666091918945, "rewards/margins": -15.895291328430176, "rewards/rejected": -47.381370544433594, "step": 12260 }, { "epoch": 0.71, "grad_norm": 0.06559697538614273, "learning_rate": 0.0007654707999535587, "logits/chosen": -13.899038314819336, "logits/rejected": -13.55760383605957, "logps/chosen": -2263.003173828125, "logps/rejected": -2100.61474609375, "loss": 9.98, "rewards/accuracies": 0.5, "rewards/chosen": -119.26104736328125, "rewards/margins": 8.012084007263184, "rewards/rejected": -127.27311706542969, "step": 12270 }, { "epoch": 0.71, "grad_norm": 108.72277069091797, "learning_rate": 0.0007652772940129263, "logits/chosen": -13.579813003540039, "logits/rejected": -13.720077514648438, "logps/chosen": -2834.30908203125, "logps/rejected": -2786.542236328125, "loss": 10.0122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -198.56057739257812, "rewards/margins": 3.064072370529175, "rewards/rejected": -201.62464904785156, "step": 12280 }, { "epoch": 0.71, "grad_norm": 164.87539672851562, "learning_rate": 0.0007650837880722938, "logits/chosen": -15.536094665527344, "logits/rejected": -14.232256889343262, "logps/chosen": -3005.79345703125, "logps/rejected": -2825.225341796875, "loss": 14.9897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.93758392333984, "rewards/margins": 14.852991104125977, "rewards/rejected": -136.79058837890625, "step": 12290 }, { "epoch": 0.71, "grad_norm": 0.03445049375295639, "learning_rate": 0.0007648902821316614, "logits/chosen": -18.07889175415039, "logits/rejected": -17.978504180908203, "logps/chosen": -2567.343994140625, "logps/rejected": -2608.478515625, "loss": 9.9757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -225.24417114257812, "rewards/margins": -3.0148537158966064, "rewards/rejected": -222.22933959960938, "step": 12300 }, { "epoch": 0.71, "grad_norm": 6.063422590590932e-20, "learning_rate": 0.000764696776191029, "logits/chosen": -12.740825653076172, "logits/rejected": -12.99591064453125, "logps/chosen": -2400.881591796875, "logps/rejected": -2299.05322265625, "loss": 22.7155, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -180.4145965576172, "rewards/margins": -17.481027603149414, "rewards/rejected": -162.93356323242188, "step": 12310 }, { "epoch": 0.71, "grad_norm": 55.0321159362793, "learning_rate": 0.0007645032702503966, "logits/chosen": -9.922799110412598, "logits/rejected": -10.129514694213867, "logps/chosen": -2311.690673828125, "logps/rejected": -2042.081298828125, "loss": 7.6602, "rewards/accuracies": 0.5, "rewards/chosen": -109.5455551147461, "rewards/margins": 3.669579267501831, "rewards/rejected": -113.21513366699219, "step": 12320 }, { "epoch": 0.71, "grad_norm": 2.12684446165574e-15, "learning_rate": 0.0007643097643097643, "logits/chosen": -8.850937843322754, "logits/rejected": -8.974766731262207, "logps/chosen": -2327.00732421875, "logps/rejected": -2076.164306640625, "loss": 4.8025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -63.556480407714844, "rewards/margins": 14.000661849975586, "rewards/rejected": -77.55714416503906, "step": 12330 }, { "epoch": 0.71, "grad_norm": 73.91313934326172, "learning_rate": 0.0007641162583691319, "logits/chosen": -11.935515403747559, "logits/rejected": -11.910552024841309, "logps/chosen": -1973.2867431640625, "logps/rejected": -1913.228759765625, "loss": 9.5434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -125.22967529296875, "rewards/margins": 5.02468204498291, "rewards/rejected": -130.25436401367188, "step": 12340 }, { "epoch": 0.71, "grad_norm": 7.019774913787842, "learning_rate": 0.0007639227524284996, "logits/chosen": -11.297032356262207, "logits/rejected": -11.281778335571289, "logps/chosen": -2208.619873046875, "logps/rejected": -2142.4169921875, "loss": 10.6432, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -70.01576232910156, "rewards/margins": -2.4212729930877686, "rewards/rejected": -67.59449768066406, "step": 12350 }, { "epoch": 0.72, "grad_norm": 82.74015045166016, "learning_rate": 0.0007637292464878672, "logits/chosen": -12.808499336242676, "logits/rejected": -13.013944625854492, "logps/chosen": -2103.573486328125, "logps/rejected": -1788.219970703125, "loss": 20.5808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -131.37313842773438, "rewards/margins": -14.125129699707031, "rewards/rejected": -117.24800109863281, "step": 12360 }, { "epoch": 0.72, "grad_norm": 0.12835434079170227, "learning_rate": 0.0007635357405472348, "logits/chosen": -14.208964347839355, "logits/rejected": -13.720850944519043, "logps/chosen": -2389.83837890625, "logps/rejected": -2430.970703125, "loss": 5.0341, "rewards/accuracies": 0.5, "rewards/chosen": -127.3681640625, "rewards/margins": 11.306230545043945, "rewards/rejected": -138.6743927001953, "step": 12370 }, { "epoch": 0.72, "grad_norm": 94.94402313232422, "learning_rate": 0.0007633422346066025, "logits/chosen": -16.4637451171875, "logits/rejected": -19.224695205688477, "logps/chosen": -2780.61474609375, "logps/rejected": -2195.26171875, "loss": 59.1261, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -203.91639709472656, "rewards/margins": -57.400245666503906, "rewards/rejected": -146.51614379882812, "step": 12380 }, { "epoch": 0.72, "grad_norm": 14.784769058227539, "learning_rate": 0.0007631487286659701, "logits/chosen": -13.800877571105957, "logits/rejected": -14.091360092163086, "logps/chosen": -2563.97265625, "logps/rejected": -2146.3486328125, "loss": 19.331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -73.90782165527344, "rewards/margins": 5.874997615814209, "rewards/rejected": -79.78282165527344, "step": 12390 }, { "epoch": 0.72, "grad_norm": 179.80311584472656, "learning_rate": 0.0007629552227253377, "logits/chosen": -16.36454963684082, "logits/rejected": -17.812511444091797, "logps/chosen": -2590.07275390625, "logps/rejected": -2274.989013671875, "loss": 22.4831, "rewards/accuracies": 0.5, "rewards/chosen": -142.7572784423828, "rewards/margins": -16.815637588500977, "rewards/rejected": -125.94163513183594, "step": 12400 }, { "epoch": 0.72, "grad_norm": 100.81964111328125, "learning_rate": 0.0007627617167847053, "logits/chosen": -17.87542724609375, "logits/rejected": -17.993906021118164, "logps/chosen": -2551.765625, "logps/rejected": -2405.60888671875, "loss": 19.9805, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.44635009765625, "rewards/margins": -16.24950408935547, "rewards/rejected": -100.19683837890625, "step": 12410 }, { "epoch": 0.72, "grad_norm": 34.99311065673828, "learning_rate": 0.0007625682108440729, "logits/chosen": -12.497343063354492, "logits/rejected": -12.982231140136719, "logps/chosen": -2531.11474609375, "logps/rejected": -2119.732421875, "loss": 4.6721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -51.70594024658203, "rewards/margins": 14.46293830871582, "rewards/rejected": -66.16886901855469, "step": 12420 }, { "epoch": 0.72, "grad_norm": 62.79753494262695, "learning_rate": 0.0007623747049034405, "logits/chosen": -11.062826156616211, "logits/rejected": -11.04469108581543, "logps/chosen": -2681.902587890625, "logps/rejected": -2277.268798828125, "loss": 12.1535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -58.43196487426758, "rewards/margins": 7.218491554260254, "rewards/rejected": -65.65045166015625, "step": 12430 }, { "epoch": 0.72, "grad_norm": 3.5244994163513184, "learning_rate": 0.0007621811989628082, "logits/chosen": -14.363866806030273, "logits/rejected": -14.261167526245117, "logps/chosen": -2592.50390625, "logps/rejected": -2587.8916015625, "loss": 6.0617, "rewards/accuracies": 0.5, "rewards/chosen": -150.5924530029297, "rewards/margins": 2.5181350708007812, "rewards/rejected": -153.11058044433594, "step": 12440 }, { "epoch": 0.72, "grad_norm": 0.34519436955451965, "learning_rate": 0.0007619876930221758, "logits/chosen": -13.939002990722656, "logits/rejected": -14.861848831176758, "logps/chosen": -2208.05908203125, "logps/rejected": -1942.728515625, "loss": 14.2457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.18782043457031, "rewards/margins": -7.879303932189941, "rewards/rejected": -109.30850982666016, "step": 12450 }, { "epoch": 0.72, "grad_norm": 299.2992248535156, "learning_rate": 0.0007617941870815435, "logits/chosen": -11.432666778564453, "logits/rejected": -12.572992324829102, "logps/chosen": -2691.483154296875, "logps/rejected": -2262.702392578125, "loss": 18.8393, "rewards/accuracies": 0.5, "rewards/chosen": -156.17892456054688, "rewards/margins": 3.8929367065429688, "rewards/rejected": -160.07186889648438, "step": 12460 }, { "epoch": 0.72, "grad_norm": 68.07228088378906, "learning_rate": 0.0007616006811409111, "logits/chosen": -10.733222961425781, "logits/rejected": -10.827786445617676, "logps/chosen": -2466.68212890625, "logps/rejected": -1895.7998046875, "loss": 46.1194, "rewards/accuracies": 0.5, "rewards/chosen": -178.13995361328125, "rewards/margins": -34.48529815673828, "rewards/rejected": -143.65464782714844, "step": 12470 }, { "epoch": 0.72, "grad_norm": 108.81864929199219, "learning_rate": 0.0007614071752002787, "logits/chosen": -10.504616737365723, "logits/rejected": -10.625432014465332, "logps/chosen": -2388.94580078125, "logps/rejected": -1995.561279296875, "loss": 12.5478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -59.73930740356445, "rewards/margins": 7.362910270690918, "rewards/rejected": -67.10222625732422, "step": 12480 }, { "epoch": 0.72, "grad_norm": 72.81068420410156, "learning_rate": 0.0007612136692596464, "logits/chosen": -10.795915603637695, "logits/rejected": -11.045507431030273, "logps/chosen": -2366.823974609375, "logps/rejected": -2306.187744140625, "loss": 27.5269, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -59.345848083496094, "rewards/margins": -23.553417205810547, "rewards/rejected": -35.79243850708008, "step": 12490 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 0.000761020163319014, "logits/chosen": -10.751829147338867, "logits/rejected": -11.072148323059082, "logps/chosen": -2668.107177734375, "logps/rejected": -2344.12255859375, "loss": 2.7738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.644224643707275, "rewards/margins": 24.746768951416016, "rewards/rejected": -32.39099884033203, "step": 12500 }, { "epoch": 0.72, "grad_norm": 0.009136472828686237, "learning_rate": 0.0007608266573783815, "logits/chosen": -12.78676986694336, "logits/rejected": -13.537531852722168, "logps/chosen": -2698.355712890625, "logps/rejected": -2669.26513671875, "loss": 11.2201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.239501953125, "rewards/margins": 3.3253350257873535, "rewards/rejected": -156.56484985351562, "step": 12510 }, { "epoch": 0.72, "grad_norm": 93.40953826904297, "learning_rate": 0.0007606331514377491, "logits/chosen": -11.994913101196289, "logits/rejected": -13.176801681518555, "logps/chosen": -2716.8583984375, "logps/rejected": -2681.32568359375, "loss": 4.3747, "rewards/accuracies": 0.5, "rewards/chosen": -127.4564208984375, "rewards/margins": 2.5155282020568848, "rewards/rejected": -129.97195434570312, "step": 12520 }, { "epoch": 0.73, "grad_norm": 14.252189636230469, "learning_rate": 0.0007604396454971167, "logits/chosen": -18.224489212036133, "logits/rejected": -17.901559829711914, "logps/chosen": -2374.647705078125, "logps/rejected": -2020.0794677734375, "loss": 44.841, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -210.48666381835938, "rewards/margins": -30.760873794555664, "rewards/rejected": -179.72579956054688, "step": 12530 }, { "epoch": 0.73, "grad_norm": 60.162906646728516, "learning_rate": 0.0007602461395564843, "logits/chosen": -13.952569961547852, "logits/rejected": -15.31938362121582, "logps/chosen": -2917.41650390625, "logps/rejected": -2588.67333984375, "loss": 14.4925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -138.42819213867188, "rewards/margins": -6.6927642822265625, "rewards/rejected": -131.73544311523438, "step": 12540 }, { "epoch": 0.73, "grad_norm": 5.294295262913871e-16, "learning_rate": 0.0007600526336158519, "logits/chosen": -12.981500625610352, "logits/rejected": -13.449915885925293, "logps/chosen": -2110.299072265625, "logps/rejected": -2083.845947265625, "loss": 9.2969, "rewards/accuracies": 0.5, "rewards/chosen": -83.23924255371094, "rewards/margins": 12.190048217773438, "rewards/rejected": -95.4292984008789, "step": 12550 }, { "epoch": 0.73, "grad_norm": 0.005672111175954342, "learning_rate": 0.0007598591276752197, "logits/chosen": -12.738899230957031, "logits/rejected": -13.207679748535156, "logps/chosen": -2554.208740234375, "logps/rejected": -2268.5712890625, "loss": 12.6649, "rewards/accuracies": 0.5, "rewards/chosen": -104.88724517822266, "rewards/margins": 9.375428199768066, "rewards/rejected": -114.2626724243164, "step": 12560 }, { "epoch": 0.73, "grad_norm": 3.117432117462158, "learning_rate": 0.0007596656217345873, "logits/chosen": -12.004164695739746, "logits/rejected": -12.00599479675293, "logps/chosen": -2573.50732421875, "logps/rejected": -2521.646240234375, "loss": 9.1904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.76522827148438, "rewards/margins": -2.1990866661071777, "rewards/rejected": -139.56614685058594, "step": 12570 }, { "epoch": 0.73, "grad_norm": 77.7320785522461, "learning_rate": 0.0007594721157939549, "logits/chosen": -10.9173002243042, "logits/rejected": -11.382848739624023, "logps/chosen": -2430.295654296875, "logps/rejected": -2253.47265625, "loss": 4.8359, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -91.87913513183594, "rewards/margins": 20.87957763671875, "rewards/rejected": -112.75870513916016, "step": 12580 }, { "epoch": 0.73, "grad_norm": 168.40048217773438, "learning_rate": 0.0007592786098533225, "logits/chosen": -12.587095260620117, "logits/rejected": -12.881543159484863, "logps/chosen": -2237.359375, "logps/rejected": -2128.95068359375, "loss": 29.8533, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.09185791015625, "rewards/margins": -21.336233139038086, "rewards/rejected": -106.75563049316406, "step": 12590 }, { "epoch": 0.73, "grad_norm": 2.1219955421398438e-17, "learning_rate": 0.0007590851039126901, "logits/chosen": -9.997949600219727, "logits/rejected": -10.17793083190918, "logps/chosen": -2492.61474609375, "logps/rejected": -2021.258544921875, "loss": 19.2802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.28919982910156, "rewards/margins": -3.5770981311798096, "rewards/rejected": -89.71211242675781, "step": 12600 }, { "epoch": 0.73, "grad_norm": 256.33905029296875, "learning_rate": 0.0007588915979720578, "logits/chosen": -10.971315383911133, "logits/rejected": -11.386430740356445, "logps/chosen": -2347.90380859375, "logps/rejected": -2135.4619140625, "loss": 17.9081, "rewards/accuracies": 0.5, "rewards/chosen": -100.56161499023438, "rewards/margins": -13.823663711547852, "rewards/rejected": -86.73796081542969, "step": 12610 }, { "epoch": 0.73, "grad_norm": 157.20919799804688, "learning_rate": 0.0007586980920314254, "logits/chosen": -11.42012882232666, "logits/rejected": -11.231258392333984, "logps/chosen": -2379.62158203125, "logps/rejected": -2497.39892578125, "loss": 24.0487, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -102.90621185302734, "rewards/margins": -12.778979301452637, "rewards/rejected": -90.12723541259766, "step": 12620 }, { "epoch": 0.73, "grad_norm": 12.879470825195312, "learning_rate": 0.000758504586090793, "logits/chosen": -13.130175590515137, "logits/rejected": -13.268148422241211, "logps/chosen": -2334.28125, "logps/rejected": -2080.64208984375, "loss": 10.8736, "rewards/accuracies": 0.5, "rewards/chosen": -140.81057739257812, "rewards/margins": -3.8490943908691406, "rewards/rejected": -136.9615020751953, "step": 12630 }, { "epoch": 0.73, "grad_norm": 14.049040794372559, "learning_rate": 0.0007583110801501606, "logits/chosen": -13.364489555358887, "logits/rejected": -13.772987365722656, "logps/chosen": -2344.94091796875, "logps/rejected": -2003.727294921875, "loss": 32.0951, "rewards/accuracies": 0.5, "rewards/chosen": -135.47073364257812, "rewards/margins": -21.63758659362793, "rewards/rejected": -113.8331298828125, "step": 12640 }, { "epoch": 0.73, "grad_norm": 15.491958618164062, "learning_rate": 0.0007581175742095282, "logits/chosen": -12.15742301940918, "logits/rejected": -12.651741027832031, "logps/chosen": -2066.9404296875, "logps/rejected": -2014.250732421875, "loss": 18.7406, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -115.44425201416016, "rewards/margins": -7.88153076171875, "rewards/rejected": -107.56272888183594, "step": 12650 }, { "epoch": 0.73, "grad_norm": 0.01173000130802393, "learning_rate": 0.0007579240682688958, "logits/chosen": -11.671675682067871, "logits/rejected": -12.301011085510254, "logps/chosen": -2626.80615234375, "logps/rejected": -2218.000244140625, "loss": 15.6097, "rewards/accuracies": 0.5, "rewards/chosen": -93.05194091796875, "rewards/margins": -2.39201021194458, "rewards/rejected": -90.65992736816406, "step": 12660 }, { "epoch": 0.73, "grad_norm": 29.024085998535156, "learning_rate": 0.0007577305623282636, "logits/chosen": -16.580202102661133, "logits/rejected": -17.462833404541016, "logps/chosen": -2285.810791015625, "logps/rejected": -2143.72509765625, "loss": 7.0911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -116.60905456542969, "rewards/margins": 25.711523056030273, "rewards/rejected": -142.32058715820312, "step": 12670 }, { "epoch": 0.73, "grad_norm": 1.9845296264975332e-05, "learning_rate": 0.0007575370563876312, "logits/chosen": -14.203198432922363, "logits/rejected": -14.470202445983887, "logps/chosen": -2409.1923828125, "logps/rejected": -2261.38671875, "loss": 23.7751, "rewards/accuracies": 0.5, "rewards/chosen": -87.48771667480469, "rewards/margins": -6.394379615783691, "rewards/rejected": -81.09333801269531, "step": 12680 }, { "epoch": 0.73, "grad_norm": 0.00020332752319518477, "learning_rate": 0.0007573435504469988, "logits/chosen": -20.817859649658203, "logits/rejected": -21.72355079650879, "logps/chosen": -2040.477294921875, "logps/rejected": -2039.6510009765625, "loss": 15.9885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.0322723388672, "rewards/margins": 6.017505645751953, "rewards/rejected": -186.04977416992188, "step": 12690 }, { "epoch": 0.74, "grad_norm": 1.395218849182129, "learning_rate": 0.0007571500445063664, "logits/chosen": -11.045836448669434, "logits/rejected": -11.067218780517578, "logps/chosen": -2413.95263671875, "logps/rejected": -2158.871337890625, "loss": 11.3996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -72.6213607788086, "rewards/margins": -4.3682732582092285, "rewards/rejected": -68.25308227539062, "step": 12700 }, { "epoch": 0.74, "grad_norm": 7.999882291440281e-15, "learning_rate": 0.000756956538565734, "logits/chosen": -13.217506408691406, "logits/rejected": -13.291722297668457, "logps/chosen": -2509.59619140625, "logps/rejected": -1992.1083984375, "loss": 16.2172, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -53.800682067871094, "rewards/margins": 7.66890811920166, "rewards/rejected": -61.4695930480957, "step": 12710 }, { "epoch": 0.74, "grad_norm": 0.00794187281280756, "learning_rate": 0.0007567630326251015, "logits/chosen": -18.243194580078125, "logits/rejected": -18.407127380371094, "logps/chosen": -2069.661865234375, "logps/rejected": -1972.9111328125, "loss": 11.112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.63858032226562, "rewards/margins": -6.8644118309021, "rewards/rejected": -127.7741470336914, "step": 12720 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 0.0007565695266844692, "logits/chosen": -15.297770500183105, "logits/rejected": -14.856416702270508, "logps/chosen": -2382.9599609375, "logps/rejected": -2351.4140625, "loss": 9.1231, "rewards/accuracies": 0.5, "rewards/chosen": -45.09855270385742, "rewards/margins": 7.96231746673584, "rewards/rejected": -53.06086349487305, "step": 12730 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 0.0007563760207438368, "logits/chosen": -20.130050659179688, "logits/rejected": -19.94233512878418, "logps/chosen": -2275.536865234375, "logps/rejected": -2199.43310546875, "loss": 21.7569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.28140258789062, "rewards/margins": -10.584820747375488, "rewards/rejected": -155.69659423828125, "step": 12740 }, { "epoch": 0.74, "grad_norm": 1.1606139160846851e-08, "learning_rate": 0.0007561825148032044, "logits/chosen": -20.293296813964844, "logits/rejected": -21.60651969909668, "logps/chosen": -2520.76708984375, "logps/rejected": -2399.451171875, "loss": 17.5856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -149.31369018554688, "rewards/margins": -6.787903785705566, "rewards/rejected": -142.52577209472656, "step": 12750 }, { "epoch": 0.74, "grad_norm": 1587.1180419921875, "learning_rate": 0.000755989008862572, "logits/chosen": -17.474464416503906, "logits/rejected": -17.74557113647461, "logps/chosen": -2600.208984375, "logps/rejected": -2264.60400390625, "loss": 42.1352, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -165.59213256835938, "rewards/margins": -37.90080642700195, "rewards/rejected": -127.69132995605469, "step": 12760 }, { "epoch": 0.74, "grad_norm": 58.07225036621094, "learning_rate": 0.0007557955029219397, "logits/chosen": -17.384920120239258, "logits/rejected": -18.026447296142578, "logps/chosen": -2267.735107421875, "logps/rejected": -1951.564453125, "loss": 26.6977, "rewards/accuracies": 0.5, "rewards/chosen": -142.76974487304688, "rewards/margins": -12.884147644042969, "rewards/rejected": -129.88558959960938, "step": 12770 }, { "epoch": 0.74, "grad_norm": 73.84158325195312, "learning_rate": 0.0007556019969813074, "logits/chosen": -15.059948921203613, "logits/rejected": -15.172764778137207, "logps/chosen": -2773.674560546875, "logps/rejected": -2723.94140625, "loss": 19.4245, "rewards/accuracies": 0.5, "rewards/chosen": -73.98966217041016, "rewards/margins": -7.497804164886475, "rewards/rejected": -66.49185943603516, "step": 12780 }, { "epoch": 0.74, "grad_norm": 219.55845642089844, "learning_rate": 0.000755408491040675, "logits/chosen": -17.86200523376465, "logits/rejected": -17.4642276763916, "logps/chosen": -3179.81103515625, "logps/rejected": -3163.77099609375, "loss": 13.0804, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -175.90725708007812, "rewards/margins": -9.979866981506348, "rewards/rejected": -165.92739868164062, "step": 12790 }, { "epoch": 0.74, "grad_norm": 101.10002136230469, "learning_rate": 0.0007552149851000426, "logits/chosen": -16.386199951171875, "logits/rejected": -16.238182067871094, "logps/chosen": -2826.92919921875, "logps/rejected": -2778.864990234375, "loss": 12.6815, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.34562683105469, "rewards/margins": -5.512114524841309, "rewards/rejected": -111.8335189819336, "step": 12800 }, { "epoch": 0.74, "grad_norm": 135.67417907714844, "learning_rate": 0.0007550214791594102, "logits/chosen": -12.543227195739746, "logits/rejected": -12.781232833862305, "logps/chosen": -2777.13623046875, "logps/rejected": -2738.16796875, "loss": 20.2176, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -78.2780532836914, "rewards/margins": -17.983102798461914, "rewards/rejected": -60.294952392578125, "step": 12810 }, { "epoch": 0.74, "grad_norm": 47.22616958618164, "learning_rate": 0.0007548279732187778, "logits/chosen": -13.437817573547363, "logits/rejected": -13.44428539276123, "logps/chosen": -2534.467529296875, "logps/rejected": -2006.4390869140625, "loss": 7.5684, "rewards/accuracies": 0.5, "rewards/chosen": -141.45252990722656, "rewards/margins": 1.6847107410430908, "rewards/rejected": -143.13723754882812, "step": 12820 }, { "epoch": 0.74, "grad_norm": 159.2063751220703, "learning_rate": 0.0007546344672781454, "logits/chosen": -15.464506149291992, "logits/rejected": -16.37548828125, "logps/chosen": -2318.03857421875, "logps/rejected": -2029.029296875, "loss": 21.075, "rewards/accuracies": 0.5, "rewards/chosen": -85.545166015625, "rewards/margins": -15.778863906860352, "rewards/rejected": -69.76629638671875, "step": 12830 }, { "epoch": 0.74, "grad_norm": 93.4425048828125, "learning_rate": 0.0007544409613375131, "logits/chosen": -13.720888137817383, "logits/rejected": -15.494402885437012, "logps/chosen": -2579.380859375, "logps/rejected": -2124.322998046875, "loss": 25.1969, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -105.36541748046875, "rewards/margins": -15.76556396484375, "rewards/rejected": -89.59983825683594, "step": 12840 }, { "epoch": 0.74, "grad_norm": 52.93574142456055, "learning_rate": 0.0007542474553968807, "logits/chosen": -13.203496932983398, "logits/rejected": -13.649394035339355, "logps/chosen": -2143.998046875, "logps/rejected": -2026.723876953125, "loss": 24.3587, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -93.63258361816406, "rewards/margins": -22.10294532775879, "rewards/rejected": -71.52964782714844, "step": 12850 }, { "epoch": 0.74, "grad_norm": 142.97320556640625, "learning_rate": 0.0007540539494562483, "logits/chosen": -15.183178901672363, "logits/rejected": -15.898759841918945, "logps/chosen": -2436.23291015625, "logps/rejected": -2275.875, "loss": 19.0619, "rewards/accuracies": 0.5, "rewards/chosen": -147.200927734375, "rewards/margins": -7.830918788909912, "rewards/rejected": -139.37002563476562, "step": 12860 }, { "epoch": 0.74, "grad_norm": 3.083331006354241e-15, "learning_rate": 0.0007538604435156159, "logits/chosen": -15.128451347351074, "logits/rejected": -15.249685287475586, "logps/chosen": -2597.422607421875, "logps/rejected": -2365.343017578125, "loss": 11.5742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.37747192382812, "rewards/margins": 2.8963356018066406, "rewards/rejected": -160.27378845214844, "step": 12870 }, { "epoch": 0.75, "grad_norm": 130.6470184326172, "learning_rate": 0.0007536669375749836, "logits/chosen": -11.441843032836914, "logits/rejected": -11.304608345031738, "logps/chosen": -2200.915771484375, "logps/rejected": -2034.0882568359375, "loss": 29.0938, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -69.71466827392578, "rewards/margins": -9.426657676696777, "rewards/rejected": -60.28801345825195, "step": 12880 }, { "epoch": 0.75, "grad_norm": 0.4781399667263031, "learning_rate": 0.0007534734316343513, "logits/chosen": -13.125686645507812, "logits/rejected": -13.351613998413086, "logps/chosen": -1620.029052734375, "logps/rejected": -1554.658935546875, "loss": 7.9456, "rewards/accuracies": 0.5, "rewards/chosen": -138.64295959472656, "rewards/margins": -1.9538158178329468, "rewards/rejected": -136.68914794921875, "step": 12890 }, { "epoch": 0.75, "grad_norm": 7.480310273422219e-07, "learning_rate": 0.0007532799256937189, "logits/chosen": -13.241276741027832, "logits/rejected": -14.431551933288574, "logps/chosen": -2375.48095703125, "logps/rejected": -1890.5572509765625, "loss": 31.7523, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -102.26757049560547, "rewards/margins": -29.769901275634766, "rewards/rejected": -72.49766540527344, "step": 12900 }, { "epoch": 0.75, "grad_norm": 0.07715772837400436, "learning_rate": 0.0007530864197530865, "logits/chosen": -14.601785659790039, "logits/rejected": -14.80323314666748, "logps/chosen": -1866.6146240234375, "logps/rejected": -1869.811767578125, "loss": 32.8454, "rewards/accuracies": 0.5, "rewards/chosen": -101.04985809326172, "rewards/margins": -14.390141487121582, "rewards/rejected": -86.65971374511719, "step": 12910 }, { "epoch": 0.75, "grad_norm": 1.563427859396782e-10, "learning_rate": 0.0007528929138124541, "logits/chosen": -15.467236518859863, "logits/rejected": -15.511723518371582, "logps/chosen": -2443.82861328125, "logps/rejected": -2221.85693359375, "loss": 13.0101, "rewards/accuracies": 0.5, "rewards/chosen": -131.17938232421875, "rewards/margins": -3.1771819591522217, "rewards/rejected": -128.00218200683594, "step": 12920 }, { "epoch": 0.75, "grad_norm": 2.390121949247259e-07, "learning_rate": 0.0007526994078718217, "logits/chosen": -13.45220947265625, "logits/rejected": -14.588946342468262, "logps/chosen": -2633.577880859375, "logps/rejected": -2143.5595703125, "loss": 29.038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -71.78057861328125, "rewards/margins": -14.661478996276855, "rewards/rejected": -57.11909866333008, "step": 12930 }, { "epoch": 0.75, "grad_norm": 66.20551300048828, "learning_rate": 0.0007525059019311892, "logits/chosen": -16.357568740844727, "logits/rejected": -17.35902976989746, "logps/chosen": -2059.30810546875, "logps/rejected": -1680.039306640625, "loss": 10.9931, "rewards/accuracies": 0.5, "rewards/chosen": -142.23696899414062, "rewards/margins": -4.163141250610352, "rewards/rejected": -138.07382202148438, "step": 12940 }, { "epoch": 0.75, "grad_norm": 177.09255981445312, "learning_rate": 0.0007523123959905568, "logits/chosen": -16.683122634887695, "logits/rejected": -16.601844787597656, "logps/chosen": -2693.030029296875, "logps/rejected": -2588.82080078125, "loss": 12.4834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -194.42755126953125, "rewards/margins": -0.5898521542549133, "rewards/rejected": -193.83770751953125, "step": 12950 }, { "epoch": 0.75, "grad_norm": 145.1407928466797, "learning_rate": 0.0007521188900499245, "logits/chosen": -13.751409530639648, "logits/rejected": -13.716692924499512, "logps/chosen": -2475.460205078125, "logps/rejected": -2091.54833984375, "loss": 8.9047, "rewards/accuracies": 0.5, "rewards/chosen": -115.55599212646484, "rewards/margins": -3.3637242317199707, "rewards/rejected": -112.19227600097656, "step": 12960 }, { "epoch": 0.75, "grad_norm": 4.960170815748732e-11, "learning_rate": 0.0007519253841092921, "logits/chosen": -16.283239364624023, "logits/rejected": -16.121549606323242, "logps/chosen": -2216.14892578125, "logps/rejected": -2335.36669921875, "loss": 24.9011, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -110.56612396240234, "rewards/margins": -20.97517204284668, "rewards/rejected": -89.59095764160156, "step": 12970 }, { "epoch": 0.75, "grad_norm": 45.937744140625, "learning_rate": 0.0007517318781686598, "logits/chosen": -11.262475967407227, "logits/rejected": -11.404800415039062, "logps/chosen": -2663.00048828125, "logps/rejected": -2510.405517578125, "loss": 8.2326, "rewards/accuracies": 0.5, "rewards/chosen": -81.79413604736328, "rewards/margins": 2.9086570739746094, "rewards/rejected": -84.70278930664062, "step": 12980 }, { "epoch": 0.75, "grad_norm": 0.27447015047073364, "learning_rate": 0.0007515383722280274, "logits/chosen": -14.593440055847168, "logits/rejected": -14.39990234375, "logps/chosen": -2633.430908203125, "logps/rejected": -2117.374267578125, "loss": 27.0458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.8896713256836, "rewards/margins": -18.687358856201172, "rewards/rejected": -102.20231628417969, "step": 12990 }, { "epoch": 0.75, "grad_norm": 2.8280438754690884e-11, "learning_rate": 0.000751344866287395, "logits/chosen": -16.14998435974121, "logits/rejected": -16.19228744506836, "logps/chosen": -2204.897216796875, "logps/rejected": -2031.2496337890625, "loss": 12.9835, "rewards/accuracies": 0.5, "rewards/chosen": -76.66044616699219, "rewards/margins": -4.709660530090332, "rewards/rejected": -71.9507827758789, "step": 13000 }, { "epoch": 0.75, "grad_norm": 0.006967651657760143, "learning_rate": 0.0007511513603467627, "logits/chosen": -15.592529296875, "logits/rejected": -15.644437789916992, "logps/chosen": -2486.27587890625, "logps/rejected": -2470.78466796875, "loss": 0.3324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -109.9433822631836, "rewards/margins": 7.8521318435668945, "rewards/rejected": -117.79551696777344, "step": 13010 }, { "epoch": 0.75, "grad_norm": 1.6040428363339743e-06, "learning_rate": 0.0007509578544061303, "logits/chosen": -14.69037914276123, "logits/rejected": -15.663751602172852, "logps/chosen": -2585.13037109375, "logps/rejected": -2098.724853515625, "loss": 14.4013, "rewards/accuracies": 0.5, "rewards/chosen": -101.30680847167969, "rewards/margins": 8.1196870803833, "rewards/rejected": -109.42649841308594, "step": 13020 }, { "epoch": 0.75, "grad_norm": 3.9068801403045654, "learning_rate": 0.0007507643484654979, "logits/chosen": -16.162235260009766, "logits/rejected": -16.848140716552734, "logps/chosen": -2535.359375, "logps/rejected": -2440.5986328125, "loss": 11.7274, "rewards/accuracies": 0.5, "rewards/chosen": -149.85679626464844, "rewards/margins": -3.339693546295166, "rewards/rejected": -146.51710510253906, "step": 13030 }, { "epoch": 0.75, "grad_norm": 5381.65087890625, "learning_rate": 0.0007505708425248655, "logits/chosen": -12.435870170593262, "logits/rejected": -12.08319091796875, "logps/chosen": -3127.154541015625, "logps/rejected": -2627.425048828125, "loss": 28.2101, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -134.80755615234375, "rewards/margins": -14.10200023651123, "rewards/rejected": -120.70555114746094, "step": 13040 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 0.0007503773365842331, "logits/chosen": -12.547378540039062, "logits/rejected": -12.810018539428711, "logps/chosen": -2279.95654296875, "logps/rejected": -2261.832763671875, "loss": 21.5016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.26876831054688, "rewards/margins": 3.1564319133758545, "rewards/rejected": -142.4252166748047, "step": 13050 }, { "epoch": 0.76, "grad_norm": 149.44854736328125, "learning_rate": 0.0007501838306436007, "logits/chosen": -14.823695182800293, "logits/rejected": -15.318603515625, "logps/chosen": -2356.99169921875, "logps/rejected": -1827.295654296875, "loss": 17.8604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.5390625, "rewards/margins": -7.652426242828369, "rewards/rejected": -142.88665771484375, "step": 13060 }, { "epoch": 0.76, "grad_norm": 74.0484390258789, "learning_rate": 0.0007499903247029684, "logits/chosen": -15.896963119506836, "logits/rejected": -15.996496200561523, "logps/chosen": -2456.791748046875, "logps/rejected": -2199.82470703125, "loss": 10.7937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.59439086914062, "rewards/margins": -1.6594985723495483, "rewards/rejected": -175.9348907470703, "step": 13070 }, { "epoch": 0.76, "grad_norm": 7.150391390098321e-10, "learning_rate": 0.0007497968187623361, "logits/chosen": -14.276268005371094, "logits/rejected": -14.517908096313477, "logps/chosen": -2601.67138671875, "logps/rejected": -2203.904541015625, "loss": 24.6515, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.99564361572266, "rewards/margins": -21.35382080078125, "rewards/rejected": -95.64180755615234, "step": 13080 }, { "epoch": 0.76, "grad_norm": 53.196311950683594, "learning_rate": 0.0007496033128217037, "logits/chosen": -16.662988662719727, "logits/rejected": -17.446544647216797, "logps/chosen": -2190.608642578125, "logps/rejected": -1965.06640625, "loss": 19.6166, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -153.01205444335938, "rewards/margins": -13.033058166503906, "rewards/rejected": -139.97897338867188, "step": 13090 }, { "epoch": 0.76, "grad_norm": 26.64346694946289, "learning_rate": 0.0007494098068810713, "logits/chosen": -14.071972846984863, "logits/rejected": -14.161786079406738, "logps/chosen": -2503.68115234375, "logps/rejected": -2294.969482421875, "loss": 11.4129, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -115.00799560546875, "rewards/margins": -1.5205471515655518, "rewards/rejected": -113.4874496459961, "step": 13100 }, { "epoch": 0.76, "grad_norm": 138.64515686035156, "learning_rate": 0.0007492163009404389, "logits/chosen": -13.659439086914062, "logits/rejected": -13.765602111816406, "logps/chosen": -2958.461669921875, "logps/rejected": -2399.74658203125, "loss": 16.0566, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -132.9717559814453, "rewards/margins": -9.488519668579102, "rewards/rejected": -123.48323059082031, "step": 13110 }, { "epoch": 0.76, "grad_norm": 88.6471939086914, "learning_rate": 0.0007490227949998066, "logits/chosen": -16.390291213989258, "logits/rejected": -15.772428512573242, "logps/chosen": -2750.121337890625, "logps/rejected": -2513.24658203125, "loss": 16.9519, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -114.8929672241211, "rewards/margins": -1.0197200775146484, "rewards/rejected": -113.87324523925781, "step": 13120 }, { "epoch": 0.76, "grad_norm": 64.20426177978516, "learning_rate": 0.0007488292890591742, "logits/chosen": -22.09845542907715, "logits/rejected": -23.12550926208496, "logps/chosen": -2557.752197265625, "logps/rejected": -2109.887939453125, "loss": 24.3783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.695068359375, "rewards/margins": -7.2654218673706055, "rewards/rejected": -138.42962646484375, "step": 13130 }, { "epoch": 0.76, "grad_norm": 168.68959045410156, "learning_rate": 0.0007486357831185418, "logits/chosen": -22.823204040527344, "logits/rejected": -26.401077270507812, "logps/chosen": -2989.72802734375, "logps/rejected": -2568.240966796875, "loss": 22.2226, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -228.1304931640625, "rewards/margins": -10.8886079788208, "rewards/rejected": -217.2418975830078, "step": 13140 }, { "epoch": 0.76, "grad_norm": 168.4042205810547, "learning_rate": 0.0007484422771779094, "logits/chosen": -16.447391510009766, "logits/rejected": -16.920856475830078, "logps/chosen": -3142.006103515625, "logps/rejected": -2670.09912109375, "loss": 6.1479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.38465881347656, "rewards/margins": 5.847211837768555, "rewards/rejected": -108.23185729980469, "step": 13150 }, { "epoch": 0.76, "grad_norm": 1.1116104360553436e-05, "learning_rate": 0.0007482487712372769, "logits/chosen": -18.94549560546875, "logits/rejected": -19.033287048339844, "logps/chosen": -2939.10107421875, "logps/rejected": -2929.65185546875, "loss": 1.7749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -153.6129150390625, "rewards/margins": 15.18395709991455, "rewards/rejected": -168.796875, "step": 13160 }, { "epoch": 0.76, "grad_norm": 98.2681655883789, "learning_rate": 0.0007480552652966445, "logits/chosen": -21.738405227661133, "logits/rejected": -23.06644058227539, "logps/chosen": -2496.31689453125, "logps/rejected": -2279.960205078125, "loss": 30.0748, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -186.73049926757812, "rewards/margins": -21.603168487548828, "rewards/rejected": -165.12733459472656, "step": 13170 }, { "epoch": 0.76, "grad_norm": 2.720365228014998e-05, "learning_rate": 0.0007478617593560122, "logits/chosen": -15.363798141479492, "logits/rejected": -15.895013809204102, "logps/chosen": -2577.082763671875, "logps/rejected": -2606.36474609375, "loss": 0.558, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -145.7509002685547, "rewards/margins": 11.199469566345215, "rewards/rejected": -156.95037841796875, "step": 13180 }, { "epoch": 0.76, "grad_norm": 98.63524627685547, "learning_rate": 0.0007476682534153799, "logits/chosen": -14.054110527038574, "logits/rejected": -14.25780963897705, "logps/chosen": -2404.31884765625, "logps/rejected": -2413.072998046875, "loss": 0.1636, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -113.33319091796875, "rewards/margins": 20.722375869750977, "rewards/rejected": -134.05557250976562, "step": 13190 }, { "epoch": 0.76, "grad_norm": 1.4328928727991297e-06, "learning_rate": 0.0007474747474747475, "logits/chosen": -15.309816360473633, "logits/rejected": -15.298388481140137, "logps/chosen": -2672.311279296875, "logps/rejected": -2537.27880859375, "loss": 24.676, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -195.51040649414062, "rewards/margins": -16.305782318115234, "rewards/rejected": -179.20462036132812, "step": 13200 }, { "epoch": 0.76, "grad_norm": 4.325306432462206e-11, "learning_rate": 0.0007472812415341151, "logits/chosen": -13.362933158874512, "logits/rejected": -13.201980590820312, "logps/chosen": -2421.923828125, "logps/rejected": -2254.216552734375, "loss": 8.0366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -95.73443603515625, "rewards/margins": 6.692366123199463, "rewards/rejected": -102.42680358886719, "step": 13210 }, { "epoch": 0.77, "grad_norm": 0.09776049852371216, "learning_rate": 0.0007470877355934827, "logits/chosen": -11.555631637573242, "logits/rejected": -11.608724594116211, "logps/chosen": -3126.33544921875, "logps/rejected": -3032.272216796875, "loss": 6.4567, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -116.21968078613281, "rewards/margins": -4.039395332336426, "rewards/rejected": -112.1802749633789, "step": 13220 }, { "epoch": 0.77, "grad_norm": 0.3871757984161377, "learning_rate": 0.0007468942296528503, "logits/chosen": -12.561029434204102, "logits/rejected": -12.734663963317871, "logps/chosen": -2935.061767578125, "logps/rejected": -2894.690185546875, "loss": 2.8856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -123.72611999511719, "rewards/margins": 7.617791175842285, "rewards/rejected": -131.3439178466797, "step": 13230 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 0.000746700723712218, "logits/chosen": -14.849813461303711, "logits/rejected": -15.003198623657227, "logps/chosen": -2263.396240234375, "logps/rejected": -2030.947021484375, "loss": 35.3316, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -157.63775634765625, "rewards/margins": -19.361186981201172, "rewards/rejected": -138.27658081054688, "step": 13240 }, { "epoch": 0.77, "grad_norm": 0.6489533185958862, "learning_rate": 0.0007465072177715856, "logits/chosen": -12.200586318969727, "logits/rejected": -12.246854782104492, "logps/chosen": -2475.827880859375, "logps/rejected": -2512.213623046875, "loss": 2.8613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -85.19029235839844, "rewards/margins": 0.049703408032655716, "rewards/rejected": -85.239990234375, "step": 13250 }, { "epoch": 0.77, "grad_norm": 211.60067749023438, "learning_rate": 0.0007463137118309532, "logits/chosen": -15.574650764465332, "logits/rejected": -15.966944694519043, "logps/chosen": -2280.05029296875, "logps/rejected": -2308.358154296875, "loss": 19.0516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.0637664794922, "rewards/margins": 12.708332061767578, "rewards/rejected": -163.77210998535156, "step": 13260 }, { "epoch": 0.77, "grad_norm": 32.07349395751953, "learning_rate": 0.0007461202058903208, "logits/chosen": -16.42223358154297, "logits/rejected": -16.734607696533203, "logps/chosen": -2568.229736328125, "logps/rejected": -2514.8203125, "loss": 9.3623, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -118.49520111083984, "rewards/margins": -0.26536351442337036, "rewards/rejected": -118.22984313964844, "step": 13270 }, { "epoch": 0.77, "grad_norm": 0.01863556168973446, "learning_rate": 0.0007459266999496884, "logits/chosen": -11.594682693481445, "logits/rejected": -11.871915817260742, "logps/chosen": -2894.59423828125, "logps/rejected": -2416.87939453125, "loss": 0.8991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -91.31367492675781, "rewards/margins": 28.44805908203125, "rewards/rejected": -119.76175689697266, "step": 13280 }, { "epoch": 0.77, "grad_norm": 206.1193084716797, "learning_rate": 0.0007457331940090562, "logits/chosen": -12.215517044067383, "logits/rejected": -12.509847640991211, "logps/chosen": -2740.83056640625, "logps/rejected": -2179.628662109375, "loss": 5.8003, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -142.83212280273438, "rewards/margins": -1.4707496166229248, "rewards/rejected": -141.3613739013672, "step": 13290 }, { "epoch": 0.77, "grad_norm": 180.0523223876953, "learning_rate": 0.0007455396880684238, "logits/chosen": -14.244863510131836, "logits/rejected": -14.705388069152832, "logps/chosen": -2356.32421875, "logps/rejected": -1988.807861328125, "loss": 45.2989, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.0915298461914, "rewards/margins": -39.75959014892578, "rewards/rejected": -83.3319320678711, "step": 13300 }, { "epoch": 0.77, "grad_norm": 354.14984130859375, "learning_rate": 0.0007453461821277914, "logits/chosen": -16.5649356842041, "logits/rejected": -16.97028160095215, "logps/chosen": -2622.620361328125, "logps/rejected": -2505.39990234375, "loss": 6.5924, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -187.99093627929688, "rewards/margins": -4.629831790924072, "rewards/rejected": -183.3611297607422, "step": 13310 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 0.000745152676187159, "logits/chosen": -13.897104263305664, "logits/rejected": -14.099874496459961, "logps/chosen": -2160.896728515625, "logps/rejected": -1886.7464599609375, "loss": 18.2436, "rewards/accuracies": 0.5, "rewards/chosen": -137.44134521484375, "rewards/margins": -4.197816371917725, "rewards/rejected": -133.2435302734375, "step": 13320 }, { "epoch": 0.77, "grad_norm": 3.8526663780212402, "learning_rate": 0.0007449591702465266, "logits/chosen": -15.3932523727417, "logits/rejected": -15.444781303405762, "logps/chosen": -2636.90625, "logps/rejected": -2418.78076171875, "loss": 16.8597, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.62535095214844, "rewards/margins": -10.506978988647461, "rewards/rejected": -114.1183853149414, "step": 13330 }, { "epoch": 0.77, "grad_norm": 1.0711499452590942, "learning_rate": 0.0007447656643058942, "logits/chosen": -17.89801597595215, "logits/rejected": -17.58959197998047, "logps/chosen": -2099.589599609375, "logps/rejected": -1875.502685546875, "loss": 31.1541, "rewards/accuracies": 0.5, "rewards/chosen": -177.6470489501953, "rewards/margins": -20.1546688079834, "rewards/rejected": -157.4923858642578, "step": 13340 }, { "epoch": 0.77, "grad_norm": 6.7720624663536455e-09, "learning_rate": 0.0007445721583652619, "logits/chosen": -16.836956024169922, "logits/rejected": -16.597949981689453, "logps/chosen": -2736.58984375, "logps/rejected": -2709.47802734375, "loss": 7.4564, "rewards/accuracies": 0.5, "rewards/chosen": -195.37648010253906, "rewards/margins": 4.004467964172363, "rewards/rejected": -199.3809356689453, "step": 13350 }, { "epoch": 0.77, "grad_norm": 127.82465362548828, "learning_rate": 0.0007443786524246295, "logits/chosen": -16.754501342773438, "logits/rejected": -17.207233428955078, "logps/chosen": -2617.16259765625, "logps/rejected": -2383.146728515625, "loss": 27.8011, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -192.19012451171875, "rewards/margins": -17.0822696685791, "rewards/rejected": -175.10784912109375, "step": 13360 }, { "epoch": 0.77, "grad_norm": 0.00041771712130866945, "learning_rate": 0.0007441851464839971, "logits/chosen": -13.504168510437012, "logits/rejected": -13.548097610473633, "logps/chosen": -2780.8994140625, "logps/rejected": -2677.789306640625, "loss": 3.5933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -101.21357727050781, "rewards/margins": 20.94892692565918, "rewards/rejected": -122.16251373291016, "step": 13370 }, { "epoch": 0.77, "grad_norm": 0.35616180300712585, "learning_rate": 0.0007439916405433646, "logits/chosen": -15.51806354522705, "logits/rejected": -15.965621948242188, "logps/chosen": -2943.94287109375, "logps/rejected": -2995.475341796875, "loss": 5.7111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.2971954345703, "rewards/margins": 7.237767696380615, "rewards/rejected": -224.5349578857422, "step": 13380 }, { "epoch": 0.78, "grad_norm": 135.50352478027344, "learning_rate": 0.0007437981346027322, "logits/chosen": -10.490199089050293, "logits/rejected": -10.473041534423828, "logps/chosen": -3102.8564453125, "logps/rejected": -2504.37841796875, "loss": 11.2811, "rewards/accuracies": 0.5, "rewards/chosen": -67.50340270996094, "rewards/margins": 2.2697978019714355, "rewards/rejected": -69.77320861816406, "step": 13390 }, { "epoch": 0.78, "grad_norm": 0.0003947204095311463, "learning_rate": 0.0007436046286621, "logits/chosen": -19.838666915893555, "logits/rejected": -20.144289016723633, "logps/chosen": -2986.99169921875, "logps/rejected": -3135.95166015625, "loss": 5.6647, "rewards/accuracies": 0.5, "rewards/chosen": -234.6238555908203, "rewards/margins": 12.663851737976074, "rewards/rejected": -247.2877197265625, "step": 13400 }, { "epoch": 0.78, "grad_norm": 136.23416137695312, "learning_rate": 0.0007434111227214676, "logits/chosen": -18.68963050842285, "logits/rejected": -19.529743194580078, "logps/chosen": -2903.81494140625, "logps/rejected": -2711.590087890625, "loss": 20.1539, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -214.69473266601562, "rewards/margins": -13.446307182312012, "rewards/rejected": -201.24844360351562, "step": 13410 }, { "epoch": 0.78, "grad_norm": 1.7851640450317063e-06, "learning_rate": 0.0007432176167808352, "logits/chosen": -15.727602005004883, "logits/rejected": -15.60485553741455, "logps/chosen": -2640.603515625, "logps/rejected": -2042.975341796875, "loss": 24.5005, "rewards/accuracies": 0.5, "rewards/chosen": -95.26438903808594, "rewards/margins": -9.449660301208496, "rewards/rejected": -85.81471252441406, "step": 13420 }, { "epoch": 0.78, "grad_norm": 2.6126739612664096e-05, "learning_rate": 0.0007430241108402028, "logits/chosen": -10.866250991821289, "logits/rejected": -10.766256332397461, "logps/chosen": -2856.216064453125, "logps/rejected": -2636.31640625, "loss": 14.891, "rewards/accuracies": 0.5, "rewards/chosen": -54.63829803466797, "rewards/margins": 2.787825107574463, "rewards/rejected": -57.426116943359375, "step": 13430 }, { "epoch": 0.78, "grad_norm": 74.69610595703125, "learning_rate": 0.0007428306048995704, "logits/chosen": -12.18458080291748, "logits/rejected": -12.1874418258667, "logps/chosen": -2381.387939453125, "logps/rejected": -2312.24755859375, "loss": 2.4636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -136.12542724609375, "rewards/margins": 6.9539031982421875, "rewards/rejected": -143.07933044433594, "step": 13440 }, { "epoch": 0.78, "grad_norm": 62.79129409790039, "learning_rate": 0.000742637098958938, "logits/chosen": -12.425580978393555, "logits/rejected": -12.542167663574219, "logps/chosen": -2598.77001953125, "logps/rejected": -2515.230712890625, "loss": 13.0934, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -127.267333984375, "rewards/margins": -11.582030296325684, "rewards/rejected": -115.68531799316406, "step": 13450 }, { "epoch": 0.78, "grad_norm": 8.59094129168625e-09, "learning_rate": 0.0007424435930183057, "logits/chosen": -12.938955307006836, "logits/rejected": -13.010492324829102, "logps/chosen": -2150.26904296875, "logps/rejected": -2138.85791015625, "loss": 8.9215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.90928649902344, "rewards/margins": -1.519842505455017, "rewards/rejected": -114.38946533203125, "step": 13460 }, { "epoch": 0.78, "grad_norm": 7.404634629892826e-07, "learning_rate": 0.0007422500870776733, "logits/chosen": -12.298185348510742, "logits/rejected": -12.962839126586914, "logps/chosen": -2684.0693359375, "logps/rejected": -2646.97900390625, "loss": 8.4174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -227.48239135742188, "rewards/margins": 6.601804256439209, "rewards/rejected": -234.0841827392578, "step": 13470 }, { "epoch": 0.78, "grad_norm": 0.0015713757602497935, "learning_rate": 0.0007420565811370409, "logits/chosen": -12.62714958190918, "logits/rejected": -12.603391647338867, "logps/chosen": -2540.83154296875, "logps/rejected": -2382.71044921875, "loss": 10.8864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.89384460449219, "rewards/margins": 3.115798234939575, "rewards/rejected": -125.0096435546875, "step": 13480 }, { "epoch": 0.78, "grad_norm": 0.0008557455148547888, "learning_rate": 0.0007418630751964085, "logits/chosen": -11.968778610229492, "logits/rejected": -12.078160285949707, "logps/chosen": -2910.98583984375, "logps/rejected": -2213.212158203125, "loss": 27.418, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -97.7416000366211, "rewards/margins": -13.257512092590332, "rewards/rejected": -84.48408508300781, "step": 13490 }, { "epoch": 0.78, "grad_norm": 375.4279479980469, "learning_rate": 0.0007416695692557762, "logits/chosen": -14.306714057922363, "logits/rejected": -14.155041694641113, "logps/chosen": -2529.87158203125, "logps/rejected": -2420.703369140625, "loss": 21.3482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -104.07283020019531, "rewards/margins": 0.40149229764938354, "rewards/rejected": -104.47431945800781, "step": 13500 }, { "epoch": 0.78, "grad_norm": 93.15128326416016, "learning_rate": 0.0007414760633151438, "logits/chosen": -15.271029472351074, "logits/rejected": -15.485336303710938, "logps/chosen": -2789.391357421875, "logps/rejected": -2784.594482421875, "loss": 5.4749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.00588989257812, "rewards/margins": 26.618804931640625, "rewards/rejected": -188.6247100830078, "step": 13510 }, { "epoch": 0.78, "grad_norm": 40.69184494018555, "learning_rate": 0.0007412825573745115, "logits/chosen": -17.40923309326172, "logits/rejected": -17.740413665771484, "logps/chosen": -2662.43017578125, "logps/rejected": -2674.976318359375, "loss": 7.973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.01451110839844, "rewards/margins": 1.4057095050811768, "rewards/rejected": -159.4202423095703, "step": 13520 }, { "epoch": 0.78, "grad_norm": 256.7845764160156, "learning_rate": 0.0007410890514338791, "logits/chosen": -15.923420906066895, "logits/rejected": -16.011310577392578, "logps/chosen": -2443.23486328125, "logps/rejected": -2609.825439453125, "loss": 14.5625, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -138.83987426757812, "rewards/margins": -11.13848876953125, "rewards/rejected": -127.7013931274414, "step": 13530 }, { "epoch": 0.78, "grad_norm": 4.138189790836577e-09, "learning_rate": 0.0007408955454932467, "logits/chosen": -13.860247611999512, "logits/rejected": -13.669334411621094, "logps/chosen": -2631.82080078125, "logps/rejected": -2690.235107421875, "loss": 13.6661, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -135.59471130371094, "rewards/margins": -3.0901267528533936, "rewards/rejected": -132.5045928955078, "step": 13540 }, { "epoch": 0.78, "grad_norm": 0.12369804829359055, "learning_rate": 0.0007407020395526143, "logits/chosen": -14.725641250610352, "logits/rejected": -14.694994926452637, "logps/chosen": -2704.174560546875, "logps/rejected": -2235.9716796875, "loss": 15.2828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.45693969726562, "rewards/margins": -2.471106767654419, "rewards/rejected": -162.98583984375, "step": 13550 }, { "epoch": 0.78, "grad_norm": 56.55247116088867, "learning_rate": 0.0007405085336119819, "logits/chosen": -14.498939514160156, "logits/rejected": -14.169031143188477, "logps/chosen": -2888.513427734375, "logps/rejected": -2775.408203125, "loss": 13.5849, "rewards/accuracies": 0.5, "rewards/chosen": -187.39576721191406, "rewards/margins": -10.475419998168945, "rewards/rejected": -176.92034912109375, "step": 13560 }, { "epoch": 0.79, "grad_norm": 616.5018310546875, "learning_rate": 0.0007403150276713495, "logits/chosen": -12.681905746459961, "logits/rejected": -12.601898193359375, "logps/chosen": -2460.42919921875, "logps/rejected": -2007.572021484375, "loss": 21.6346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.8001251220703, "rewards/margins": -9.681523323059082, "rewards/rejected": -130.1186065673828, "step": 13570 }, { "epoch": 0.79, "grad_norm": 0.00022913707653060555, "learning_rate": 0.0007401215217307172, "logits/chosen": -12.740941047668457, "logits/rejected": -12.726537704467773, "logps/chosen": -2519.124267578125, "logps/rejected": -2414.11181640625, "loss": 15.1226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.4906768798828, "rewards/margins": -3.3269405364990234, "rewards/rejected": -139.16372680664062, "step": 13580 }, { "epoch": 0.79, "grad_norm": 68.10858917236328, "learning_rate": 0.0007399280157900848, "logits/chosen": -12.348114013671875, "logits/rejected": -12.904050827026367, "logps/chosen": -2371.964111328125, "logps/rejected": -1903.091064453125, "loss": 30.0559, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.50062561035156, "rewards/margins": -17.579029083251953, "rewards/rejected": -70.92159271240234, "step": 13590 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 0.0007397345098494523, "logits/chosen": -13.444480895996094, "logits/rejected": -13.347081184387207, "logps/chosen": -2623.486083984375, "logps/rejected": -2399.78662109375, "loss": 4.106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -124.2256851196289, "rewards/margins": 14.359341621398926, "rewards/rejected": -138.5850067138672, "step": 13600 }, { "epoch": 0.79, "grad_norm": 0.5817909836769104, "learning_rate": 0.00073954100390882, "logits/chosen": -13.405369758605957, "logits/rejected": -13.412191390991211, "logps/chosen": -2810.18310546875, "logps/rejected": -2330.31103515625, "loss": 25.6311, "rewards/accuracies": 0.5, "rewards/chosen": -116.30482482910156, "rewards/margins": -14.916668891906738, "rewards/rejected": -101.38814544677734, "step": 13610 }, { "epoch": 0.79, "grad_norm": 5.723596132156672e-06, "learning_rate": 0.0007393474979681876, "logits/chosen": -13.503122329711914, "logits/rejected": -13.732782363891602, "logps/chosen": -2360.171142578125, "logps/rejected": -2443.993408203125, "loss": 2.1156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.96292114257812, "rewards/margins": 25.984539031982422, "rewards/rejected": -183.94744873046875, "step": 13620 }, { "epoch": 0.79, "grad_norm": 96.93977355957031, "learning_rate": 0.0007391539920275553, "logits/chosen": -13.845812797546387, "logits/rejected": -13.769503593444824, "logps/chosen": -2330.332763671875, "logps/rejected": -2180.573486328125, "loss": 13.3016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.96511840820312, "rewards/margins": 6.803221225738525, "rewards/rejected": -172.768310546875, "step": 13630 }, { "epoch": 0.79, "grad_norm": 4.156162738800049, "learning_rate": 0.0007389604860869229, "logits/chosen": -10.719039916992188, "logits/rejected": -10.602294921875, "logps/chosen": -2211.971923828125, "logps/rejected": -2108.604736328125, "loss": 6.5868, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -39.629905700683594, "rewards/margins": 4.833121299743652, "rewards/rejected": -44.463035583496094, "step": 13640 }, { "epoch": 0.79, "grad_norm": 0.05082736909389496, "learning_rate": 0.0007387669801462905, "logits/chosen": -13.257281303405762, "logits/rejected": -13.804548263549805, "logps/chosen": -2863.40625, "logps/rejected": -2345.827880859375, "loss": 37.5745, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -143.59027099609375, "rewards/margins": -34.11186981201172, "rewards/rejected": -109.4783935546875, "step": 13650 }, { "epoch": 0.79, "grad_norm": 5.908571965029807e-10, "learning_rate": 0.0007385734742056581, "logits/chosen": -14.487485885620117, "logits/rejected": -14.44523811340332, "logps/chosen": -2987.123779296875, "logps/rejected": -2768.47900390625, "loss": 26.1196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.3492889404297, "rewards/margins": -13.024075508117676, "rewards/rejected": -133.32521057128906, "step": 13660 }, { "epoch": 0.79, "grad_norm": 53.37586975097656, "learning_rate": 0.0007383799682650257, "logits/chosen": -13.660901069641113, "logits/rejected": -13.612340927124023, "logps/chosen": -2639.63037109375, "logps/rejected": -1948.2265625, "loss": 32.9517, "rewards/accuracies": 0.5, "rewards/chosen": -134.7147674560547, "rewards/margins": -22.620281219482422, "rewards/rejected": -112.09449768066406, "step": 13670 }, { "epoch": 0.79, "grad_norm": 14.292190551757812, "learning_rate": 0.0007381864623243933, "logits/chosen": -13.899431228637695, "logits/rejected": -14.04003620147705, "logps/chosen": -2190.080810546875, "logps/rejected": -1938.146484375, "loss": 25.1852, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -186.31468200683594, "rewards/margins": -19.012283325195312, "rewards/rejected": -167.30239868164062, "step": 13680 }, { "epoch": 0.79, "grad_norm": 9.849914931692183e-06, "learning_rate": 0.000737992956383761, "logits/chosen": -12.417793273925781, "logits/rejected": -12.427900314331055, "logps/chosen": -2246.047607421875, "logps/rejected": -2229.243408203125, "loss": 10.5728, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.1274642944336, "rewards/margins": 0.8026489019393921, "rewards/rejected": -124.93009948730469, "step": 13690 }, { "epoch": 0.79, "grad_norm": 1.0284242901725804e-10, "learning_rate": 0.0007377994504431286, "logits/chosen": -11.593022346496582, "logits/rejected": -11.69703197479248, "logps/chosen": -2274.010986328125, "logps/rejected": -2264.925537109375, "loss": 1.6659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -87.03809356689453, "rewards/margins": 12.539216995239258, "rewards/rejected": -99.57733154296875, "step": 13700 }, { "epoch": 0.79, "grad_norm": 2.7330349359675616e-18, "learning_rate": 0.0007376059445024963, "logits/chosen": -11.009706497192383, "logits/rejected": -10.805964469909668, "logps/chosen": -2029.1790771484375, "logps/rejected": -1988.49609375, "loss": 14.4632, "rewards/accuracies": 0.5, "rewards/chosen": -130.4252166748047, "rewards/margins": -1.606988549232483, "rewards/rejected": -128.8182373046875, "step": 13710 }, { "epoch": 0.79, "grad_norm": 1.8719593305744752e-15, "learning_rate": 0.0007374124385618639, "logits/chosen": -12.138155937194824, "logits/rejected": -12.448143005371094, "logps/chosen": -2281.106201171875, "logps/rejected": -1982.811767578125, "loss": 18.2523, "rewards/accuracies": 0.5, "rewards/chosen": -126.38047790527344, "rewards/margins": 3.988980770111084, "rewards/rejected": -130.3694610595703, "step": 13720 }, { "epoch": 0.79, "grad_norm": 3.613437788426517e-16, "learning_rate": 0.0007372189326212315, "logits/chosen": -13.274637222290039, "logits/rejected": -13.002047538757324, "logps/chosen": -2485.067626953125, "logps/rejected": -2315.9677734375, "loss": 18.2892, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -171.2538299560547, "rewards/margins": -13.9192476272583, "rewards/rejected": -157.3345947265625, "step": 13730 }, { "epoch": 0.8, "grad_norm": 22.759458541870117, "learning_rate": 0.0007370254266805991, "logits/chosen": -11.817298889160156, "logits/rejected": -12.021116256713867, "logps/chosen": -2038.3804931640625, "logps/rejected": -1900.563720703125, "loss": 15.7777, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -79.78933715820312, "rewards/margins": -4.415651798248291, "rewards/rejected": -75.3736801147461, "step": 13740 }, { "epoch": 0.8, "grad_norm": 5.705956937163137e-06, "learning_rate": 0.0007368319207399668, "logits/chosen": -10.475610733032227, "logits/rejected": -10.435018539428711, "logps/chosen": -2330.317626953125, "logps/rejected": -2356.2685546875, "loss": 2.5232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.60945129394531, "rewards/margins": 7.658873558044434, "rewards/rejected": -117.26832580566406, "step": 13750 }, { "epoch": 0.8, "grad_norm": 42.60792541503906, "learning_rate": 0.0007366384147993344, "logits/chosen": -9.243341445922852, "logits/rejected": -9.634995460510254, "logps/chosen": -2229.19873046875, "logps/rejected": -2061.36376953125, "loss": 16.6947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.1336212158203, "rewards/margins": -3.142920732498169, "rewards/rejected": -143.99070739746094, "step": 13760 }, { "epoch": 0.8, "grad_norm": 42.482643127441406, "learning_rate": 0.000736444908858702, "logits/chosen": -11.887979507446289, "logits/rejected": -11.923299789428711, "logps/chosen": -2149.4033203125, "logps/rejected": -1678.0787353515625, "loss": 24.8629, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -142.44491577148438, "rewards/margins": -18.183618545532227, "rewards/rejected": -124.26127624511719, "step": 13770 }, { "epoch": 0.8, "grad_norm": 8.238048394559883e-06, "learning_rate": 0.0007362514029180696, "logits/chosen": -16.658615112304688, "logits/rejected": -18.93308448791504, "logps/chosen": -2364.213623046875, "logps/rejected": -2422.2119140625, "loss": 3.2006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -165.0977020263672, "rewards/margins": 10.695172309875488, "rewards/rejected": -175.7928924560547, "step": 13780 }, { "epoch": 0.8, "grad_norm": 64.18568420410156, "learning_rate": 0.0007360578969774372, "logits/chosen": -14.682365417480469, "logits/rejected": -16.23111343383789, "logps/chosen": -2463.28662109375, "logps/rejected": -2341.806640625, "loss": 6.9902, "rewards/accuracies": 0.5, "rewards/chosen": -108.07047271728516, "rewards/margins": 3.018502712249756, "rewards/rejected": -111.08897399902344, "step": 13790 }, { "epoch": 0.8, "grad_norm": 62.52415084838867, "learning_rate": 0.0007358643910368049, "logits/chosen": -13.585931777954102, "logits/rejected": -14.405691146850586, "logps/chosen": -2460.927490234375, "logps/rejected": -1829.9156494140625, "loss": 17.9827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -65.69023132324219, "rewards/margins": 10.18982982635498, "rewards/rejected": -75.88005065917969, "step": 13800 }, { "epoch": 0.8, "grad_norm": 1.062989540140391e-11, "learning_rate": 0.0007356708850961725, "logits/chosen": -16.803953170776367, "logits/rejected": -17.86305046081543, "logps/chosen": -2614.400390625, "logps/rejected": -2657.656494140625, "loss": 18.5549, "rewards/accuracies": 0.5, "rewards/chosen": -212.63754272460938, "rewards/margins": -10.685598373413086, "rewards/rejected": -201.9519500732422, "step": 13810 }, { "epoch": 0.8, "grad_norm": 2.0249246517778374e-05, "learning_rate": 0.0007354773791555401, "logits/chosen": -15.508626937866211, "logits/rejected": -14.682098388671875, "logps/chosen": -2969.80322265625, "logps/rejected": -2314.248779296875, "loss": 4.7724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.6705093383789, "rewards/margins": 11.142252922058105, "rewards/rejected": -127.8127670288086, "step": 13820 }, { "epoch": 0.8, "grad_norm": 35.535701751708984, "learning_rate": 0.0007352838732149077, "logits/chosen": -17.43287467956543, "logits/rejected": -19.646326065063477, "logps/chosen": -2356.33154296875, "logps/rejected": -1984.7144775390625, "loss": 8.4664, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -135.927490234375, "rewards/margins": -1.8224475383758545, "rewards/rejected": -134.1050262451172, "step": 13830 }, { "epoch": 0.8, "grad_norm": 18.772035598754883, "learning_rate": 0.0007350903672742753, "logits/chosen": -12.050615310668945, "logits/rejected": -12.154088020324707, "logps/chosen": -2663.490234375, "logps/rejected": -2543.314697265625, "loss": 5.6001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.8542022705078, "rewards/margins": 0.8200990557670593, "rewards/rejected": -141.67431640625, "step": 13840 }, { "epoch": 0.8, "grad_norm": 0.00018820299010258168, "learning_rate": 0.0007348968613336429, "logits/chosen": -12.256302833557129, "logits/rejected": -12.428914070129395, "logps/chosen": -2229.835205078125, "logps/rejected": -2276.46484375, "loss": 8.7095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.20693969726562, "rewards/margins": 2.7089781761169434, "rewards/rejected": -140.91592407226562, "step": 13850 }, { "epoch": 0.8, "grad_norm": 1.007443916023476e-05, "learning_rate": 0.0007347033553930106, "logits/chosen": -11.950929641723633, "logits/rejected": -12.188192367553711, "logps/chosen": -2277.98388671875, "logps/rejected": -1922.9049072265625, "loss": 11.0478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -72.11392211914062, "rewards/margins": 15.591008186340332, "rewards/rejected": -87.7049331665039, "step": 13860 }, { "epoch": 0.8, "grad_norm": 0.002058836631476879, "learning_rate": 0.0007345098494523782, "logits/chosen": -10.532806396484375, "logits/rejected": -10.410320281982422, "logps/chosen": -2568.733154296875, "logps/rejected": -2398.21826171875, "loss": 10.4283, "rewards/accuracies": 0.5, "rewards/chosen": -137.537109375, "rewards/margins": -4.647356986999512, "rewards/rejected": -132.88973999023438, "step": 13870 }, { "epoch": 0.8, "grad_norm": 0.004954439587891102, "learning_rate": 0.0007343163435117458, "logits/chosen": -9.560684204101562, "logits/rejected": -9.399250984191895, "logps/chosen": -2447.00732421875, "logps/rejected": -2089.998046875, "loss": 4.8433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -56.42401885986328, "rewards/margins": 6.630577087402344, "rewards/rejected": -63.054595947265625, "step": 13880 }, { "epoch": 0.8, "grad_norm": 105.07803344726562, "learning_rate": 0.0007341228375711134, "logits/chosen": -12.598342895507812, "logits/rejected": -12.566490173339844, "logps/chosen": -2193.105224609375, "logps/rejected": -1757.4508056640625, "loss": 28.1397, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -145.49972534179688, "rewards/margins": -18.716045379638672, "rewards/rejected": -126.78369140625, "step": 13890 }, { "epoch": 0.8, "grad_norm": 1.4885279675524998e-08, "learning_rate": 0.000733929331630481, "logits/chosen": -10.417140007019043, "logits/rejected": -10.851654052734375, "logps/chosen": -2771.410888671875, "logps/rejected": -2664.197265625, "loss": 5.0437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -45.36883544921875, "rewards/margins": 6.169020652770996, "rewards/rejected": -51.5378532409668, "step": 13900 }, { "epoch": 0.81, "grad_norm": 16.813270568847656, "learning_rate": 0.0007337358256898486, "logits/chosen": -12.947720527648926, "logits/rejected": -13.292341232299805, "logps/chosen": -2847.49560546875, "logps/rejected": -2468.58984375, "loss": 8.1906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -102.16303253173828, "rewards/margins": 1.6440467834472656, "rewards/rejected": -103.80709075927734, "step": 13910 }, { "epoch": 0.81, "grad_norm": 24.11977195739746, "learning_rate": 0.0007335423197492164, "logits/chosen": -12.93150806427002, "logits/rejected": -12.496967315673828, "logps/chosen": -2568.57275390625, "logps/rejected": -2220.8505859375, "loss": 21.232, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -215.3987579345703, "rewards/margins": -19.083393096923828, "rewards/rejected": -196.3153533935547, "step": 13920 }, { "epoch": 0.81, "grad_norm": 9.68286418914795, "learning_rate": 0.000733348813808584, "logits/chosen": -11.067231178283691, "logits/rejected": -11.176610946655273, "logps/chosen": -2540.4189453125, "logps/rejected": -2528.91162109375, "loss": 7.2791, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -114.92449951171875, "rewards/margins": -3.4871208667755127, "rewards/rejected": -111.4373779296875, "step": 13930 }, { "epoch": 0.81, "grad_norm": 1.084332666323462e-06, "learning_rate": 0.0007331553078679516, "logits/chosen": -11.305036544799805, "logits/rejected": -11.256010055541992, "logps/chosen": -2481.0029296875, "logps/rejected": -2343.140625, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -119.3401107788086, "rewards/margins": 16.413827896118164, "rewards/rejected": -135.75393676757812, "step": 13940 }, { "epoch": 0.81, "grad_norm": 18.96559715270996, "learning_rate": 0.0007329618019273192, "logits/chosen": -11.462063789367676, "logits/rejected": -11.468629837036133, "logps/chosen": -2523.091064453125, "logps/rejected": -2242.109619140625, "loss": 27.7851, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.99801635742188, "rewards/margins": -15.565150260925293, "rewards/rejected": -113.4328842163086, "step": 13950 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 0.0007327682959866868, "logits/chosen": -10.785675048828125, "logits/rejected": -10.817834854125977, "logps/chosen": -2751.730224609375, "logps/rejected": -2340.89453125, "loss": 17.8059, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -97.392822265625, "rewards/margins": -5.50475549697876, "rewards/rejected": -91.88805389404297, "step": 13960 }, { "epoch": 0.81, "grad_norm": 126.14649963378906, "learning_rate": 0.0007325747900460545, "logits/chosen": -11.990460395812988, "logits/rejected": -12.058363914489746, "logps/chosen": -3104.780029296875, "logps/rejected": -2748.00146484375, "loss": 13.1953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.3438262939453, "rewards/margins": 2.50162935256958, "rewards/rejected": -158.845458984375, "step": 13970 }, { "epoch": 0.81, "grad_norm": 51.928157806396484, "learning_rate": 0.0007323812841054221, "logits/chosen": -14.44251823425293, "logits/rejected": -14.279504776000977, "logps/chosen": -2153.161865234375, "logps/rejected": -2276.231689453125, "loss": 0.9247, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -182.51968383789062, "rewards/margins": 7.743551731109619, "rewards/rejected": -190.26321411132812, "step": 13980 }, { "epoch": 0.81, "grad_norm": 0.0014288456877693534, "learning_rate": 0.0007321877781647897, "logits/chosen": -14.253247261047363, "logits/rejected": -14.572454452514648, "logps/chosen": -2619.62451171875, "logps/rejected": -2685.969482421875, "loss": 6.1632, "rewards/accuracies": 0.5, "rewards/chosen": -222.40255737304688, "rewards/margins": 7.648809909820557, "rewards/rejected": -230.05136108398438, "step": 13990 }, { "epoch": 0.81, "grad_norm": 106.09148406982422, "learning_rate": 0.0007319942722241573, "logits/chosen": -14.698101997375488, "logits/rejected": -14.478357315063477, "logps/chosen": -2476.70458984375, "logps/rejected": -2110.566650390625, "loss": 29.0262, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -209.0789794921875, "rewards/margins": -25.778385162353516, "rewards/rejected": -183.3005828857422, "step": 14000 }, { "epoch": 0.81, "grad_norm": 1.1739349048411896e-08, "learning_rate": 0.0007318007662835249, "logits/chosen": -11.509283065795898, "logits/rejected": -11.833643913269043, "logps/chosen": -2507.735107421875, "logps/rejected": -2542.739013671875, "loss": 7.6841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -87.71895599365234, "rewards/margins": 9.64760971069336, "rewards/rejected": -97.36656951904297, "step": 14010 }, { "epoch": 0.81, "grad_norm": 0.001151924254372716, "learning_rate": 0.0007316072603428925, "logits/chosen": -12.509490013122559, "logits/rejected": -12.413652420043945, "logps/chosen": -2507.177001953125, "logps/rejected": -2278.91796875, "loss": 3.0973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.91520690917969, "rewards/margins": 10.126424789428711, "rewards/rejected": -109.0416259765625, "step": 14020 }, { "epoch": 0.81, "grad_norm": 77.83094787597656, "learning_rate": 0.0007314137544022603, "logits/chosen": -11.67180061340332, "logits/rejected": -11.750744819641113, "logps/chosen": -2750.19921875, "logps/rejected": -2626.98291015625, "loss": 9.8223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -85.58625030517578, "rewards/margins": -5.52054500579834, "rewards/rejected": -80.06570434570312, "step": 14030 }, { "epoch": 0.81, "grad_norm": 83.60035705566406, "learning_rate": 0.0007312202484616278, "logits/chosen": -13.202285766601562, "logits/rejected": -13.520495414733887, "logps/chosen": -2280.43994140625, "logps/rejected": -2177.67919921875, "loss": 18.6501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.1239471435547, "rewards/margins": -8.507231712341309, "rewards/rejected": -149.61672973632812, "step": 14040 }, { "epoch": 0.81, "grad_norm": 90.30083465576172, "learning_rate": 0.0007310267425209954, "logits/chosen": -12.214844703674316, "logits/rejected": -12.320188522338867, "logps/chosen": -2550.507080078125, "logps/rejected": -2185.627685546875, "loss": 11.9041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.72671508789062, "rewards/margins": -0.8374878168106079, "rewards/rejected": -180.88925170898438, "step": 14050 }, { "epoch": 0.81, "grad_norm": 136.9004364013672, "learning_rate": 0.000730833236580363, "logits/chosen": -12.508768081665039, "logits/rejected": -12.484228134155273, "logps/chosen": -2201.554931640625, "logps/rejected": -2064.78271484375, "loss": 10.1907, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -149.6538848876953, "rewards/margins": -1.9525940418243408, "rewards/rejected": -147.70130920410156, "step": 14060 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 0.0007306397306397306, "logits/chosen": -11.590435981750488, "logits/rejected": -11.64394474029541, "logps/chosen": -2612.31103515625, "logps/rejected": -2427.97607421875, "loss": 3.7641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.55412292480469, "rewards/margins": 22.491268157958984, "rewards/rejected": -142.04539489746094, "step": 14070 }, { "epoch": 0.82, "grad_norm": 0.043826840817928314, "learning_rate": 0.0007304462246990982, "logits/chosen": -12.453737258911133, "logits/rejected": -12.171780586242676, "logps/chosen": -2437.3935546875, "logps/rejected": -2512.04541015625, "loss": 1.9679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -100.91822052001953, "rewards/margins": 16.014686584472656, "rewards/rejected": -116.93290710449219, "step": 14080 }, { "epoch": 0.82, "grad_norm": 0.04547161981463432, "learning_rate": 0.0007302527187584659, "logits/chosen": -14.635965347290039, "logits/rejected": -14.786328315734863, "logps/chosen": -2365.59521484375, "logps/rejected": -2297.30615234375, "loss": 15.1605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.04110717773438, "rewards/margins": 9.989141464233398, "rewards/rejected": -145.03024291992188, "step": 14090 }, { "epoch": 0.82, "grad_norm": 0.0003247923741582781, "learning_rate": 0.0007300592128178335, "logits/chosen": -12.97523307800293, "logits/rejected": -12.81975269317627, "logps/chosen": -2464.37890625, "logps/rejected": -2411.91259765625, "loss": 11.4282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -137.88998413085938, "rewards/margins": -1.0631637573242188, "rewards/rejected": -136.82681274414062, "step": 14100 }, { "epoch": 0.82, "grad_norm": 4.8553056716918945, "learning_rate": 0.0007298657068772011, "logits/chosen": -12.465702056884766, "logits/rejected": -12.71064567565918, "logps/chosen": -2628.595947265625, "logps/rejected": -2629.7265625, "loss": 4.7864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -116.09932708740234, "rewards/margins": 4.040997505187988, "rewards/rejected": -120.14033508300781, "step": 14110 }, { "epoch": 0.82, "grad_norm": 0.0009959188755601645, "learning_rate": 0.0007296722009365687, "logits/chosen": -10.630697250366211, "logits/rejected": -10.496062278747559, "logps/chosen": -2679.353515625, "logps/rejected": -2561.31591796875, "loss": 2.4242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -91.55293273925781, "rewards/margins": 17.245393753051758, "rewards/rejected": -108.7983169555664, "step": 14120 }, { "epoch": 0.82, "grad_norm": 0.2466026097536087, "learning_rate": 0.0007294786949959364, "logits/chosen": -11.743645668029785, "logits/rejected": -11.593307495117188, "logps/chosen": -2340.763427734375, "logps/rejected": -2222.654052734375, "loss": 19.503, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -140.23284912109375, "rewards/margins": -16.661174774169922, "rewards/rejected": -123.5716781616211, "step": 14130 }, { "epoch": 0.82, "grad_norm": 0.9226416945457458, "learning_rate": 0.000729285189055304, "logits/chosen": -11.628419876098633, "logits/rejected": -11.6495361328125, "logps/chosen": -2530.06201171875, "logps/rejected": -2506.36083984375, "loss": 20.2739, "rewards/accuracies": 0.5, "rewards/chosen": -136.86767578125, "rewards/margins": -14.987607955932617, "rewards/rejected": -121.88008117675781, "step": 14140 }, { "epoch": 0.82, "grad_norm": 2.4239602088928223, "learning_rate": 0.0007290916831146717, "logits/chosen": -9.532051086425781, "logits/rejected": -9.602590560913086, "logps/chosen": -3021.446533203125, "logps/rejected": -3079.146240234375, "loss": 5.0889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.92994689941406, "rewards/margins": 8.33536148071289, "rewards/rejected": -138.2653045654297, "step": 14150 }, { "epoch": 0.82, "grad_norm": 5.114185910315427e-07, "learning_rate": 0.0007288981771740393, "logits/chosen": -12.027247428894043, "logits/rejected": -12.101613998413086, "logps/chosen": -3038.776123046875, "logps/rejected": -2993.013916015625, "loss": 11.1355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.85324096679688, "rewards/margins": -5.4550933837890625, "rewards/rejected": -151.3981475830078, "step": 14160 }, { "epoch": 0.82, "grad_norm": 83.41696166992188, "learning_rate": 0.0007287046712334069, "logits/chosen": -14.25847339630127, "logits/rejected": -14.930795669555664, "logps/chosen": -2383.25830078125, "logps/rejected": -2120.079833984375, "loss": 17.9734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -178.27281188964844, "rewards/margins": -10.224588394165039, "rewards/rejected": -168.04820251464844, "step": 14170 }, { "epoch": 0.82, "grad_norm": 74.11482238769531, "learning_rate": 0.0007285111652927745, "logits/chosen": -15.745635986328125, "logits/rejected": -16.734569549560547, "logps/chosen": -2470.947021484375, "logps/rejected": -2456.584716796875, "loss": 1.7046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.88589477539062, "rewards/margins": 16.02134132385254, "rewards/rejected": -188.9072265625, "step": 14180 }, { "epoch": 0.82, "grad_norm": 31.875715255737305, "learning_rate": 0.0007283176593521421, "logits/chosen": -14.37989330291748, "logits/rejected": -15.086918830871582, "logps/chosen": -3010.816650390625, "logps/rejected": -2823.48193359375, "loss": 16.0991, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -161.13096618652344, "rewards/margins": -12.409706115722656, "rewards/rejected": -148.7212677001953, "step": 14190 }, { "epoch": 0.82, "grad_norm": 8.138267837409785e-11, "learning_rate": 0.0007281241534115098, "logits/chosen": -13.775593757629395, "logits/rejected": -13.940267562866211, "logps/chosen": -2908.97900390625, "logps/rejected": -2736.68701171875, "loss": 5.5283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.48721313476562, "rewards/margins": 7.010653495788574, "rewards/rejected": -155.49789428710938, "step": 14200 }, { "epoch": 0.82, "grad_norm": 1.7204464242137618e-10, "learning_rate": 0.0007279306474708774, "logits/chosen": -12.644800186157227, "logits/rejected": -13.090864181518555, "logps/chosen": -2195.89599609375, "logps/rejected": -2113.605224609375, "loss": 5.5844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -113.61988830566406, "rewards/margins": 6.2895917892456055, "rewards/rejected": -119.90946197509766, "step": 14210 }, { "epoch": 0.82, "grad_norm": 4.2595938793965615e-06, "learning_rate": 0.000727737141530245, "logits/chosen": -15.23560619354248, "logits/rejected": -15.03980827331543, "logps/chosen": -2248.35009765625, "logps/rejected": -2306.62158203125, "loss": 6.1038, "rewards/accuracies": 0.5, "rewards/chosen": -157.74813842773438, "rewards/margins": 10.459922790527344, "rewards/rejected": -168.20803833007812, "step": 14220 }, { "epoch": 0.82, "grad_norm": 3.5027991618363785e-09, "learning_rate": 0.0007275436355896126, "logits/chosen": -12.00969123840332, "logits/rejected": -12.350115776062012, "logps/chosen": -2560.532958984375, "logps/rejected": -2734.373046875, "loss": 7.7629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.75565338134766, "rewards/margins": 0.44971999526023865, "rewards/rejected": -92.20536804199219, "step": 14230 }, { "epoch": 0.82, "grad_norm": 107.7070541381836, "learning_rate": 0.0007273501296489803, "logits/chosen": -12.333928108215332, "logits/rejected": -12.71214485168457, "logps/chosen": -2544.080322265625, "logps/rejected": -2682.57958984375, "loss": 11.7087, "rewards/accuracies": 0.5, "rewards/chosen": -176.20498657226562, "rewards/margins": 1.2410377264022827, "rewards/rejected": -177.44601440429688, "step": 14240 }, { "epoch": 0.82, "grad_norm": 84.55343627929688, "learning_rate": 0.000727156623708348, "logits/chosen": -11.84675407409668, "logits/rejected": -12.284927368164062, "logps/chosen": -2695.505126953125, "logps/rejected": -2651.72216796875, "loss": 14.3171, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -161.35704040527344, "rewards/margins": -3.125689744949341, "rewards/rejected": -158.23135375976562, "step": 14250 }, { "epoch": 0.83, "grad_norm": 84.36238861083984, "learning_rate": 0.0007269631177677155, "logits/chosen": -10.548553466796875, "logits/rejected": -10.244100570678711, "logps/chosen": -2900.4765625, "logps/rejected": -2786.58642578125, "loss": 4.0146, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -98.15232849121094, "rewards/margins": 0.6383743286132812, "rewards/rejected": -98.79069519042969, "step": 14260 }, { "epoch": 0.83, "grad_norm": 173.27052307128906, "learning_rate": 0.0007267696118270831, "logits/chosen": -9.789865493774414, "logits/rejected": -10.79810905456543, "logps/chosen": -2915.37744140625, "logps/rejected": -2288.626708984375, "loss": 33.3259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.32302856445312, "rewards/margins": -21.72348403930664, "rewards/rejected": -120.59954833984375, "step": 14270 }, { "epoch": 0.83, "grad_norm": 6.894757916597882e-06, "learning_rate": 0.0007265761058864507, "logits/chosen": -9.812278747558594, "logits/rejected": -9.452171325683594, "logps/chosen": -3164.760986328125, "logps/rejected": -2745.9443359375, "loss": 8.4245, "rewards/accuracies": 0.5, "rewards/chosen": -51.96289825439453, "rewards/margins": -1.234398603439331, "rewards/rejected": -50.72849655151367, "step": 14280 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 0.0007263825999458183, "logits/chosen": -13.975090026855469, "logits/rejected": -13.979782104492188, "logps/chosen": -2481.394287109375, "logps/rejected": -1996.9761962890625, "loss": 4.0485, "rewards/accuracies": 0.5, "rewards/chosen": -115.5888900756836, "rewards/margins": 16.826297760009766, "rewards/rejected": -132.41517639160156, "step": 14290 }, { "epoch": 0.83, "grad_norm": 137.30648803710938, "learning_rate": 0.0007261890940051859, "logits/chosen": -15.484098434448242, "logits/rejected": -15.988096237182617, "logps/chosen": -2470.18359375, "logps/rejected": -2324.063232421875, "loss": 23.6202, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -116.59156799316406, "rewards/margins": -21.327749252319336, "rewards/rejected": -95.2638168334961, "step": 14300 }, { "epoch": 0.83, "grad_norm": 9.486691851634532e-05, "learning_rate": 0.0007259955880645535, "logits/chosen": -18.199522018432617, "logits/rejected": -18.907062530517578, "logps/chosen": -2447.455810546875, "logps/rejected": -2129.09326171875, "loss": 4.0181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.32156372070312, "rewards/margins": 10.654725074768066, "rewards/rejected": -185.97628784179688, "step": 14310 }, { "epoch": 0.83, "grad_norm": 84.8342056274414, "learning_rate": 0.0007258020821239212, "logits/chosen": -13.974966049194336, "logits/rejected": -14.350252151489258, "logps/chosen": -2417.51416015625, "logps/rejected": -2433.977783203125, "loss": 15.1961, "rewards/accuracies": 0.5, "rewards/chosen": -179.73788452148438, "rewards/margins": 2.503556489944458, "rewards/rejected": -182.24142456054688, "step": 14320 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 0.0007256085761832888, "logits/chosen": -13.18379020690918, "logits/rejected": -13.345903396606445, "logps/chosen": -2415.572021484375, "logps/rejected": -2413.618896484375, "loss": 2.632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -132.63424682617188, "rewards/margins": 14.329591751098633, "rewards/rejected": -146.96383666992188, "step": 14330 }, { "epoch": 0.83, "grad_norm": 6.633954762946814e-05, "learning_rate": 0.0007254150702426565, "logits/chosen": -10.512763977050781, "logits/rejected": -11.132074356079102, "logps/chosen": -2908.40625, "logps/rejected": -2668.59130859375, "loss": 2.1654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -80.0005874633789, "rewards/margins": 11.780677795410156, "rewards/rejected": -91.78124237060547, "step": 14340 }, { "epoch": 0.83, "grad_norm": 47.6845703125, "learning_rate": 0.0007252215643020241, "logits/chosen": -13.430368423461914, "logits/rejected": -13.530474662780762, "logps/chosen": -2435.3876953125, "logps/rejected": -1979.474609375, "loss": 23.6102, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -144.98806762695312, "rewards/margins": -10.506887435913086, "rewards/rejected": -134.48118591308594, "step": 14350 }, { "epoch": 0.83, "grad_norm": 84.23249816894531, "learning_rate": 0.0007250280583613917, "logits/chosen": -15.180532455444336, "logits/rejected": -15.849309921264648, "logps/chosen": -2229.08642578125, "logps/rejected": -1955.5833740234375, "loss": 14.6257, "rewards/accuracies": 0.5, "rewards/chosen": -162.55894470214844, "rewards/margins": -10.119340896606445, "rewards/rejected": -152.43960571289062, "step": 14360 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 0.0007248345524207594, "logits/chosen": -14.757098197937012, "logits/rejected": -14.270840644836426, "logps/chosen": -2489.93701171875, "logps/rejected": -1911.9566650390625, "loss": 13.9592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -78.2021713256836, "rewards/margins": 7.878096580505371, "rewards/rejected": -86.08026123046875, "step": 14370 }, { "epoch": 0.83, "grad_norm": 0.09520422667264938, "learning_rate": 0.000724641046480127, "logits/chosen": -14.576184272766113, "logits/rejected": -14.798538208007812, "logps/chosen": -2896.657470703125, "logps/rejected": -2502.012939453125, "loss": 12.6632, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -112.9997329711914, "rewards/margins": -2.2057430744171143, "rewards/rejected": -110.79399108886719, "step": 14380 }, { "epoch": 0.83, "grad_norm": 75.75411224365234, "learning_rate": 0.0007244475405394946, "logits/chosen": -13.733667373657227, "logits/rejected": -14.505975723266602, "logps/chosen": -3030.703125, "logps/rejected": -2359.274169921875, "loss": 27.7318, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -153.1499481201172, "rewards/margins": -20.817363739013672, "rewards/rejected": -132.3325958251953, "step": 14390 }, { "epoch": 0.83, "grad_norm": 88.23775482177734, "learning_rate": 0.0007242540345988622, "logits/chosen": -13.131919860839844, "logits/rejected": -13.7257080078125, "logps/chosen": -2544.19677734375, "logps/rejected": -2262.568603515625, "loss": 19.0571, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -161.778076171875, "rewards/margins": -13.091041564941406, "rewards/rejected": -148.68704223632812, "step": 14400 }, { "epoch": 0.83, "grad_norm": 434.29583740234375, "learning_rate": 0.0007240605286582298, "logits/chosen": -13.82603931427002, "logits/rejected": -13.640607833862305, "logps/chosen": -3449.31982421875, "logps/rejected": -2730.133056640625, "loss": 74.4033, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -270.40643310546875, "rewards/margins": -67.35787200927734, "rewards/rejected": -203.0485382080078, "step": 14410 }, { "epoch": 0.83, "grad_norm": 0.3477019965648651, "learning_rate": 0.0007238670227175974, "logits/chosen": -11.539373397827148, "logits/rejected": -12.730805397033691, "logps/chosen": -4163.96826171875, "logps/rejected": -2583.385009765625, "loss": 143.9838, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -314.83123779296875, "rewards/margins": -143.48731994628906, "rewards/rejected": -171.34396362304688, "step": 14420 }, { "epoch": 0.84, "grad_norm": 9.09193387244045e-10, "learning_rate": 0.0007236735167769651, "logits/chosen": -12.829679489135742, "logits/rejected": -12.833635330200195, "logps/chosen": -2808.808837890625, "logps/rejected": -2262.218994140625, "loss": 4.9572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.02716064453125, "rewards/margins": 6.36676549911499, "rewards/rejected": -136.3939208984375, "step": 14430 }, { "epoch": 0.84, "grad_norm": 49.5170783996582, "learning_rate": 0.0007234800108363327, "logits/chosen": -14.535226821899414, "logits/rejected": -14.694610595703125, "logps/chosen": -2722.66943359375, "logps/rejected": -2270.260986328125, "loss": 15.5457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.55105590820312, "rewards/margins": -8.5330228805542, "rewards/rejected": -151.0180206298828, "step": 14440 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 0.0007232865048957004, "logits/chosen": -13.185785293579102, "logits/rejected": -13.086311340332031, "logps/chosen": -2190.7568359375, "logps/rejected": -2068.69189453125, "loss": 3.0343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.2632064819336, "rewards/margins": 9.21944522857666, "rewards/rejected": -126.48265075683594, "step": 14450 }, { "epoch": 0.84, "grad_norm": 3.6404914913662273e-16, "learning_rate": 0.000723092998955068, "logits/chosen": -11.418015480041504, "logits/rejected": -11.422849655151367, "logps/chosen": -2831.555908203125, "logps/rejected": -2529.7236328125, "loss": 10.6514, "rewards/accuracies": 0.5, "rewards/chosen": -103.70462799072266, "rewards/margins": 4.27023458480835, "rewards/rejected": -107.974853515625, "step": 14460 }, { "epoch": 0.84, "grad_norm": 2.585359913146964e-15, "learning_rate": 0.0007228994930144355, "logits/chosen": -11.958150863647461, "logits/rejected": -11.818021774291992, "logps/chosen": -2447.8603515625, "logps/rejected": -2137.888427734375, "loss": 3.4182, "rewards/accuracies": 0.5, "rewards/chosen": -103.05839538574219, "rewards/margins": 8.28563404083252, "rewards/rejected": -111.34403228759766, "step": 14470 }, { "epoch": 0.84, "grad_norm": 81.29000854492188, "learning_rate": 0.0007227059870738031, "logits/chosen": -12.105875015258789, "logits/rejected": -11.960567474365234, "logps/chosen": -2586.405029296875, "logps/rejected": -2400.6416015625, "loss": 16.9599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.19424438476562, "rewards/margins": 4.054971218109131, "rewards/rejected": -159.24920654296875, "step": 14480 }, { "epoch": 0.84, "grad_norm": 84.35519409179688, "learning_rate": 0.0007225124811331708, "logits/chosen": -15.335630416870117, "logits/rejected": -16.32976531982422, "logps/chosen": -2678.406005859375, "logps/rejected": -2466.58349609375, "loss": 20.0473, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.95540618896484, "rewards/margins": -14.63764476776123, "rewards/rejected": -109.31776428222656, "step": 14490 }, { "epoch": 0.84, "grad_norm": 87.9269027709961, "learning_rate": 0.0007223189751925384, "logits/chosen": -14.70849895477295, "logits/rejected": -15.01036548614502, "logps/chosen": -2812.37158203125, "logps/rejected": -2712.62939453125, "loss": 11.549, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -170.5377655029297, "rewards/margins": -5.767727851867676, "rewards/rejected": -164.77005004882812, "step": 14500 }, { "epoch": 0.84, "grad_norm": 8.108591810144989e-18, "learning_rate": 0.000722125469251906, "logits/chosen": -12.859312057495117, "logits/rejected": -12.910661697387695, "logps/chosen": -2859.572998046875, "logps/rejected": -2785.17529296875, "loss": 6.4901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.14157104492188, "rewards/margins": 7.671913146972656, "rewards/rejected": -181.81349182128906, "step": 14510 }, { "epoch": 0.84, "grad_norm": 95.27892303466797, "learning_rate": 0.0007219319633112736, "logits/chosen": -15.1962251663208, "logits/rejected": -14.779687881469727, "logps/chosen": -2571.45458984375, "logps/rejected": -2361.837646484375, "loss": 3.3853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.76718139648438, "rewards/margins": 6.632696628570557, "rewards/rejected": -191.39988708496094, "step": 14520 }, { "epoch": 0.84, "grad_norm": 0.00013757178385276347, "learning_rate": 0.0007217384573706412, "logits/chosen": -15.522562980651855, "logits/rejected": -15.274151802062988, "logps/chosen": -2254.843505859375, "logps/rejected": -2230.95361328125, "loss": 8.9219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.1708526611328, "rewards/margins": 4.771066665649414, "rewards/rejected": -160.94192504882812, "step": 14530 }, { "epoch": 0.84, "grad_norm": 23.483980178833008, "learning_rate": 0.0007215449514300088, "logits/chosen": -15.338674545288086, "logits/rejected": -15.272684097290039, "logps/chosen": -2447.178955078125, "logps/rejected": -2134.37353515625, "loss": 4.3098, "rewards/accuracies": 0.5, "rewards/chosen": -140.63949584960938, "rewards/margins": 6.3959455490112305, "rewards/rejected": -147.0354461669922, "step": 14540 }, { "epoch": 0.84, "grad_norm": 51.8170166015625, "learning_rate": 0.0007213514454893766, "logits/chosen": -13.656763076782227, "logits/rejected": -13.533742904663086, "logps/chosen": -2455.484375, "logps/rejected": -2415.77099609375, "loss": 4.5615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -89.15655517578125, "rewards/margins": 7.738165855407715, "rewards/rejected": -96.89472198486328, "step": 14550 }, { "epoch": 0.84, "grad_norm": 2.430764330053556e-14, "learning_rate": 0.0007211579395487442, "logits/chosen": -12.240983963012695, "logits/rejected": -12.299960136413574, "logps/chosen": -2579.6640625, "logps/rejected": -2521.82080078125, "loss": 6.6089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -76.50045013427734, "rewards/margins": 12.64045524597168, "rewards/rejected": -89.14090728759766, "step": 14560 }, { "epoch": 0.84, "grad_norm": 0.07342910766601562, "learning_rate": 0.0007209644336081118, "logits/chosen": -14.3190336227417, "logits/rejected": -14.216471672058105, "logps/chosen": -2390.90087890625, "logps/rejected": -2168.842529296875, "loss": 20.9833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.15432739257812, "rewards/margins": -11.91357707977295, "rewards/rejected": -134.24075317382812, "step": 14570 }, { "epoch": 0.84, "grad_norm": 3.330420028646586e-08, "learning_rate": 0.0007207709276674794, "logits/chosen": -12.167903900146484, "logits/rejected": -12.160469055175781, "logps/chosen": -2285.964111328125, "logps/rejected": -1993.091796875, "loss": 16.1775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.7474136352539, "rewards/margins": 0.34538689255714417, "rewards/rejected": -70.09280395507812, "step": 14580 }, { "epoch": 0.84, "grad_norm": 7.407950033666566e-05, "learning_rate": 0.000720577421726847, "logits/chosen": -11.885479927062988, "logits/rejected": -11.712082862854004, "logps/chosen": -2516.530029296875, "logps/rejected": -2363.896240234375, "loss": 14.7865, "rewards/accuracies": 0.5, "rewards/chosen": -133.47166442871094, "rewards/margins": -4.194116115570068, "rewards/rejected": -129.27755737304688, "step": 14590 }, { "epoch": 0.85, "grad_norm": 72.1185302734375, "learning_rate": 0.0007203839157862147, "logits/chosen": -13.487306594848633, "logits/rejected": -13.38879680633545, "logps/chosen": -2319.41259765625, "logps/rejected": -2286.159423828125, "loss": 8.4298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.08253479003906, "rewards/margins": 0.5582855343818665, "rewards/rejected": -166.64080810546875, "step": 14600 }, { "epoch": 0.85, "grad_norm": 137.3105926513672, "learning_rate": 0.0007201904098455823, "logits/chosen": -11.054709434509277, "logits/rejected": -10.727020263671875, "logps/chosen": -2885.93017578125, "logps/rejected": -2991.0556640625, "loss": 16.1593, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -148.30711364746094, "rewards/margins": -12.9027738571167, "rewards/rejected": -135.40435791015625, "step": 14610 }, { "epoch": 0.85, "grad_norm": 23.863418579101562, "learning_rate": 0.0007199969039049499, "logits/chosen": -10.528918266296387, "logits/rejected": -10.831323623657227, "logps/chosen": -2912.282958984375, "logps/rejected": -2856.7001953125, "loss": 8.6705, "rewards/accuracies": 0.5, "rewards/chosen": -174.20310974121094, "rewards/margins": -6.857891082763672, "rewards/rejected": -167.34519958496094, "step": 14620 }, { "epoch": 0.85, "grad_norm": 58.69261169433594, "learning_rate": 0.0007198033979643175, "logits/chosen": -12.981328964233398, "logits/rejected": -13.038151741027832, "logps/chosen": -2711.18115234375, "logps/rejected": -2298.45751953125, "loss": 29.5412, "rewards/accuracies": 0.5, "rewards/chosen": -129.7305145263672, "rewards/margins": -19.7653865814209, "rewards/rejected": -109.96513366699219, "step": 14630 }, { "epoch": 0.85, "grad_norm": 98.52741241455078, "learning_rate": 0.0007196098920236851, "logits/chosen": -12.614656448364258, "logits/rejected": -12.481109619140625, "logps/chosen": -2747.43701171875, "logps/rejected": -2361.020751953125, "loss": 25.988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -135.42352294921875, "rewards/margins": -16.2562255859375, "rewards/rejected": -119.16729736328125, "step": 14640 }, { "epoch": 0.85, "grad_norm": 47.70903015136719, "learning_rate": 0.0007194163860830527, "logits/chosen": -11.442076683044434, "logits/rejected": -11.403453826904297, "logps/chosen": -2656.23291015625, "logps/rejected": -2445.913330078125, "loss": 15.2813, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -118.5823974609375, "rewards/margins": -9.982222557067871, "rewards/rejected": -108.60018157958984, "step": 14650 }, { "epoch": 0.85, "grad_norm": 4.164352560565021e-07, "learning_rate": 0.0007192228801424205, "logits/chosen": -10.102628707885742, "logits/rejected": -10.017208099365234, "logps/chosen": -2600.82373046875, "logps/rejected": -2195.57763671875, "loss": 1.2753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -86.48283386230469, "rewards/margins": 16.867176055908203, "rewards/rejected": -103.35001373291016, "step": 14660 }, { "epoch": 0.85, "grad_norm": 37.83590316772461, "learning_rate": 0.0007190293742017881, "logits/chosen": -9.594494819641113, "logits/rejected": -9.135393142700195, "logps/chosen": -2487.1181640625, "logps/rejected": -2403.525390625, "loss": 8.041, "rewards/accuracies": 0.5, "rewards/chosen": -71.06736755371094, "rewards/margins": -3.052812099456787, "rewards/rejected": -68.01454162597656, "step": 14670 }, { "epoch": 0.85, "grad_norm": 0.00021704052051063627, "learning_rate": 0.0007188358682611557, "logits/chosen": -10.169172286987305, "logits/rejected": -10.076009750366211, "logps/chosen": -2674.6826171875, "logps/rejected": -2482.114013671875, "loss": 4.7902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -90.16648864746094, "rewards/margins": 0.9453128576278687, "rewards/rejected": -91.11180114746094, "step": 14680 }, { "epoch": 0.85, "grad_norm": 7.390919812473064e-15, "learning_rate": 0.0007186423623205232, "logits/chosen": -10.893807411193848, "logits/rejected": -10.870145797729492, "logps/chosen": -2757.00048828125, "logps/rejected": -2177.22314453125, "loss": 37.609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.7669448852539, "rewards/margins": -29.154041290283203, "rewards/rejected": -89.61289978027344, "step": 14690 }, { "epoch": 0.85, "grad_norm": 59.072444915771484, "learning_rate": 0.0007184488563798908, "logits/chosen": -13.86260986328125, "logits/rejected": -13.898574829101562, "logps/chosen": -2717.390625, "logps/rejected": -2655.24853515625, "loss": 10.8123, "rewards/accuracies": 0.5, "rewards/chosen": -138.77371215820312, "rewards/margins": -3.5259265899658203, "rewards/rejected": -135.24777221679688, "step": 14700 }, { "epoch": 0.85, "grad_norm": 137.95130920410156, "learning_rate": 0.0007182553504392584, "logits/chosen": -13.568913459777832, "logits/rejected": -13.577554702758789, "logps/chosen": -2886.45361328125, "logps/rejected": -2909.60791015625, "loss": 19.5066, "rewards/accuracies": 0.5, "rewards/chosen": -158.1844024658203, "rewards/margins": -9.99042797088623, "rewards/rejected": -148.19395446777344, "step": 14710 }, { "epoch": 0.85, "grad_norm": 193.31622314453125, "learning_rate": 0.0007180618444986261, "logits/chosen": -14.447450637817383, "logits/rejected": -14.357831954956055, "logps/chosen": -2650.363037109375, "logps/rejected": -2475.21044921875, "loss": 28.6101, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.13821411132812, "rewards/margins": -21.882047653198242, "rewards/rejected": -152.25616455078125, "step": 14720 }, { "epoch": 0.85, "grad_norm": 71.97062683105469, "learning_rate": 0.0007178683385579937, "logits/chosen": -11.756772994995117, "logits/rejected": -11.551132202148438, "logps/chosen": -3133.56396484375, "logps/rejected": -2650.898193359375, "loss": 4.8808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.7205352783203, "rewards/margins": 12.270647048950195, "rewards/rejected": -148.99118041992188, "step": 14730 }, { "epoch": 0.85, "grad_norm": 0.0022454478312283754, "learning_rate": 0.0007176748326173613, "logits/chosen": -13.581092834472656, "logits/rejected": -13.466694831848145, "logps/chosen": -2518.65869140625, "logps/rejected": -2210.07861328125, "loss": 6.9212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -77.93899536132812, "rewards/margins": 2.097123384475708, "rewards/rejected": -80.0361099243164, "step": 14740 }, { "epoch": 0.85, "grad_norm": 1.2439710594946973e-08, "learning_rate": 0.0007174813266767289, "logits/chosen": -13.994035720825195, "logits/rejected": -13.837285995483398, "logps/chosen": -2623.3369140625, "logps/rejected": -2545.205078125, "loss": 2.5757, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -118.85272216796875, "rewards/margins": 27.45647621154785, "rewards/rejected": -146.30918884277344, "step": 14750 }, { "epoch": 0.85, "grad_norm": 7.67470210782462e-15, "learning_rate": 0.0007172878207360966, "logits/chosen": -14.167932510375977, "logits/rejected": -14.510385513305664, "logps/chosen": -2737.5166015625, "logps/rejected": -2623.98193359375, "loss": 1.6336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.13656616210938, "rewards/margins": 14.025650024414062, "rewards/rejected": -143.16221618652344, "step": 14760 }, { "epoch": 0.85, "grad_norm": 139.9106903076172, "learning_rate": 0.0007170943147954643, "logits/chosen": -10.251974105834961, "logits/rejected": -10.124876022338867, "logps/chosen": -2559.6689453125, "logps/rejected": -2656.945068359375, "loss": 8.8569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.23227310180664, "rewards/margins": 9.123834609985352, "rewards/rejected": -31.356109619140625, "step": 14770 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 0.0007169008088548319, "logits/chosen": -13.930689811706543, "logits/rejected": -13.839323043823242, "logps/chosen": -2699.26123046875, "logps/rejected": -2747.23486328125, "loss": 2.415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.17301177978516, "rewards/margins": 9.853245735168457, "rewards/rejected": -137.02626037597656, "step": 14780 }, { "epoch": 0.86, "grad_norm": 0.12779520452022552, "learning_rate": 0.0007167073029141995, "logits/chosen": -11.859640121459961, "logits/rejected": -12.181784629821777, "logps/chosen": -3112.23974609375, "logps/rejected": -3131.128173828125, "loss": 5.805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -130.52923583984375, "rewards/margins": 6.7220563888549805, "rewards/rejected": -137.25128173828125, "step": 14790 }, { "epoch": 0.86, "grad_norm": 3.823242150247097e-05, "learning_rate": 0.0007165137969735671, "logits/chosen": -16.366790771484375, "logits/rejected": -16.00379753112793, "logps/chosen": -2673.989013671875, "logps/rejected": -2609.46044921875, "loss": 5.6448, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.99038696289062, "rewards/margins": 2.7442288398742676, "rewards/rejected": -210.734619140625, "step": 14800 }, { "epoch": 0.86, "grad_norm": 2.1468875566199586e-09, "learning_rate": 0.0007163202910329347, "logits/chosen": -13.349166870117188, "logits/rejected": -13.003950119018555, "logps/chosen": -2847.2646484375, "logps/rejected": -2575.243408203125, "loss": 18.0305, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -112.660400390625, "rewards/margins": -10.340578079223633, "rewards/rejected": -102.31980895996094, "step": 14810 }, { "epoch": 0.86, "grad_norm": 1.3738933546392218e-07, "learning_rate": 0.0007161267850923023, "logits/chosen": -12.857072830200195, "logits/rejected": -13.331846237182617, "logps/chosen": -2456.090576171875, "logps/rejected": -2604.76220703125, "loss": 7.5255, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -163.5978240966797, "rewards/margins": 0.6549835205078125, "rewards/rejected": -164.25279235839844, "step": 14820 }, { "epoch": 0.86, "grad_norm": 7.6613664627075195, "learning_rate": 0.00071593327915167, "logits/chosen": -11.654672622680664, "logits/rejected": -11.6210355758667, "logps/chosen": -2727.90966796875, "logps/rejected": -2515.032958984375, "loss": 7.522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.81068420410156, "rewards/margins": 0.5607234835624695, "rewards/rejected": -129.3714141845703, "step": 14830 }, { "epoch": 0.86, "grad_norm": 443.1741943359375, "learning_rate": 0.0007157397732110376, "logits/chosen": -11.975582122802734, "logits/rejected": -11.844925880432129, "logps/chosen": -2729.39013671875, "logps/rejected": -3174.23388671875, "loss": 13.2775, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -62.16133499145508, "rewards/margins": -6.414182186126709, "rewards/rejected": -55.74715042114258, "step": 14840 }, { "epoch": 0.86, "grad_norm": 1.5096755789922867e-10, "learning_rate": 0.0007155462672704052, "logits/chosen": -16.829126358032227, "logits/rejected": -16.76603889465332, "logps/chosen": -2639.096435546875, "logps/rejected": -2704.415283203125, "loss": 18.9469, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.8594512939453, "rewards/margins": -9.245333671569824, "rewards/rejected": -164.61410522460938, "step": 14850 }, { "epoch": 0.86, "grad_norm": 0.012451579794287682, "learning_rate": 0.0007153527613297729, "logits/chosen": -15.0276517868042, "logits/rejected": -14.850563049316406, "logps/chosen": -2978.341796875, "logps/rejected": -2884.682373046875, "loss": 13.7351, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -170.19619750976562, "rewards/margins": -7.880253791809082, "rewards/rejected": -162.31594848632812, "step": 14860 }, { "epoch": 0.86, "grad_norm": 65.9451675415039, "learning_rate": 0.0007151592553891405, "logits/chosen": -16.699331283569336, "logits/rejected": -16.745031356811523, "logps/chosen": -2453.12255859375, "logps/rejected": -2353.25439453125, "loss": 6.3189, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -209.75619506835938, "rewards/margins": -0.4786834716796875, "rewards/rejected": -209.2775421142578, "step": 14870 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 0.0007149657494485082, "logits/chosen": -15.135665893554688, "logits/rejected": -15.167119026184082, "logps/chosen": -2801.087646484375, "logps/rejected": -2389.032958984375, "loss": 2.361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -132.626953125, "rewards/margins": 30.5685977935791, "rewards/rejected": -163.19552612304688, "step": 14880 }, { "epoch": 0.86, "grad_norm": 8.550089455883203e-12, "learning_rate": 0.0007147722435078758, "logits/chosen": -14.177032470703125, "logits/rejected": -14.086709976196289, "logps/chosen": -2867.837646484375, "logps/rejected": -2643.493896484375, "loss": 2.0424, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -139.11695861816406, "rewards/margins": 8.73676586151123, "rewards/rejected": -147.8537139892578, "step": 14890 }, { "epoch": 0.86, "grad_norm": 114.22180938720703, "learning_rate": 0.0007145787375672434, "logits/chosen": -14.568618774414062, "logits/rejected": -14.875231742858887, "logps/chosen": -2017.185791015625, "logps/rejected": -2466.873779296875, "loss": 5.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.48121643066406, "rewards/margins": 27.658458709716797, "rewards/rejected": -199.13967895507812, "step": 14900 }, { "epoch": 0.86, "grad_norm": 45.892250061035156, "learning_rate": 0.0007143852316266109, "logits/chosen": -13.954302787780762, "logits/rejected": -14.311358451843262, "logps/chosen": -2948.39013671875, "logps/rejected": -2697.1533203125, "loss": 9.1382, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -107.9586181640625, "rewards/margins": -4.408856391906738, "rewards/rejected": -103.54975891113281, "step": 14910 }, { "epoch": 0.86, "grad_norm": 3.6326911449432373, "learning_rate": 0.0007141917256859785, "logits/chosen": -12.12981128692627, "logits/rejected": -11.83900260925293, "logps/chosen": -2628.461181640625, "logps/rejected": -2576.66162109375, "loss": 4.3726, "rewards/accuracies": 0.5, "rewards/chosen": -144.4381866455078, "rewards/margins": 2.9166152477264404, "rewards/rejected": -147.3548126220703, "step": 14920 }, { "epoch": 0.86, "grad_norm": 0.0036184119526296854, "learning_rate": 0.0007139982197453461, "logits/chosen": -12.606992721557617, "logits/rejected": -12.377174377441406, "logps/chosen": -2646.517333984375, "logps/rejected": -2503.656982421875, "loss": 11.9407, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -155.35470581054688, "rewards/margins": -8.06104564666748, "rewards/rejected": -147.29367065429688, "step": 14930 }, { "epoch": 0.86, "grad_norm": 6.33540781549275e-21, "learning_rate": 0.0007138047138047138, "logits/chosen": -11.170024871826172, "logits/rejected": -11.086504936218262, "logps/chosen": -2741.53564453125, "logps/rejected": -2220.93310546875, "loss": 27.1403, "rewards/accuracies": 0.5, "rewards/chosen": -128.7738800048828, "rewards/margins": -15.002367973327637, "rewards/rejected": -113.77149963378906, "step": 14940 }, { "epoch": 0.87, "grad_norm": 97.31742858886719, "learning_rate": 0.0007136112078640814, "logits/chosen": -12.764422416687012, "logits/rejected": -12.864145278930664, "logps/chosen": -2692.52880859375, "logps/rejected": -2658.3154296875, "loss": 4.2556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.40321350097656, "rewards/margins": 3.733407974243164, "rewards/rejected": -136.13662719726562, "step": 14950 }, { "epoch": 0.87, "grad_norm": 0.01557180006057024, "learning_rate": 0.000713417701923449, "logits/chosen": -12.940312385559082, "logits/rejected": -13.267333984375, "logps/chosen": -2502.81884765625, "logps/rejected": -2265.42333984375, "loss": 1.3703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.04049682617188, "rewards/margins": 10.045549392700195, "rewards/rejected": -148.08602905273438, "step": 14960 }, { "epoch": 0.87, "grad_norm": 1.975708619283978e-05, "learning_rate": 0.0007132241959828167, "logits/chosen": -14.082287788391113, "logits/rejected": -14.004963874816895, "logps/chosen": -2314.65380859375, "logps/rejected": -2013.176513671875, "loss": 30.3911, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -195.3883056640625, "rewards/margins": -24.82598876953125, "rewards/rejected": -170.5623016357422, "step": 14970 }, { "epoch": 0.87, "grad_norm": 0.005389743018895388, "learning_rate": 0.0007130306900421843, "logits/chosen": -10.315892219543457, "logits/rejected": -10.456077575683594, "logps/chosen": -2731.16064453125, "logps/rejected": -2282.473876953125, "loss": 8.0965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -96.9106674194336, "rewards/margins": 19.378437042236328, "rewards/rejected": -116.28910827636719, "step": 14980 }, { "epoch": 0.87, "grad_norm": 4.0846339288691524e-07, "learning_rate": 0.000712837184101552, "logits/chosen": -11.919556617736816, "logits/rejected": -12.013189315795898, "logps/chosen": -2726.745361328125, "logps/rejected": -2716.586669921875, "loss": 1.6447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.7106170654297, "rewards/margins": 11.474678039550781, "rewards/rejected": -147.185302734375, "step": 14990 }, { "epoch": 0.87, "grad_norm": 118.28707122802734, "learning_rate": 0.0007126436781609196, "logits/chosen": -11.61884593963623, "logits/rejected": -11.65909481048584, "logps/chosen": -2744.913818359375, "logps/rejected": -2286.94384765625, "loss": 12.3973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.86024475097656, "rewards/margins": -0.8154258728027344, "rewards/rejected": -132.04483032226562, "step": 15000 }, { "epoch": 0.87, "grad_norm": 28.402963638305664, "learning_rate": 0.0007124501722202872, "logits/chosen": -13.351658821105957, "logits/rejected": -13.519149780273438, "logps/chosen": -2460.771484375, "logps/rejected": -2222.576171875, "loss": 17.0274, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -173.9989013671875, "rewards/margins": -13.407926559448242, "rewards/rejected": -160.59097290039062, "step": 15010 }, { "epoch": 0.87, "grad_norm": 6.252697832173117e-22, "learning_rate": 0.0007122566662796548, "logits/chosen": -13.155545234680176, "logits/rejected": -13.191276550292969, "logps/chosen": -2379.80712890625, "logps/rejected": -2232.19873046875, "loss": 17.6224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -136.1026611328125, "rewards/margins": 4.746337890625, "rewards/rejected": -140.8489990234375, "step": 15020 }, { "epoch": 0.87, "grad_norm": 72.41828918457031, "learning_rate": 0.0007120631603390224, "logits/chosen": -13.35597038269043, "logits/rejected": -13.61034107208252, "logps/chosen": -2408.21142578125, "logps/rejected": -2547.344482421875, "loss": 14.1929, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.38612365722656, "rewards/margins": -5.9392900466918945, "rewards/rejected": -111.44682312011719, "step": 15030 }, { "epoch": 0.87, "grad_norm": 310.2215270996094, "learning_rate": 0.00071186965439839, "logits/chosen": -11.204126358032227, "logits/rejected": -11.330097198486328, "logps/chosen": -2714.270751953125, "logps/rejected": -2492.058837890625, "loss": 4.0176, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -98.8392333984375, "rewards/margins": -0.08519268035888672, "rewards/rejected": -98.75404357910156, "step": 15040 }, { "epoch": 0.87, "grad_norm": 0.0029895976185798645, "learning_rate": 0.0007116761484577577, "logits/chosen": -14.721258163452148, "logits/rejected": -14.838445663452148, "logps/chosen": -2398.13818359375, "logps/rejected": -2036.3736572265625, "loss": 9.3375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.9195098876953, "rewards/margins": -2.5807137489318848, "rewards/rejected": -135.33880615234375, "step": 15050 }, { "epoch": 0.87, "grad_norm": 64.81777954101562, "learning_rate": 0.0007114826425171253, "logits/chosen": -12.283388137817383, "logits/rejected": -12.300378799438477, "logps/chosen": -2616.341064453125, "logps/rejected": -2497.940673828125, "loss": 21.6669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -131.32733154296875, "rewards/margins": -10.482564926147461, "rewards/rejected": -120.84477233886719, "step": 15060 }, { "epoch": 0.87, "grad_norm": 5.068269820185378e-05, "learning_rate": 0.000711289136576493, "logits/chosen": -11.600739479064941, "logits/rejected": -11.538052558898926, "logps/chosen": -2719.26025390625, "logps/rejected": -2333.80029296875, "loss": 8.4959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -123.41812896728516, "rewards/margins": 5.989256858825684, "rewards/rejected": -129.40737915039062, "step": 15070 }, { "epoch": 0.87, "grad_norm": 3.1465048789978027, "learning_rate": 0.0007110956306358606, "logits/chosen": -13.281492233276367, "logits/rejected": -13.57616138458252, "logps/chosen": -3052.0341796875, "logps/rejected": -2774.98681640625, "loss": 2.8847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -159.76719665527344, "rewards/margins": 5.51729679107666, "rewards/rejected": -165.28448486328125, "step": 15080 }, { "epoch": 0.87, "grad_norm": 0.000725765828974545, "learning_rate": 0.0007109021246952282, "logits/chosen": -14.951345443725586, "logits/rejected": -14.878438949584961, "logps/chosen": -2644.490234375, "logps/rejected": -2731.43212890625, "loss": 10.2387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -193.75003051757812, "rewards/margins": 4.000556945800781, "rewards/rejected": -197.75057983398438, "step": 15090 }, { "epoch": 0.87, "grad_norm": 0.02667100541293621, "learning_rate": 0.0007107086187545958, "logits/chosen": -16.45688247680664, "logits/rejected": -16.584068298339844, "logps/chosen": -2798.391845703125, "logps/rejected": -2834.2890625, "loss": 3.8214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -203.6160125732422, "rewards/margins": 2.600895881652832, "rewards/rejected": -206.2169189453125, "step": 15100 }, { "epoch": 0.87, "grad_norm": 3.6100693250773475e-05, "learning_rate": 0.0007105151128139635, "logits/chosen": -16.782840728759766, "logits/rejected": -17.071208953857422, "logps/chosen": -2704.246337890625, "logps/rejected": -2291.876953125, "loss": 5.6364, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.47512817382812, "rewards/margins": -0.2434704750776291, "rewards/rejected": -166.23165893554688, "step": 15110 }, { "epoch": 0.88, "grad_norm": 103.18717193603516, "learning_rate": 0.0007103216068733311, "logits/chosen": -16.81735610961914, "logits/rejected": -16.874500274658203, "logps/chosen": -2672.1650390625, "logps/rejected": -2460.77685546875, "loss": 8.6762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.3324432373047, "rewards/margins": 2.568563938140869, "rewards/rejected": -162.9010009765625, "step": 15120 }, { "epoch": 0.88, "grad_norm": 4.87834707525725e-13, "learning_rate": 0.0007101281009326986, "logits/chosen": -19.26009750366211, "logits/rejected": -19.364261627197266, "logps/chosen": -2004.715576171875, "logps/rejected": -1887.673095703125, "loss": 12.5069, "rewards/accuracies": 0.5, "rewards/chosen": -137.64732360839844, "rewards/margins": -6.609086513519287, "rewards/rejected": -131.03823852539062, "step": 15130 }, { "epoch": 0.88, "grad_norm": 67.45552062988281, "learning_rate": 0.0007099345949920662, "logits/chosen": -14.7869291305542, "logits/rejected": -14.775976181030273, "logps/chosen": -2356.761474609375, "logps/rejected": -2392.9765625, "loss": 14.8771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -124.5988998413086, "rewards/margins": -5.269124984741211, "rewards/rejected": -119.32977294921875, "step": 15140 }, { "epoch": 0.88, "grad_norm": 7.192644261522219e-05, "learning_rate": 0.0007097410890514338, "logits/chosen": -17.673381805419922, "logits/rejected": -17.65656280517578, "logps/chosen": -2260.113525390625, "logps/rejected": -2122.3349609375, "loss": 3.9051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -160.05535888671875, "rewards/margins": 7.808991432189941, "rewards/rejected": -167.86434936523438, "step": 15150 }, { "epoch": 0.88, "grad_norm": 7.738042845373272e-12, "learning_rate": 0.0007095475831108014, "logits/chosen": -13.675407409667969, "logits/rejected": -13.584634780883789, "logps/chosen": -2735.919189453125, "logps/rejected": -2440.56787109375, "loss": 13.0318, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.9405059814453, "rewards/margins": -3.2020962238311768, "rewards/rejected": -128.7384033203125, "step": 15160 }, { "epoch": 0.88, "grad_norm": 43.34801483154297, "learning_rate": 0.0007093540771701691, "logits/chosen": -13.947319030761719, "logits/rejected": -13.697148323059082, "logps/chosen": -2464.79638671875, "logps/rejected": -1856.8515625, "loss": 12.6646, "rewards/accuracies": 0.5, "rewards/chosen": -72.31824493408203, "rewards/margins": -7.907156467437744, "rewards/rejected": -64.41108703613281, "step": 15170 }, { "epoch": 0.88, "grad_norm": 108.89617156982422, "learning_rate": 0.0007091605712295368, "logits/chosen": -17.29723358154297, "logits/rejected": -18.8783016204834, "logps/chosen": -2588.018310546875, "logps/rejected": -2141.291259765625, "loss": 19.3733, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -117.19352722167969, "rewards/margins": -9.231887817382812, "rewards/rejected": -107.9616470336914, "step": 15180 }, { "epoch": 0.88, "grad_norm": 147.38043212890625, "learning_rate": 0.0007089670652889044, "logits/chosen": -15.429956436157227, "logits/rejected": -15.566347122192383, "logps/chosen": -2610.14453125, "logps/rejected": -2481.83837890625, "loss": 3.6053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.0425796508789, "rewards/margins": 13.188905715942383, "rewards/rejected": -137.23147583007812, "step": 15190 }, { "epoch": 0.88, "grad_norm": 108.54765319824219, "learning_rate": 0.000708773559348272, "logits/chosen": -20.117605209350586, "logits/rejected": -21.919498443603516, "logps/chosen": -2272.617919921875, "logps/rejected": -2031.8355712890625, "loss": 27.7258, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -195.96006774902344, "rewards/margins": -14.724538803100586, "rewards/rejected": -181.2355194091797, "step": 15200 }, { "epoch": 0.88, "grad_norm": 1.3591885863334374e-12, "learning_rate": 0.0007085800534076396, "logits/chosen": -15.561467170715332, "logits/rejected": -15.465179443359375, "logps/chosen": -2221.63232421875, "logps/rejected": -1893.6390380859375, "loss": 7.3882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -100.94205474853516, "rewards/margins": 8.543729782104492, "rewards/rejected": -109.48579406738281, "step": 15210 }, { "epoch": 0.88, "grad_norm": 81.5925064086914, "learning_rate": 0.0007083865474670073, "logits/chosen": -14.626774787902832, "logits/rejected": -14.706814765930176, "logps/chosen": -2533.66455078125, "logps/rejected": -2137.04345703125, "loss": 26.3886, "rewards/accuracies": 0.5, "rewards/chosen": -88.13276672363281, "rewards/margins": -7.408551216125488, "rewards/rejected": -80.72420501708984, "step": 15220 }, { "epoch": 0.88, "grad_norm": 66.95452117919922, "learning_rate": 0.0007081930415263749, "logits/chosen": -16.14716339111328, "logits/rejected": -15.8618745803833, "logps/chosen": -2189.341552734375, "logps/rejected": -2055.981689453125, "loss": 8.0092, "rewards/accuracies": 0.5, "rewards/chosen": -150.80599975585938, "rewards/margins": 8.741762161254883, "rewards/rejected": -159.54776000976562, "step": 15230 }, { "epoch": 0.88, "grad_norm": 108.54920196533203, "learning_rate": 0.0007079995355857425, "logits/chosen": -17.582284927368164, "logits/rejected": -17.591604232788086, "logps/chosen": -2093.952880859375, "logps/rejected": -2018.691162109375, "loss": 12.1655, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -182.93057250976562, "rewards/margins": -5.612048149108887, "rewards/rejected": -177.31851196289062, "step": 15240 }, { "epoch": 0.88, "grad_norm": 5.631609916687012, "learning_rate": 0.0007078060296451101, "logits/chosen": -12.222677230834961, "logits/rejected": -12.115259170532227, "logps/chosen": -2916.89013671875, "logps/rejected": -2497.7412109375, "loss": 7.2768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -100.96653747558594, "rewards/margins": 8.339361190795898, "rewards/rejected": -109.305908203125, "step": 15250 }, { "epoch": 0.88, "grad_norm": 0.07437151670455933, "learning_rate": 0.0007076125237044777, "logits/chosen": -14.526827812194824, "logits/rejected": -14.471054077148438, "logps/chosen": -2522.505126953125, "logps/rejected": -2502.002685546875, "loss": 0.7847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.33206176757812, "rewards/margins": 5.6201887130737305, "rewards/rejected": -151.95225524902344, "step": 15260 }, { "epoch": 0.88, "grad_norm": 0.007260071113705635, "learning_rate": 0.0007074190177638453, "logits/chosen": -13.918785095214844, "logits/rejected": -13.849332809448242, "logps/chosen": -2394.82177734375, "logps/rejected": -2006.177001953125, "loss": 8.9573, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -179.27516174316406, "rewards/margins": -3.7999935150146484, "rewards/rejected": -175.47515869140625, "step": 15270 }, { "epoch": 0.88, "grad_norm": 19.49174690246582, "learning_rate": 0.0007072255118232131, "logits/chosen": -13.806553840637207, "logits/rejected": -13.684944152832031, "logps/chosen": -2396.88720703125, "logps/rejected": -2139.5927734375, "loss": 8.2814, "rewards/accuracies": 0.5, "rewards/chosen": -148.8805694580078, "rewards/margins": -0.2876585125923157, "rewards/rejected": -148.5928955078125, "step": 15280 }, { "epoch": 0.89, "grad_norm": 225.24227905273438, "learning_rate": 0.0007070320058825807, "logits/chosen": -10.7145357131958, "logits/rejected": -10.966145515441895, "logps/chosen": -2617.641357421875, "logps/rejected": -2433.895751953125, "loss": 8.0516, "rewards/accuracies": 0.5, "rewards/chosen": -102.716796875, "rewards/margins": 0.12555618584156036, "rewards/rejected": -102.84234619140625, "step": 15290 }, { "epoch": 0.89, "grad_norm": 59.17365264892578, "learning_rate": 0.0007068384999419483, "logits/chosen": -10.287628173828125, "logits/rejected": -10.231569290161133, "logps/chosen": -2629.146728515625, "logps/rejected": -2625.06884765625, "loss": 4.8377, "rewards/accuracies": 0.5, "rewards/chosen": -32.18838882446289, "rewards/margins": 6.393458366394043, "rewards/rejected": -38.58184814453125, "step": 15300 }, { "epoch": 0.89, "grad_norm": 1.192079782485962, "learning_rate": 0.0007066449940013159, "logits/chosen": -15.587972640991211, "logits/rejected": -15.480569839477539, "logps/chosen": -2414.661865234375, "logps/rejected": -2289.97509765625, "loss": 21.1481, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -187.1182098388672, "rewards/margins": -18.34830093383789, "rewards/rejected": -168.76992797851562, "step": 15310 }, { "epoch": 0.89, "grad_norm": 82.71418762207031, "learning_rate": 0.0007064514880606835, "logits/chosen": -17.259761810302734, "logits/rejected": -17.18808364868164, "logps/chosen": -2636.071533203125, "logps/rejected": -2477.96240234375, "loss": 12.7461, "rewards/accuracies": 0.5, "rewards/chosen": -154.3265380859375, "rewards/margins": 6.449980735778809, "rewards/rejected": -160.77651977539062, "step": 15320 }, { "epoch": 0.89, "grad_norm": 0.032537467777729034, "learning_rate": 0.0007062579821200511, "logits/chosen": -16.82027816772461, "logits/rejected": -16.55632781982422, "logps/chosen": -2389.65771484375, "logps/rejected": -2244.905517578125, "loss": 14.8415, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.2657928466797, "rewards/margins": -1.3931442499160767, "rewards/rejected": -168.87265014648438, "step": 15330 }, { "epoch": 0.89, "grad_norm": 0.04012855142354965, "learning_rate": 0.0007060644761794188, "logits/chosen": -15.924158096313477, "logits/rejected": -16.043920516967773, "logps/chosen": -2311.7939453125, "logps/rejected": -2455.280029296875, "loss": 28.5288, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -158.95863342285156, "rewards/margins": -24.531265258789062, "rewards/rejected": -134.42738342285156, "step": 15340 }, { "epoch": 0.89, "grad_norm": 5.0772261062093094e-08, "learning_rate": 0.0007058709702387863, "logits/chosen": -13.872457504272461, "logits/rejected": -13.8789701461792, "logps/chosen": -2646.48583984375, "logps/rejected": -2526.939697265625, "loss": 14.0237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -139.3995361328125, "rewards/margins": 5.935190677642822, "rewards/rejected": -145.33473205566406, "step": 15350 }, { "epoch": 0.89, "grad_norm": 0.0051218802109360695, "learning_rate": 0.0007056774642981539, "logits/chosen": -19.341331481933594, "logits/rejected": -19.355716705322266, "logps/chosen": -2364.4169921875, "logps/rejected": -2152.87255859375, "loss": 6.4326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.74221801757812, "rewards/margins": 11.0020112991333, "rewards/rejected": -148.7442169189453, "step": 15360 }, { "epoch": 0.89, "grad_norm": 82.66143035888672, "learning_rate": 0.0007054839583575215, "logits/chosen": -16.24399185180664, "logits/rejected": -16.173999786376953, "logps/chosen": -2657.88623046875, "logps/rejected": -2423.18310546875, "loss": 15.7991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.55419921875, "rewards/margins": -12.497848510742188, "rewards/rejected": -158.05636596679688, "step": 15370 }, { "epoch": 0.89, "grad_norm": 0.00010770748485811055, "learning_rate": 0.0007052904524168891, "logits/chosen": -16.115848541259766, "logits/rejected": -16.260372161865234, "logps/chosen": -2669.30078125, "logps/rejected": -2566.06494140625, "loss": 9.022, "rewards/accuracies": 0.5, "rewards/chosen": -152.9901123046875, "rewards/margins": 0.991912841796875, "rewards/rejected": -153.98202514648438, "step": 15380 }, { "epoch": 0.89, "grad_norm": 0.011314940638840199, "learning_rate": 0.0007050969464762569, "logits/chosen": -14.0493803024292, "logits/rejected": -13.875988960266113, "logps/chosen": -2553.866943359375, "logps/rejected": -2258.15869140625, "loss": 22.5699, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -123.89495849609375, "rewards/margins": -15.623161315917969, "rewards/rejected": -108.27180480957031, "step": 15390 }, { "epoch": 0.89, "grad_norm": 14.533297538757324, "learning_rate": 0.0007049034405356245, "logits/chosen": -10.559961318969727, "logits/rejected": -10.714892387390137, "logps/chosen": -2157.571533203125, "logps/rejected": -1818.9779052734375, "loss": 10.2394, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -112.14024353027344, "rewards/margins": -6.436410427093506, "rewards/rejected": -105.7038345336914, "step": 15400 }, { "epoch": 0.89, "grad_norm": 75.2231674194336, "learning_rate": 0.0007047099345949921, "logits/chosen": -11.530240058898926, "logits/rejected": -12.089165687561035, "logps/chosen": -2939.405517578125, "logps/rejected": -2439.344970703125, "loss": 17.2626, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.35305786132812, "rewards/margins": -12.389344215393066, "rewards/rejected": -116.96370697021484, "step": 15410 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 0.0007045164286543597, "logits/chosen": -13.729680061340332, "logits/rejected": -14.064181327819824, "logps/chosen": -2811.785400390625, "logps/rejected": -2636.451171875, "loss": 12.0726, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -130.9889373779297, "rewards/margins": 7.888494968414307, "rewards/rejected": -138.87744140625, "step": 15420 }, { "epoch": 0.89, "grad_norm": 7.846760272979736, "learning_rate": 0.0007043229227137273, "logits/chosen": -11.160598754882812, "logits/rejected": -11.355175971984863, "logps/chosen": -2535.005126953125, "logps/rejected": -2357.65087890625, "loss": 16.5088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -57.079498291015625, "rewards/margins": 0.5553741455078125, "rewards/rejected": -57.63486862182617, "step": 15430 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 0.0007041294167730949, "logits/chosen": -14.058459281921387, "logits/rejected": -14.06244945526123, "logps/chosen": -2460.25341796875, "logps/rejected": -2333.615966796875, "loss": 10.3642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.03468322753906, "rewards/margins": 2.3761093616485596, "rewards/rejected": -172.41079711914062, "step": 15440 }, { "epoch": 0.89, "grad_norm": 47.90544891357422, "learning_rate": 0.0007039359108324626, "logits/chosen": -16.27695655822754, "logits/rejected": -17.24211311340332, "logps/chosen": -2279.49072265625, "logps/rejected": -2106.37451171875, "loss": 13.0533, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -189.7101593017578, "rewards/margins": -10.006587028503418, "rewards/rejected": -179.7035369873047, "step": 15450 }, { "epoch": 0.89, "grad_norm": 30.835969924926758, "learning_rate": 0.0007037424048918302, "logits/chosen": -13.001788139343262, "logits/rejected": -12.830037117004395, "logps/chosen": -2656.464111328125, "logps/rejected": -2166.7177734375, "loss": 23.8747, "rewards/accuracies": 0.5, "rewards/chosen": -92.54705810546875, "rewards/margins": -16.062108993530273, "rewards/rejected": -76.48493957519531, "step": 15460 }, { "epoch": 0.9, "grad_norm": 0.0006442879093810916, "learning_rate": 0.0007035488989511978, "logits/chosen": -12.832208633422852, "logits/rejected": -12.937045097351074, "logps/chosen": -2100.302490234375, "logps/rejected": -2139.0966796875, "loss": 7.6201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.6132583618164, "rewards/margins": 10.75483226776123, "rewards/rejected": -118.36808776855469, "step": 15470 }, { "epoch": 0.9, "grad_norm": 54.75959396362305, "learning_rate": 0.0007033553930105654, "logits/chosen": -13.792529106140137, "logits/rejected": -13.511645317077637, "logps/chosen": -2255.337158203125, "logps/rejected": -1905.065185546875, "loss": 20.3943, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.15036010742188, "rewards/margins": -14.777727127075195, "rewards/rejected": -163.3726348876953, "step": 15480 }, { "epoch": 0.9, "grad_norm": 6.94768687026226e-06, "learning_rate": 0.0007031618870699331, "logits/chosen": -12.403328895568848, "logits/rejected": -12.194402694702148, "logps/chosen": -2153.27001953125, "logps/rejected": -2042.365478515625, "loss": 1.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -111.93260192871094, "rewards/margins": 14.537503242492676, "rewards/rejected": -126.4701156616211, "step": 15490 }, { "epoch": 0.9, "grad_norm": 5.922450529993739e-09, "learning_rate": 0.0007029683811293007, "logits/chosen": -10.460841178894043, "logits/rejected": -10.42749309539795, "logps/chosen": -2358.64990234375, "logps/rejected": -2237.679443359375, "loss": 3.511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -58.2451057434082, "rewards/margins": 18.708776473999023, "rewards/rejected": -76.95388793945312, "step": 15500 }, { "epoch": 0.9, "grad_norm": 0.008485040627419949, "learning_rate": 0.0007027748751886684, "logits/chosen": -12.372694969177246, "logits/rejected": -11.961111068725586, "logps/chosen": -2707.145751953125, "logps/rejected": -3626.10693359375, "loss": 2.2241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -187.7734832763672, "rewards/margins": 64.13099670410156, "rewards/rejected": -251.9044647216797, "step": 15510 }, { "epoch": 0.9, "grad_norm": 72.89688873291016, "learning_rate": 0.000702581369248036, "logits/chosen": -13.654184341430664, "logits/rejected": -13.361706733703613, "logps/chosen": -2584.48974609375, "logps/rejected": -2257.871826171875, "loss": 21.541, "rewards/accuracies": 0.5, "rewards/chosen": -180.8762664794922, "rewards/margins": -7.677515506744385, "rewards/rejected": -173.19876098632812, "step": 15520 }, { "epoch": 0.9, "grad_norm": 40.384559631347656, "learning_rate": 0.0007023878633074036, "logits/chosen": -12.929974555969238, "logits/rejected": -13.031764030456543, "logps/chosen": -2737.87255859375, "logps/rejected": -2573.12939453125, "loss": 1.3631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.72096252441406, "rewards/margins": 18.083209991455078, "rewards/rejected": -111.8041763305664, "step": 15530 }, { "epoch": 0.9, "grad_norm": 106.62391662597656, "learning_rate": 0.0007021943573667712, "logits/chosen": -13.968707084655762, "logits/rejected": -14.089094161987305, "logps/chosen": -2441.73095703125, "logps/rejected": -2288.92333984375, "loss": 13.7766, "rewards/accuracies": 0.5, "rewards/chosen": -143.24118041992188, "rewards/margins": -7.192673683166504, "rewards/rejected": -136.0485076904297, "step": 15540 }, { "epoch": 0.9, "grad_norm": 284.774169921875, "learning_rate": 0.0007020008514261388, "logits/chosen": -12.888708114624023, "logits/rejected": -12.814245223999023, "logps/chosen": -2818.519287109375, "logps/rejected": -2318.915771484375, "loss": 22.9116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.08071899414062, "rewards/margins": -6.40097713470459, "rewards/rejected": -127.67974853515625, "step": 15550 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 0.0007018073454855065, "logits/chosen": -12.508095741271973, "logits/rejected": -12.893221855163574, "logps/chosen": -2859.51220703125, "logps/rejected": -2680.438232421875, "loss": 18.2157, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -183.66978454589844, "rewards/margins": 0.6816695928573608, "rewards/rejected": -184.3514404296875, "step": 15560 }, { "epoch": 0.9, "grad_norm": 295.0335388183594, "learning_rate": 0.000701613839544874, "logits/chosen": -13.338638305664062, "logits/rejected": -13.166865348815918, "logps/chosen": -2824.79638671875, "logps/rejected": -2714.88623046875, "loss": 12.598, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.48280334472656, "rewards/margins": -9.502359390258789, "rewards/rejected": -143.98043823242188, "step": 15570 }, { "epoch": 0.9, "grad_norm": 97.18180084228516, "learning_rate": 0.0007014203336042416, "logits/chosen": -10.814725875854492, "logits/rejected": -10.808125495910645, "logps/chosen": -2700.040283203125, "logps/rejected": -2618.298583984375, "loss": 20.316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.49734497070312, "rewards/margins": -12.022967338562012, "rewards/rejected": -168.47438049316406, "step": 15580 }, { "epoch": 0.9, "grad_norm": 187.344970703125, "learning_rate": 0.0007012268276636092, "logits/chosen": -12.28868579864502, "logits/rejected": -11.846209526062012, "logps/chosen": -2660.37255859375, "logps/rejected": -2328.389404296875, "loss": 9.0163, "rewards/accuracies": 0.5, "rewards/chosen": -196.36021423339844, "rewards/margins": -2.6488893032073975, "rewards/rejected": -193.71133422851562, "step": 15590 }, { "epoch": 0.9, "grad_norm": 0.006851975340396166, "learning_rate": 0.0007010333217229769, "logits/chosen": -11.480133056640625, "logits/rejected": -11.429113388061523, "logps/chosen": -2884.827880859375, "logps/rejected": -2487.922607421875, "loss": 6.1172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.03836059570312, "rewards/margins": -1.4099725484848022, "rewards/rejected": -148.62835693359375, "step": 15600 }, { "epoch": 0.9, "grad_norm": 4.808099448097569e-13, "learning_rate": 0.0007008398157823445, "logits/chosen": -9.83594036102295, "logits/rejected": -9.796956062316895, "logps/chosen": -2090.109619140625, "logps/rejected": -1965.808837890625, "loss": 10.6374, "rewards/accuracies": 0.5, "rewards/chosen": -67.31647491455078, "rewards/margins": 0.18900910019874573, "rewards/rejected": -67.50547790527344, "step": 15610 }, { "epoch": 0.9, "grad_norm": 31.486488342285156, "learning_rate": 0.0007006463098417122, "logits/chosen": -9.24150276184082, "logits/rejected": -9.206697463989258, "logps/chosen": -2281.72119140625, "logps/rejected": -2052.51806640625, "loss": 19.4396, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -123.49836730957031, "rewards/margins": -15.505281448364258, "rewards/rejected": -107.9930648803711, "step": 15620 }, { "epoch": 0.9, "grad_norm": 79.41417694091797, "learning_rate": 0.0007004528039010798, "logits/chosen": -9.325251579284668, "logits/rejected": -9.492834091186523, "logps/chosen": -2547.52294921875, "logps/rejected": -2365.935546875, "loss": 9.2213, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -136.4344940185547, "rewards/margins": 2.4667491912841797, "rewards/rejected": -138.90126037597656, "step": 15630 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 0.0007002592979604474, "logits/chosen": -9.94618034362793, "logits/rejected": -9.813000679016113, "logps/chosen": -2719.34130859375, "logps/rejected": -2754.16796875, "loss": 10.4651, "rewards/accuracies": 0.5, "rewards/chosen": -129.27528381347656, "rewards/margins": 4.668086051940918, "rewards/rejected": -133.943359375, "step": 15640 }, { "epoch": 0.91, "grad_norm": 68.53373718261719, "learning_rate": 0.000700065792019815, "logits/chosen": -14.839590072631836, "logits/rejected": -14.704322814941406, "logps/chosen": -2456.58837890625, "logps/rejected": -2539.931640625, "loss": 6.991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -203.20489501953125, "rewards/margins": -2.178860902786255, "rewards/rejected": -201.02603149414062, "step": 15650 }, { "epoch": 0.91, "grad_norm": 1.249731183052063, "learning_rate": 0.0006998722860791826, "logits/chosen": -12.52495002746582, "logits/rejected": -12.71294116973877, "logps/chosen": -2725.50634765625, "logps/rejected": -2431.656982421875, "loss": 15.5795, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -197.52061462402344, "rewards/margins": -8.019765853881836, "rewards/rejected": -189.50086975097656, "step": 15660 }, { "epoch": 0.91, "grad_norm": 2.0697424554945963e-14, "learning_rate": 0.0006996787801385502, "logits/chosen": -9.118019104003906, "logits/rejected": -9.171823501586914, "logps/chosen": -2724.73095703125, "logps/rejected": -2273.4375, "loss": 21.8495, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -164.27264404296875, "rewards/margins": -17.31454849243164, "rewards/rejected": -146.95806884765625, "step": 15670 }, { "epoch": 0.91, "grad_norm": 14.718199729919434, "learning_rate": 0.0006994852741979179, "logits/chosen": -12.250990867614746, "logits/rejected": -12.18425464630127, "logps/chosen": -2635.47509765625, "logps/rejected": -2490.019775390625, "loss": 6.792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.13262939453125, "rewards/margins": -1.1798588037490845, "rewards/rejected": -153.9527587890625, "step": 15680 }, { "epoch": 0.91, "grad_norm": 76.04226684570312, "learning_rate": 0.0006992917682572855, "logits/chosen": -10.758683204650879, "logits/rejected": -10.689685821533203, "logps/chosen": -2796.140869140625, "logps/rejected": -2632.453369140625, "loss": 2.5224, "rewards/accuracies": 0.5, "rewards/chosen": -95.12159729003906, "rewards/margins": 5.933127403259277, "rewards/rejected": -101.05472564697266, "step": 15690 }, { "epoch": 0.91, "grad_norm": 39.81969451904297, "learning_rate": 0.0006990982623166532, "logits/chosen": -11.975311279296875, "logits/rejected": -11.882028579711914, "logps/chosen": -2268.89501953125, "logps/rejected": -2184.334716796875, "loss": 22.6644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -95.20877838134766, "rewards/margins": -4.401068687438965, "rewards/rejected": -90.8077163696289, "step": 15700 }, { "epoch": 0.91, "grad_norm": 7.872800722452666e-08, "learning_rate": 0.0006989047563760208, "logits/chosen": -10.634114265441895, "logits/rejected": -10.65028190612793, "logps/chosen": -2520.06787109375, "logps/rejected": -2192.98974609375, "loss": 3.0803, "rewards/accuracies": 0.5, "rewards/chosen": -87.64765930175781, "rewards/margins": 7.6263628005981445, "rewards/rejected": -95.27401733398438, "step": 15710 }, { "epoch": 0.91, "grad_norm": 0.000376234675059095, "learning_rate": 0.0006987112504353884, "logits/chosen": -11.479815483093262, "logits/rejected": -11.349542617797852, "logps/chosen": -2126.61376953125, "logps/rejected": -2227.0419921875, "loss": 12.2397, "rewards/accuracies": 0.5, "rewards/chosen": -83.32244110107422, "rewards/margins": 2.4574227333068848, "rewards/rejected": -85.77986145019531, "step": 15720 }, { "epoch": 0.91, "grad_norm": 85.55680847167969, "learning_rate": 0.000698517744494756, "logits/chosen": -9.804190635681152, "logits/rejected": -9.75111198425293, "logps/chosen": -2576.15771484375, "logps/rejected": -2521.112548828125, "loss": 16.3721, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -115.28257751464844, "rewards/margins": -6.259629726409912, "rewards/rejected": -109.02296447753906, "step": 15730 }, { "epoch": 0.91, "grad_norm": 140.79910278320312, "learning_rate": 0.0006983242385541237, "logits/chosen": -12.498442649841309, "logits/rejected": -12.5284423828125, "logps/chosen": -1917.370849609375, "logps/rejected": -1952.805908203125, "loss": 14.9641, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -168.73068237304688, "rewards/margins": -2.272446393966675, "rewards/rejected": -166.45822143554688, "step": 15740 }, { "epoch": 0.91, "grad_norm": 6.398091077635878e-18, "learning_rate": 0.0006981307326134913, "logits/chosen": -13.350715637207031, "logits/rejected": -13.506582260131836, "logps/chosen": -2556.00244140625, "logps/rejected": -2625.173095703125, "loss": 4.8465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.7154541015625, "rewards/margins": 12.113019943237305, "rewards/rejected": -135.82847595214844, "step": 15750 }, { "epoch": 0.91, "grad_norm": 42.94975662231445, "learning_rate": 0.0006979372266728589, "logits/chosen": -12.587156295776367, "logits/rejected": -12.628677368164062, "logps/chosen": -2296.240234375, "logps/rejected": -2439.806884765625, "loss": 4.8576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.5269012451172, "rewards/margins": 5.9972734451293945, "rewards/rejected": -178.524169921875, "step": 15760 }, { "epoch": 0.91, "grad_norm": 0.005339197814464569, "learning_rate": 0.0006977437207322265, "logits/chosen": -11.397557258605957, "logits/rejected": -11.660171508789062, "logps/chosen": -2862.583251953125, "logps/rejected": -2424.114990234375, "loss": 8.974, "rewards/accuracies": 0.5, "rewards/chosen": -105.76847839355469, "rewards/margins": 0.8451774716377258, "rewards/rejected": -106.6136474609375, "step": 15770 }, { "epoch": 0.91, "grad_norm": 12.465283393859863, "learning_rate": 0.000697550214791594, "logits/chosen": -10.967860221862793, "logits/rejected": -10.89097785949707, "logps/chosen": -2505.21630859375, "logps/rejected": -2572.77880859375, "loss": 1.9684, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.3771209716797, "rewards/margins": 3.6143832206726074, "rewards/rejected": -132.99148559570312, "step": 15780 }, { "epoch": 0.91, "grad_norm": 1.9117523431777954, "learning_rate": 0.0006973567088509616, "logits/chosen": -11.933679580688477, "logits/rejected": -11.70939826965332, "logps/chosen": -2662.71630859375, "logps/rejected": -2565.232666015625, "loss": 17.417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.79263305664062, "rewards/margins": -5.56313943862915, "rewards/rejected": -192.22947692871094, "step": 15790 }, { "epoch": 0.91, "grad_norm": 3.6486073995210356e-22, "learning_rate": 0.0006971632029103293, "logits/chosen": -12.040834426879883, "logits/rejected": -12.037206649780273, "logps/chosen": -2543.37548828125, "logps/rejected": -2522.962890625, "loss": 7.6581, "rewards/accuracies": 0.5, "rewards/chosen": -120.3264389038086, "rewards/margins": 14.067941665649414, "rewards/rejected": -134.39437866210938, "step": 15800 }, { "epoch": 0.92, "grad_norm": 2.001811175454454e-15, "learning_rate": 0.000696969696969697, "logits/chosen": -14.769701957702637, "logits/rejected": -14.747714042663574, "logps/chosen": -2545.578857421875, "logps/rejected": -2628.81005859375, "loss": 3.2939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.91172790527344, "rewards/margins": 13.570596694946289, "rewards/rejected": -198.48233032226562, "step": 15810 }, { "epoch": 0.92, "grad_norm": 67.50548553466797, "learning_rate": 0.0006967761910290646, "logits/chosen": -12.185748100280762, "logits/rejected": -12.23381233215332, "logps/chosen": -2900.10302734375, "logps/rejected": -2593.111328125, "loss": 6.6464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.26246643066406, "rewards/margins": 10.694236755371094, "rewards/rejected": -187.95669555664062, "step": 15820 }, { "epoch": 0.92, "grad_norm": 1.3187289090311593e-14, "learning_rate": 0.0006965826850884322, "logits/chosen": -12.993896484375, "logits/rejected": -12.577794075012207, "logps/chosen": -2460.08984375, "logps/rejected": -2057.616455078125, "loss": 23.852, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -184.23574829101562, "rewards/margins": -19.290111541748047, "rewards/rejected": -164.94564819335938, "step": 15830 }, { "epoch": 0.92, "grad_norm": 105.56854248046875, "learning_rate": 0.0006963891791477998, "logits/chosen": -14.04200267791748, "logits/rejected": -13.755892753601074, "logps/chosen": -2527.74951171875, "logps/rejected": -2087.52294921875, "loss": 42.3336, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -193.05996704101562, "rewards/margins": -39.871559143066406, "rewards/rejected": -153.1884002685547, "step": 15840 }, { "epoch": 0.92, "grad_norm": 4.468808128876844e-06, "learning_rate": 0.0006961956732071675, "logits/chosen": -14.314852714538574, "logits/rejected": -14.347808837890625, "logps/chosen": -2461.6748046875, "logps/rejected": -2154.862548828125, "loss": 18.6194, "rewards/accuracies": 0.5, "rewards/chosen": -179.6395721435547, "rewards/margins": -8.875092506408691, "rewards/rejected": -170.76449584960938, "step": 15850 }, { "epoch": 0.92, "grad_norm": 8.452696320659925e-09, "learning_rate": 0.0006960021672665351, "logits/chosen": -13.12458610534668, "logits/rejected": -13.188840866088867, "logps/chosen": -2350.8447265625, "logps/rejected": -1953.7779541015625, "loss": 2.9458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.8514404296875, "rewards/margins": 6.4627790451049805, "rewards/rejected": -152.314208984375, "step": 15860 }, { "epoch": 0.92, "grad_norm": 74.85616302490234, "learning_rate": 0.0006958086613259027, "logits/chosen": -15.67322826385498, "logits/rejected": -15.600253105163574, "logps/chosen": -2106.141845703125, "logps/rejected": -1843.3402099609375, "loss": 24.4803, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -176.4815673828125, "rewards/margins": -19.312881469726562, "rewards/rejected": -157.16868591308594, "step": 15870 }, { "epoch": 0.92, "grad_norm": 86.69760131835938, "learning_rate": 0.0006956151553852703, "logits/chosen": -14.461990356445312, "logits/rejected": -14.497090339660645, "logps/chosen": -2505.335693359375, "logps/rejected": -2419.22705078125, "loss": 21.7162, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -110.21763610839844, "rewards/margins": -14.564501762390137, "rewards/rejected": -95.65312957763672, "step": 15880 }, { "epoch": 0.92, "grad_norm": 0.35458657145500183, "learning_rate": 0.0006954216494446379, "logits/chosen": -15.580146789550781, "logits/rejected": -15.303624153137207, "logps/chosen": -2357.802001953125, "logps/rejected": -2162.0693359375, "loss": 12.3803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -89.43057250976562, "rewards/margins": 13.127084732055664, "rewards/rejected": -102.5576400756836, "step": 15890 }, { "epoch": 0.92, "grad_norm": 1.7750665866405756e-20, "learning_rate": 0.0006952281435040055, "logits/chosen": -13.14819049835205, "logits/rejected": -13.461359977722168, "logps/chosen": -2678.734375, "logps/rejected": -2302.810302734375, "loss": 6.5812, "rewards/accuracies": 0.5, "rewards/chosen": -34.986549377441406, "rewards/margins": 6.8240065574646, "rewards/rejected": -41.81055450439453, "step": 15900 }, { "epoch": 0.92, "grad_norm": 1.0104105679448411e-15, "learning_rate": 0.0006950346375633733, "logits/chosen": -12.14819622039795, "logits/rejected": -12.264410018920898, "logps/chosen": -2010.1331787109375, "logps/rejected": -2015.5726318359375, "loss": 16.4362, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -111.16853332519531, "rewards/margins": -8.61109733581543, "rewards/rejected": -102.55744934082031, "step": 15910 }, { "epoch": 0.92, "grad_norm": 3.9106693293433636e-05, "learning_rate": 0.0006948411316227409, "logits/chosen": -14.894757270812988, "logits/rejected": -14.697454452514648, "logps/chosen": -2015.146728515625, "logps/rejected": -1962.466796875, "loss": 5.7645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -92.67131805419922, "rewards/margins": 18.89461898803711, "rewards/rejected": -111.5659408569336, "step": 15920 }, { "epoch": 0.92, "grad_norm": 57.422760009765625, "learning_rate": 0.0006946476256821085, "logits/chosen": -14.208450317382812, "logits/rejected": -14.459185600280762, "logps/chosen": -2300.92138671875, "logps/rejected": -2090.882080078125, "loss": 1.0419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.48780822753906, "rewards/margins": 13.55572509765625, "rewards/rejected": -138.04354858398438, "step": 15930 }, { "epoch": 0.92, "grad_norm": 4.774351119995117, "learning_rate": 0.0006944541197414761, "logits/chosen": -10.315813064575195, "logits/rejected": -10.234663009643555, "logps/chosen": -2367.61376953125, "logps/rejected": -2655.71875, "loss": 5.0323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -59.495399475097656, "rewards/margins": 7.793072700500488, "rewards/rejected": -67.28845977783203, "step": 15940 }, { "epoch": 0.92, "grad_norm": 2.152172601199709e-05, "learning_rate": 0.0006942606138008437, "logits/chosen": -13.190234184265137, "logits/rejected": -13.006991386413574, "logps/chosen": -2367.759765625, "logps/rejected": -2485.6298828125, "loss": 12.1673, "rewards/accuracies": 0.5, "rewards/chosen": -163.2901153564453, "rewards/margins": -2.6140143871307373, "rewards/rejected": -160.6761016845703, "step": 15950 }, { "epoch": 0.92, "grad_norm": 0.0034946894738823175, "learning_rate": 0.0006940671078602114, "logits/chosen": -15.4741849899292, "logits/rejected": -16.667797088623047, "logps/chosen": -2551.00830078125, "logps/rejected": -2686.17919921875, "loss": 3.3363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -149.19451904296875, "rewards/margins": 13.438382148742676, "rewards/rejected": -162.63290405273438, "step": 15960 }, { "epoch": 0.92, "grad_norm": 61.66499710083008, "learning_rate": 0.000693873601919579, "logits/chosen": -17.18808364868164, "logits/rejected": -17.683570861816406, "logps/chosen": -2501.7939453125, "logps/rejected": -2367.97509765625, "loss": 15.5579, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -215.60009765625, "rewards/margins": -9.0703125, "rewards/rejected": -206.52978515625, "step": 15970 }, { "epoch": 0.92, "grad_norm": 0.0010035262675955892, "learning_rate": 0.0006936800959789466, "logits/chosen": -13.7355375289917, "logits/rejected": -13.634137153625488, "logps/chosen": -2615.24072265625, "logps/rejected": -2481.938720703125, "loss": 6.7575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.34146881103516, "rewards/margins": 8.79869556427002, "rewards/rejected": -126.1401596069336, "step": 15980 }, { "epoch": 0.93, "grad_norm": 1.9488481283187866, "learning_rate": 0.0006934865900383142, "logits/chosen": -16.034446716308594, "logits/rejected": -15.864458084106445, "logps/chosen": -2335.596435546875, "logps/rejected": -1913.9261474609375, "loss": 6.2223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.10958862304688, "rewards/margins": 1.8578726053237915, "rewards/rejected": -148.96746826171875, "step": 15990 }, { "epoch": 0.93, "grad_norm": 31.288347244262695, "learning_rate": 0.0006932930840976817, "logits/chosen": -15.849241256713867, "logits/rejected": -15.90356159210205, "logps/chosen": -2453.721923828125, "logps/rejected": -2253.921875, "loss": 13.7242, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -135.74850463867188, "rewards/margins": -10.293877601623535, "rewards/rejected": -125.45462799072266, "step": 16000 }, { "epoch": 0.93, "grad_norm": 1.8671042489870615e-09, "learning_rate": 0.0006930995781570493, "logits/chosen": -9.973295211791992, "logits/rejected": -9.985569953918457, "logps/chosen": -2513.077392578125, "logps/rejected": -2255.88671875, "loss": 5.2491, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.0887451171875, "rewards/margins": 7.4336090087890625, "rewards/rejected": -61.5223503112793, "step": 16010 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 0.0006929060722164171, "logits/chosen": -13.579599380493164, "logits/rejected": -13.399354934692383, "logps/chosen": -2004.0072021484375, "logps/rejected": -2181.5673828125, "loss": 1.5788, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -133.0792694091797, "rewards/margins": 19.41120719909668, "rewards/rejected": -152.490478515625, "step": 16020 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 0.0006927125662757847, "logits/chosen": -13.322959899902344, "logits/rejected": -13.526535034179688, "logps/chosen": -2444.426513671875, "logps/rejected": -2345.091796875, "loss": 7.5617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.4039306640625, "rewards/margins": 1.9268958568572998, "rewards/rejected": -112.33082580566406, "step": 16030 }, { "epoch": 0.93, "grad_norm": 111.98966217041016, "learning_rate": 0.0006925190603351523, "logits/chosen": -12.981633186340332, "logits/rejected": -13.044651985168457, "logps/chosen": -2459.067138671875, "logps/rejected": -2174.95263671875, "loss": 26.1276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -129.55325317382812, "rewards/margins": -10.431845664978027, "rewards/rejected": -119.12139892578125, "step": 16040 }, { "epoch": 0.93, "grad_norm": 539.2302856445312, "learning_rate": 0.0006923255543945199, "logits/chosen": -16.21759605407715, "logits/rejected": -16.501832962036133, "logps/chosen": -2478.97119140625, "logps/rejected": -1930.614013671875, "loss": 21.3124, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.2049102783203, "rewards/margins": -16.377685546875, "rewards/rejected": -156.82720947265625, "step": 16050 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 0.0006921320484538875, "logits/chosen": -16.39192008972168, "logits/rejected": -16.199289321899414, "logps/chosen": -2392.5546875, "logps/rejected": -1970.4625244140625, "loss": 16.6897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -95.71439361572266, "rewards/margins": 7.6671929359436035, "rewards/rejected": -103.381591796875, "step": 16060 }, { "epoch": 0.93, "grad_norm": 26.732290267944336, "learning_rate": 0.0006919385425132551, "logits/chosen": -17.37397003173828, "logits/rejected": -17.340383529663086, "logps/chosen": -2583.4443359375, "logps/rejected": -2342.39208984375, "loss": 1.3592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -79.60526275634766, "rewards/margins": 27.277917861938477, "rewards/rejected": -106.8831787109375, "step": 16070 }, { "epoch": 0.93, "grad_norm": 120.77631378173828, "learning_rate": 0.0006917450365726228, "logits/chosen": -22.168237686157227, "logits/rejected": -22.028728485107422, "logps/chosen": -2421.40625, "logps/rejected": -2631.841552734375, "loss": 0.9789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -207.1208038330078, "rewards/margins": 27.242008209228516, "rewards/rejected": -234.36276245117188, "step": 16080 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 0.0006915515306319904, "logits/chosen": -19.055824279785156, "logits/rejected": -18.948902130126953, "logps/chosen": -2878.178466796875, "logps/rejected": -2798.998291015625, "loss": 4.2581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.26229858398438, "rewards/margins": 11.03773021697998, "rewards/rejected": -165.30003356933594, "step": 16090 }, { "epoch": 0.93, "grad_norm": 2.4991557598114014, "learning_rate": 0.000691358024691358, "logits/chosen": -17.55953598022461, "logits/rejected": -17.2977237701416, "logps/chosen": -2876.08935546875, "logps/rejected": -2911.27197265625, "loss": 4.0235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.82034301757812, "rewards/margins": 3.0586516857147217, "rewards/rejected": -173.87899780273438, "step": 16100 }, { "epoch": 0.93, "grad_norm": 46.00844192504883, "learning_rate": 0.0006911645187507256, "logits/chosen": -19.280763626098633, "logits/rejected": -21.064102172851562, "logps/chosen": -2567.248779296875, "logps/rejected": -2454.852294921875, "loss": 12.9364, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.05892944335938, "rewards/margins": 29.397747039794922, "rewards/rejected": -185.45669555664062, "step": 16110 }, { "epoch": 0.93, "grad_norm": 1.9027167397211997e-08, "learning_rate": 0.0006909710128100933, "logits/chosen": -14.711578369140625, "logits/rejected": -15.743998527526855, "logps/chosen": -2868.58544921875, "logps/rejected": -2742.20458984375, "loss": 4.9626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -104.2027359008789, "rewards/margins": 14.905012130737305, "rewards/rejected": -119.10774993896484, "step": 16120 }, { "epoch": 0.93, "grad_norm": 76.25520324707031, "learning_rate": 0.000690777506869461, "logits/chosen": -15.541296005249023, "logits/rejected": -15.662347793579102, "logps/chosen": -2731.106689453125, "logps/rejected": -2349.08251953125, "loss": 24.3843, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -129.4941864013672, "rewards/margins": -9.647567749023438, "rewards/rejected": -119.84663391113281, "step": 16130 }, { "epoch": 0.93, "grad_norm": 52.41022491455078, "learning_rate": 0.0006905840009288286, "logits/chosen": -20.613418579101562, "logits/rejected": -21.73526954650879, "logps/chosen": -2419.84814453125, "logps/rejected": -1946.1243896484375, "loss": 47.2623, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -180.40695190429688, "rewards/margins": -43.32058334350586, "rewards/rejected": -137.0863494873047, "step": 16140 }, { "epoch": 0.93, "grad_norm": 9.26716765548008e-08, "learning_rate": 0.0006903904949881962, "logits/chosen": -16.763866424560547, "logits/rejected": -17.10068702697754, "logps/chosen": -2589.661376953125, "logps/rejected": -2378.59228515625, "loss": 6.0636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -91.68556213378906, "rewards/margins": 4.905953407287598, "rewards/rejected": -96.59150695800781, "step": 16150 }, { "epoch": 0.94, "grad_norm": 53.35482406616211, "learning_rate": 0.0006901969890475638, "logits/chosen": -17.929031372070312, "logits/rejected": -17.987625122070312, "logps/chosen": -2146.85009765625, "logps/rejected": -2221.836181640625, "loss": 5.5701, "rewards/accuracies": 0.5, "rewards/chosen": -155.05447387695312, "rewards/margins": 0.3843750059604645, "rewards/rejected": -155.43887329101562, "step": 16160 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 0.0006900034831069314, "logits/chosen": -15.707018852233887, "logits/rejected": -15.604934692382812, "logps/chosen": -2473.732666015625, "logps/rejected": -2400.385498046875, "loss": 4.6543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -96.57539367675781, "rewards/margins": 11.164369583129883, "rewards/rejected": -107.7397689819336, "step": 16170 }, { "epoch": 0.94, "grad_norm": 2.8941927666892298e-05, "learning_rate": 0.000689809977166299, "logits/chosen": -14.503705024719238, "logits/rejected": -14.382095336914062, "logps/chosen": -2675.09521484375, "logps/rejected": -2574.5732421875, "loss": 8.4424, "rewards/accuracies": 0.5, "rewards/chosen": -148.76828002929688, "rewards/margins": 5.9771881103515625, "rewards/rejected": -154.74545288085938, "step": 16180 }, { "epoch": 0.94, "grad_norm": 0.10238289833068848, "learning_rate": 0.0006896164712256667, "logits/chosen": -13.505887031555176, "logits/rejected": -13.463696479797363, "logps/chosen": -2235.239013671875, "logps/rejected": -1992.1490478515625, "loss": 1.4648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -116.37025451660156, "rewards/margins": 13.092554092407227, "rewards/rejected": -129.4628143310547, "step": 16190 }, { "epoch": 0.94, "grad_norm": 136.03033447265625, "learning_rate": 0.0006894229652850343, "logits/chosen": -12.820295333862305, "logits/rejected": -12.75008773803711, "logps/chosen": -2472.878662109375, "logps/rejected": -2188.329833984375, "loss": 23.8953, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.53326416015625, "rewards/margins": -21.829282760620117, "rewards/rejected": -150.7039794921875, "step": 16200 }, { "epoch": 0.94, "grad_norm": 3.417839593566896e-07, "learning_rate": 0.0006892294593444019, "logits/chosen": -11.197677612304688, "logits/rejected": -11.13133716583252, "logps/chosen": -2757.210205078125, "logps/rejected": -2532.663818359375, "loss": 4.8015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -47.54017639160156, "rewards/margins": 16.533710479736328, "rewards/rejected": -64.07388305664062, "step": 16210 }, { "epoch": 0.94, "grad_norm": 63.05555725097656, "learning_rate": 0.0006890359534037694, "logits/chosen": -14.041528701782227, "logits/rejected": -14.209554672241211, "logps/chosen": -2490.257568359375, "logps/rejected": -2313.6201171875, "loss": 24.6213, "rewards/accuracies": 0.5, "rewards/chosen": -139.0771026611328, "rewards/margins": -14.239130973815918, "rewards/rejected": -124.83797454833984, "step": 16220 }, { "epoch": 0.94, "grad_norm": 80.42525482177734, "learning_rate": 0.0006888424474631371, "logits/chosen": -15.321734428405762, "logits/rejected": -15.220125198364258, "logps/chosen": -2584.15087890625, "logps/rejected": -2344.74462890625, "loss": 6.9325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -78.74778747558594, "rewards/margins": 16.95011329650879, "rewards/rejected": -95.69789123535156, "step": 16230 }, { "epoch": 0.94, "grad_norm": 57.37306213378906, "learning_rate": 0.0006886489415225047, "logits/chosen": -18.0440673828125, "logits/rejected": -17.93210220336914, "logps/chosen": -2679.47900390625, "logps/rejected": -2428.6748046875, "loss": 15.5337, "rewards/accuracies": 0.5, "rewards/chosen": -151.47769165039062, "rewards/margins": -7.377209663391113, "rewards/rejected": -144.10047912597656, "step": 16240 }, { "epoch": 0.94, "grad_norm": 1.56043518018123e-06, "learning_rate": 0.0006884554355818724, "logits/chosen": -15.836895942687988, "logits/rejected": -15.872520446777344, "logps/chosen": -2492.10205078125, "logps/rejected": -2310.70068359375, "loss": 14.8983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.5496826171875, "rewards/margins": -9.140350341796875, "rewards/rejected": -126.4093246459961, "step": 16250 }, { "epoch": 0.94, "grad_norm": 134.286865234375, "learning_rate": 0.00068826192964124, "logits/chosen": -14.277926445007324, "logits/rejected": -14.244590759277344, "logps/chosen": -2571.725830078125, "logps/rejected": -2536.59130859375, "loss": 11.385, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -154.06832885742188, "rewards/margins": -11.104551315307617, "rewards/rejected": -142.96377563476562, "step": 16260 }, { "epoch": 0.94, "grad_norm": 79.71509552001953, "learning_rate": 0.0006880684237006076, "logits/chosen": -14.50806999206543, "logits/rejected": -14.881999015808105, "logps/chosen": -2384.92041015625, "logps/rejected": -2413.140625, "loss": 7.8, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.96929168701172, "rewards/margins": 0.7366352081298828, "rewards/rejected": -34.70592498779297, "step": 16270 }, { "epoch": 0.94, "grad_norm": 2.512878967408927e-11, "learning_rate": 0.0006878749177599752, "logits/chosen": -12.717470169067383, "logits/rejected": -12.612020492553711, "logps/chosen": -2585.07568359375, "logps/rejected": -2483.74853515625, "loss": 16.5243, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.97281646728516, "rewards/margins": -9.172859191894531, "rewards/rejected": -78.79995727539062, "step": 16280 }, { "epoch": 0.94, "grad_norm": 28.263105392456055, "learning_rate": 0.0006876814118193428, "logits/chosen": -15.207353591918945, "logits/rejected": -15.197135925292969, "logps/chosen": -2294.95751953125, "logps/rejected": -2234.029052734375, "loss": 5.2337, "rewards/accuracies": 0.5, "rewards/chosen": -127.1368179321289, "rewards/margins": 2.9065558910369873, "rewards/rejected": -130.0433807373047, "step": 16290 }, { "epoch": 0.94, "grad_norm": 0.45224714279174805, "learning_rate": 0.0006874879058787104, "logits/chosen": -14.454971313476562, "logits/rejected": -14.436979293823242, "logps/chosen": -2196.66015625, "logps/rejected": -2071.12646484375, "loss": 7.0791, "rewards/accuracies": 0.5, "rewards/chosen": -139.71714782714844, "rewards/margins": -3.0829365253448486, "rewards/rejected": -136.63421630859375, "step": 16300 }, { "epoch": 0.94, "grad_norm": 1.01947021484375, "learning_rate": 0.0006872943999380781, "logits/chosen": -15.422613143920898, "logits/rejected": -15.069976806640625, "logps/chosen": -2003.699951171875, "logps/rejected": -2128.281005859375, "loss": 5.6886, "rewards/accuracies": 0.5, "rewards/chosen": -154.02816772460938, "rewards/margins": -0.6952045559883118, "rewards/rejected": -153.33297729492188, "step": 16310 }, { "epoch": 0.94, "grad_norm": 0.5915248394012451, "learning_rate": 0.0006871008939974457, "logits/chosen": -15.310678482055664, "logits/rejected": -15.117727279663086, "logps/chosen": -2650.72998046875, "logps/rejected": -2578.0927734375, "loss": 5.3682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.54031372070312, "rewards/margins": 0.6174983978271484, "rewards/rejected": -161.15780639648438, "step": 16320 }, { "epoch": 0.95, "grad_norm": 34.63254165649414, "learning_rate": 0.0006869073880568134, "logits/chosen": -13.075586318969727, "logits/rejected": -12.908299446105957, "logps/chosen": -2537.86474609375, "logps/rejected": -2536.3173828125, "loss": 0.8581, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -102.58915710449219, "rewards/margins": 29.38435935974121, "rewards/rejected": -131.9735107421875, "step": 16330 }, { "epoch": 0.95, "grad_norm": 52.30113220214844, "learning_rate": 0.000686713882116181, "logits/chosen": -11.02645492553711, "logits/rejected": -11.089422225952148, "logps/chosen": -2931.51220703125, "logps/rejected": -2818.41943359375, "loss": 4.578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -115.361572265625, "rewards/margins": 16.555818557739258, "rewards/rejected": -131.91738891601562, "step": 16340 }, { "epoch": 0.95, "grad_norm": 140.3708038330078, "learning_rate": 0.0006865203761755486, "logits/chosen": -11.726676940917969, "logits/rejected": -11.564881324768066, "logps/chosen": -2565.76025390625, "logps/rejected": -2509.49755859375, "loss": 21.9973, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -148.70968627929688, "rewards/margins": -15.1087646484375, "rewards/rejected": -133.60089111328125, "step": 16350 }, { "epoch": 0.95, "grad_norm": 8.282339224174873e-12, "learning_rate": 0.0006863268702349163, "logits/chosen": -11.977556228637695, "logits/rejected": -12.009442329406738, "logps/chosen": -2199.135009765625, "logps/rejected": -2502.22705078125, "loss": 3.9679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.06282043457031, "rewards/margins": 4.868741035461426, "rewards/rejected": -121.93156433105469, "step": 16360 }, { "epoch": 0.95, "grad_norm": 60.58433151245117, "learning_rate": 0.0006861333642942839, "logits/chosen": -11.869241714477539, "logits/rejected": -11.686417579650879, "logps/chosen": -2619.725830078125, "logps/rejected": -2187.60693359375, "loss": 13.7783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.27244567871094, "rewards/margins": -9.709207534790039, "rewards/rejected": -143.563232421875, "step": 16370 }, { "epoch": 0.95, "grad_norm": 12.537569046020508, "learning_rate": 0.0006859398583536515, "logits/chosen": -8.724302291870117, "logits/rejected": -8.724096298217773, "logps/chosen": -2716.80419921875, "logps/rejected": -2396.91064453125, "loss": 5.5454, "rewards/accuracies": 0.5, "rewards/chosen": -53.823028564453125, "rewards/margins": 2.784641742706299, "rewards/rejected": -56.60767364501953, "step": 16380 }, { "epoch": 0.95, "grad_norm": 1.84491765499115, "learning_rate": 0.0006857463524130191, "logits/chosen": -10.486846923828125, "logits/rejected": -10.414406776428223, "logps/chosen": -2426.55224609375, "logps/rejected": -2515.654052734375, "loss": 15.8675, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -104.66102600097656, "rewards/margins": -14.480913162231445, "rewards/rejected": -90.18009948730469, "step": 16390 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 0.0006855528464723867, "logits/chosen": -10.576448440551758, "logits/rejected": -10.520051002502441, "logps/chosen": -2529.466552734375, "logps/rejected": -2149.03271484375, "loss": 15.7995, "rewards/accuracies": 0.5, "rewards/chosen": -94.94654846191406, "rewards/margins": 0.2153697907924652, "rewards/rejected": -95.16191101074219, "step": 16400 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 0.0006853593405317543, "logits/chosen": -13.575571060180664, "logits/rejected": -13.578481674194336, "logps/chosen": -2285.93212890625, "logps/rejected": -1967.069580078125, "loss": 1.9259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.55000305175781, "rewards/margins": 17.174983978271484, "rewards/rejected": -108.7249984741211, "step": 16410 }, { "epoch": 0.95, "grad_norm": 2.23698059560186e-17, "learning_rate": 0.000685165834591122, "logits/chosen": -11.355958938598633, "logits/rejected": -11.262747764587402, "logps/chosen": -2830.00048828125, "logps/rejected": -2771.09130859375, "loss": 10.2081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -43.67205047607422, "rewards/margins": 5.663512706756592, "rewards/rejected": -49.33556365966797, "step": 16420 }, { "epoch": 0.95, "grad_norm": 0.0003661078226286918, "learning_rate": 0.0006849723286504896, "logits/chosen": -15.584248542785645, "logits/rejected": -15.606870651245117, "logps/chosen": -2459.807861328125, "logps/rejected": -2401.49072265625, "loss": 4.7693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.92568969726562, "rewards/margins": 0.04810905456542969, "rewards/rejected": -141.97381591796875, "step": 16430 }, { "epoch": 0.95, "grad_norm": 70.4504623413086, "learning_rate": 0.0006847788227098572, "logits/chosen": -14.306844711303711, "logits/rejected": -14.238121032714844, "logps/chosen": -2622.21875, "logps/rejected": -2245.26318359375, "loss": 6.1617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.54054260253906, "rewards/margins": 10.521772384643555, "rewards/rejected": -158.06231689453125, "step": 16440 }, { "epoch": 0.95, "grad_norm": 1.5699709310368348e-09, "learning_rate": 0.0006845853167692248, "logits/chosen": -12.972986221313477, "logits/rejected": -13.052034378051758, "logps/chosen": -2626.091064453125, "logps/rejected": -2176.869873046875, "loss": 1.9951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -87.01393127441406, "rewards/margins": 6.317506790161133, "rewards/rejected": -93.3314437866211, "step": 16450 }, { "epoch": 0.95, "grad_norm": 4.623716354370117, "learning_rate": 0.0006843918108285924, "logits/chosen": -17.432247161865234, "logits/rejected": -17.448312759399414, "logps/chosen": -2465.51416015625, "logps/rejected": -2458.879150390625, "loss": 8.9182, "rewards/accuracies": 0.5, "rewards/chosen": -211.07766723632812, "rewards/margins": 6.442213535308838, "rewards/rejected": -217.51986694335938, "step": 16460 }, { "epoch": 0.95, "grad_norm": 0.12312636524438858, "learning_rate": 0.00068419830488796, "logits/chosen": -11.669448852539062, "logits/rejected": -11.994614601135254, "logps/chosen": -2840.67822265625, "logps/rejected": -2299.05908203125, "loss": 19.3181, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -138.54234313964844, "rewards/margins": -6.843790531158447, "rewards/rejected": -131.69854736328125, "step": 16470 }, { "epoch": 0.95, "grad_norm": 17.35113525390625, "learning_rate": 0.0006840047989473277, "logits/chosen": -12.9441499710083, "logits/rejected": -13.050836563110352, "logps/chosen": -2521.49462890625, "logps/rejected": -2564.322265625, "loss": 11.5883, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -136.78221130371094, "rewards/margins": -9.492292404174805, "rewards/rejected": -127.28990173339844, "step": 16480 }, { "epoch": 0.95, "grad_norm": 142.209228515625, "learning_rate": 0.0006838112930066953, "logits/chosen": -11.408563613891602, "logits/rejected": -11.431796073913574, "logps/chosen": -2428.37451171875, "logps/rejected": -2319.28173828125, "loss": 26.9457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -96.25950622558594, "rewards/margins": -2.965850830078125, "rewards/rejected": -93.29366302490234, "step": 16490 }, { "epoch": 0.96, "grad_norm": 80.69332885742188, "learning_rate": 0.0006836177870660629, "logits/chosen": -11.286466598510742, "logits/rejected": -11.361579895019531, "logps/chosen": -2529.20166015625, "logps/rejected": -2457.228759765625, "loss": 11.6467, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -145.09727478027344, "rewards/margins": -9.921041488647461, "rewards/rejected": -135.17623901367188, "step": 16500 }, { "epoch": 0.96, "grad_norm": 113.0801773071289, "learning_rate": 0.0006834242811254305, "logits/chosen": -11.405423164367676, "logits/rejected": -11.556130409240723, "logps/chosen": -2377.390625, "logps/rejected": -2228.591552734375, "loss": 15.1032, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -139.01104736328125, "rewards/margins": -4.2359819412231445, "rewards/rejected": -134.77505493164062, "step": 16510 }, { "epoch": 0.96, "grad_norm": 0.023385141044855118, "learning_rate": 0.0006832307751847981, "logits/chosen": -14.745298385620117, "logits/rejected": -14.579795837402344, "logps/chosen": -2471.184326171875, "logps/rejected": -2293.354248046875, "loss": 16.3223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -130.14292907714844, "rewards/margins": -13.757472038269043, "rewards/rejected": -116.38545227050781, "step": 16520 }, { "epoch": 0.96, "grad_norm": 0.13850626349449158, "learning_rate": 0.0006830372692441658, "logits/chosen": -14.417160034179688, "logits/rejected": -14.423263549804688, "logps/chosen": -2620.248046875, "logps/rejected": -2430.268310546875, "loss": 6.3824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.97361755371094, "rewards/margins": 11.851371765136719, "rewards/rejected": -165.82498168945312, "step": 16530 }, { "epoch": 0.96, "grad_norm": 3.158846139907837, "learning_rate": 0.0006828437633035335, "logits/chosen": -13.846763610839844, "logits/rejected": -14.018533706665039, "logps/chosen": -2584.99951171875, "logps/rejected": -2492.73486328125, "loss": 31.4723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.20175170898438, "rewards/margins": -9.577320098876953, "rewards/rejected": -122.62443542480469, "step": 16540 }, { "epoch": 0.96, "grad_norm": 56.5400390625, "learning_rate": 0.0006826502573629011, "logits/chosen": -14.576715469360352, "logits/rejected": -14.195379257202148, "logps/chosen": -3072.936767578125, "logps/rejected": -2947.947998046875, "loss": 12.1976, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -164.50228881835938, "rewards/margins": -10.542935371398926, "rewards/rejected": -153.9593505859375, "step": 16550 }, { "epoch": 0.96, "grad_norm": 435.5718994140625, "learning_rate": 0.0006824567514222687, "logits/chosen": -11.187326431274414, "logits/rejected": -10.79414176940918, "logps/chosen": -3008.1923828125, "logps/rejected": -2736.513671875, "loss": 10.2641, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -119.21934509277344, "rewards/margins": 11.199974060058594, "rewards/rejected": -130.4193115234375, "step": 16560 }, { "epoch": 0.96, "grad_norm": 18.55550193786621, "learning_rate": 0.0006822632454816363, "logits/chosen": -12.452592849731445, "logits/rejected": -12.558815002441406, "logps/chosen": -2898.524169921875, "logps/rejected": -2643.315673828125, "loss": 8.6862, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -155.88104248046875, "rewards/margins": 1.1565544605255127, "rewards/rejected": -157.03758239746094, "step": 16570 }, { "epoch": 0.96, "grad_norm": 11.259390830993652, "learning_rate": 0.000682069739541004, "logits/chosen": -11.193643569946289, "logits/rejected": -11.26000690460205, "logps/chosen": -2756.206298828125, "logps/rejected": -2056.339599609375, "loss": 3.7665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -126.36136627197266, "rewards/margins": 7.319228172302246, "rewards/rejected": -133.68060302734375, "step": 16580 }, { "epoch": 0.96, "grad_norm": 176.3284912109375, "learning_rate": 0.0006818762336003716, "logits/chosen": -13.818862915039062, "logits/rejected": -13.745976448059082, "logps/chosen": -2370.2265625, "logps/rejected": -2244.29638671875, "loss": 35.3872, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.69003295898438, "rewards/margins": -25.324993133544922, "rewards/rejected": -141.3650360107422, "step": 16590 }, { "epoch": 0.96, "grad_norm": 0.0002107664622599259, "learning_rate": 0.0006816827276597392, "logits/chosen": -8.676213264465332, "logits/rejected": -8.57557487487793, "logps/chosen": -3015.151611328125, "logps/rejected": -2732.71630859375, "loss": 1.2324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -47.027889251708984, "rewards/margins": 12.096793174743652, "rewards/rejected": -59.12468338012695, "step": 16600 }, { "epoch": 0.96, "grad_norm": 0.2931738793849945, "learning_rate": 0.0006814892217191068, "logits/chosen": -13.6966552734375, "logits/rejected": -13.66655158996582, "logps/chosen": -2608.177001953125, "logps/rejected": -2129.1953125, "loss": 17.3309, "rewards/accuracies": 0.5, "rewards/chosen": -187.52029418945312, "rewards/margins": -5.770147800445557, "rewards/rejected": -181.75013732910156, "step": 16610 }, { "epoch": 0.96, "grad_norm": 35.5658073425293, "learning_rate": 0.0006812957157784744, "logits/chosen": -14.963438034057617, "logits/rejected": -14.86530876159668, "logps/chosen": -2369.068603515625, "logps/rejected": -2402.16357421875, "loss": 1.7675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -148.8173065185547, "rewards/margins": 8.022869110107422, "rewards/rejected": -156.84017944335938, "step": 16620 }, { "epoch": 0.96, "grad_norm": 12.872346878051758, "learning_rate": 0.000681102209837842, "logits/chosen": -15.39850902557373, "logits/rejected": -15.211499214172363, "logps/chosen": -2687.197998046875, "logps/rejected": -2420.585693359375, "loss": 7.1257, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.17503356933594, "rewards/margins": 1.3984546661376953, "rewards/rejected": -130.573486328125, "step": 16630 }, { "epoch": 0.96, "grad_norm": 3.186861022186349e-06, "learning_rate": 0.0006809087038972098, "logits/chosen": -14.137407302856445, "logits/rejected": -14.383343696594238, "logps/chosen": -2690.54736328125, "logps/rejected": -2487.759033203125, "loss": 4.0933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.82449340820312, "rewards/margins": 9.999421119689941, "rewards/rejected": -167.82391357421875, "step": 16640 }, { "epoch": 0.96, "grad_norm": 5.260234470227942e-10, "learning_rate": 0.0006807151979565774, "logits/chosen": -14.345585823059082, "logits/rejected": -14.314886093139648, "logps/chosen": -2504.311767578125, "logps/rejected": -2797.65087890625, "loss": 6.36, "rewards/accuracies": 0.5, "rewards/chosen": -138.68417358398438, "rewards/margins": 2.694329023361206, "rewards/rejected": -141.37850952148438, "step": 16650 }, { "epoch": 0.96, "grad_norm": 0.0002900355902966112, "learning_rate": 0.0006805216920159449, "logits/chosen": -12.436959266662598, "logits/rejected": -12.870428085327148, "logps/chosen": -2883.930419921875, "logps/rejected": -2726.13134765625, "loss": 11.0049, "rewards/accuracies": 0.5, "rewards/chosen": -184.3879852294922, "rewards/margins": -4.25176477432251, "rewards/rejected": -180.13621520996094, "step": 16660 }, { "epoch": 0.96, "grad_norm": 72.09419250488281, "learning_rate": 0.0006803281860753125, "logits/chosen": -11.394247055053711, "logits/rejected": -11.353813171386719, "logps/chosen": -2552.486328125, "logps/rejected": -2452.8154296875, "loss": 6.5393, "rewards/accuracies": 0.5, "rewards/chosen": -139.2146453857422, "rewards/margins": -1.5214401483535767, "rewards/rejected": -137.69320678710938, "step": 16670 }, { "epoch": 0.97, "grad_norm": 65.271484375, "learning_rate": 0.0006801346801346801, "logits/chosen": -10.717644691467285, "logits/rejected": -10.346813201904297, "logps/chosen": -2606.337158203125, "logps/rejected": -2647.73681640625, "loss": 8.4986, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -148.2521514892578, "rewards/margins": 12.308769226074219, "rewards/rejected": -160.5609130859375, "step": 16680 }, { "epoch": 0.97, "grad_norm": 6.837909950263565e-06, "learning_rate": 0.0006799411741940477, "logits/chosen": -13.113987922668457, "logits/rejected": -13.194616317749023, "logps/chosen": -2796.05712890625, "logps/rejected": -2553.20263671875, "loss": 9.0544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.80865478515625, "rewards/margins": -5.685642242431641, "rewards/rejected": -169.12301635742188, "step": 16690 }, { "epoch": 0.97, "grad_norm": 0.0008733192225918174, "learning_rate": 0.0006797476682534154, "logits/chosen": -12.455240249633789, "logits/rejected": -12.361759185791016, "logps/chosen": -2556.512939453125, "logps/rejected": -2443.538330078125, "loss": 5.8472, "rewards/accuracies": 0.5, "rewards/chosen": -93.36135864257812, "rewards/margins": -1.351165533065796, "rewards/rejected": -92.01019287109375, "step": 16700 }, { "epoch": 0.97, "grad_norm": 129.9339599609375, "learning_rate": 0.000679554162312783, "logits/chosen": -14.388811111450195, "logits/rejected": -14.43421459197998, "logps/chosen": -2353.00048828125, "logps/rejected": -2389.1064453125, "loss": 5.9082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -128.23294067382812, "rewards/margins": 17.894145965576172, "rewards/rejected": -146.12710571289062, "step": 16710 }, { "epoch": 0.97, "grad_norm": 3.595375528675504e-05, "learning_rate": 0.0006793606563721506, "logits/chosen": -14.919901847839355, "logits/rejected": -15.021771430969238, "logps/chosen": -2536.017822265625, "logps/rejected": -2427.763427734375, "loss": 21.1046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.21914672851562, "rewards/margins": -1.916015625, "rewards/rejected": -145.30311584472656, "step": 16720 }, { "epoch": 0.97, "grad_norm": 87.43331146240234, "learning_rate": 0.0006791671504315182, "logits/chosen": -13.588859558105469, "logits/rejected": -13.364477157592773, "logps/chosen": -2384.41845703125, "logps/rejected": -2188.90283203125, "loss": 9.9191, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -167.3664093017578, "rewards/margins": -6.4131975173950195, "rewards/rejected": -160.95321655273438, "step": 16730 }, { "epoch": 0.97, "grad_norm": 3.739508867263794, "learning_rate": 0.0006789736444908858, "logits/chosen": -16.437156677246094, "logits/rejected": -16.203296661376953, "logps/chosen": -2481.30859375, "logps/rejected": -2294.42041015625, "loss": 15.174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -217.2021942138672, "rewards/margins": -11.4778470993042, "rewards/rejected": -205.72433471679688, "step": 16740 }, { "epoch": 0.97, "grad_norm": 3.7182562351226807, "learning_rate": 0.0006787801385502535, "logits/chosen": -15.348469734191895, "logits/rejected": -15.466934204101562, "logps/chosen": -2504.85400390625, "logps/rejected": -2318.871337890625, "loss": 18.2476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.10610961914062, "rewards/margins": -13.456914901733398, "rewards/rejected": -131.64920043945312, "step": 16750 }, { "epoch": 0.97, "grad_norm": 83.69182586669922, "learning_rate": 0.0006785866326096212, "logits/chosen": -12.542672157287598, "logits/rejected": -12.945425033569336, "logps/chosen": -2456.24658203125, "logps/rejected": -2301.718994140625, "loss": 19.1766, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.7076873779297, "rewards/margins": -3.692842483520508, "rewards/rejected": -133.01486206054688, "step": 16760 }, { "epoch": 0.97, "grad_norm": 0.013076064176857471, "learning_rate": 0.0006783931266689888, "logits/chosen": -12.191067695617676, "logits/rejected": -12.234603881835938, "logps/chosen": -2397.50341796875, "logps/rejected": -2223.829833984375, "loss": 6.2632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.550537109375, "rewards/margins": 1.025559663772583, "rewards/rejected": -133.5760955810547, "step": 16770 }, { "epoch": 0.97, "grad_norm": 29.687339782714844, "learning_rate": 0.0006781996207283564, "logits/chosen": -13.607586860656738, "logits/rejected": -13.437294006347656, "logps/chosen": -2515.04248046875, "logps/rejected": -2444.30322265625, "loss": 14.3343, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -140.14637756347656, "rewards/margins": 2.119776964187622, "rewards/rejected": -142.26614379882812, "step": 16780 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 0.000678006114787724, "logits/chosen": -12.337597846984863, "logits/rejected": -12.454219818115234, "logps/chosen": -2485.467041015625, "logps/rejected": -2062.3173828125, "loss": 8.1192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -78.00819396972656, "rewards/margins": 19.81414031982422, "rewards/rejected": -97.82234191894531, "step": 16790 }, { "epoch": 0.97, "grad_norm": 78.43843078613281, "learning_rate": 0.0006778126088470916, "logits/chosen": -15.099286079406738, "logits/rejected": -14.95274829864502, "logps/chosen": -2281.878173828125, "logps/rejected": -2307.796875, "loss": 9.7538, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.3614959716797, "rewards/margins": -0.33199042081832886, "rewards/rejected": -137.02951049804688, "step": 16800 }, { "epoch": 0.97, "grad_norm": 2.3702324077778556e-20, "learning_rate": 0.0006776191029064593, "logits/chosen": -14.32465648651123, "logits/rejected": -14.382473945617676, "logps/chosen": -2538.74609375, "logps/rejected": -2270.367431640625, "loss": 28.2987, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -184.9864501953125, "rewards/margins": -20.077831268310547, "rewards/rejected": -164.90859985351562, "step": 16810 }, { "epoch": 0.97, "grad_norm": 30.024362564086914, "learning_rate": 0.0006774255969658269, "logits/chosen": -14.594012260437012, "logits/rejected": -14.703702926635742, "logps/chosen": -2228.15087890625, "logps/rejected": -2176.98583984375, "loss": 3.2104, "rewards/accuracies": 0.5, "rewards/chosen": -151.7254638671875, "rewards/margins": 2.8021934032440186, "rewards/rejected": -154.5276641845703, "step": 16820 }, { "epoch": 0.97, "grad_norm": 72.63645935058594, "learning_rate": 0.0006772320910251945, "logits/chosen": -14.547704696655273, "logits/rejected": -14.802530288696289, "logps/chosen": -2589.533935546875, "logps/rejected": -2239.59912109375, "loss": 15.4898, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.53012084960938, "rewards/margins": -9.818792343139648, "rewards/rejected": -146.71133422851562, "step": 16830 }, { "epoch": 0.97, "grad_norm": 119.19662475585938, "learning_rate": 0.0006770385850845621, "logits/chosen": -13.815826416015625, "logits/rejected": -13.85180950164795, "logps/chosen": -2551.70361328125, "logps/rejected": -2586.87451171875, "loss": 11.4747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.90330505371094, "rewards/margins": 5.381098747253418, "rewards/rejected": -156.284423828125, "step": 16840 }, { "epoch": 0.98, "grad_norm": 37.83616256713867, "learning_rate": 0.0006768450791439298, "logits/chosen": -16.88599395751953, "logits/rejected": -17.2011661529541, "logps/chosen": -2278.680419921875, "logps/rejected": -2268.174072265625, "loss": 20.2266, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -164.4847869873047, "rewards/margins": 2.692915439605713, "rewards/rejected": -167.17770385742188, "step": 16850 }, { "epoch": 0.98, "grad_norm": 2.900034189224243, "learning_rate": 0.0006766515732032974, "logits/chosen": -16.941017150878906, "logits/rejected": -16.811880111694336, "logps/chosen": -2886.048828125, "logps/rejected": -2823.02490234375, "loss": 6.1397, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -175.3374786376953, "rewards/margins": -4.756732940673828, "rewards/rejected": -170.58071899414062, "step": 16860 }, { "epoch": 0.98, "grad_norm": 112.63742065429688, "learning_rate": 0.0006764580672626651, "logits/chosen": -16.048389434814453, "logits/rejected": -16.209217071533203, "logps/chosen": -3045.997314453125, "logps/rejected": -2637.104736328125, "loss": 13.6517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.99169921875, "rewards/margins": -1.7532222270965576, "rewards/rejected": -147.2384796142578, "step": 16870 }, { "epoch": 0.98, "grad_norm": 97.08785247802734, "learning_rate": 0.0006762645613220326, "logits/chosen": -17.884981155395508, "logits/rejected": -18.967763900756836, "logps/chosen": -2521.67138671875, "logps/rejected": -2240.962158203125, "loss": 11.0108, "rewards/accuracies": 0.5, "rewards/chosen": -143.65867614746094, "rewards/margins": 5.167122840881348, "rewards/rejected": -148.8258056640625, "step": 16880 }, { "epoch": 0.98, "grad_norm": 114.32598876953125, "learning_rate": 0.0006760710553814002, "logits/chosen": -12.037275314331055, "logits/rejected": -11.883509635925293, "logps/chosen": -3058.682861328125, "logps/rejected": -3004.990966796875, "loss": 13.0118, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -90.8962173461914, "rewards/margins": -5.2980265617370605, "rewards/rejected": -85.59819030761719, "step": 16890 }, { "epoch": 0.98, "grad_norm": 29.181615829467773, "learning_rate": 0.0006758775494407678, "logits/chosen": -16.795001983642578, "logits/rejected": -16.27951431274414, "logps/chosen": -2423.158447265625, "logps/rejected": -2149.99560546875, "loss": 26.3903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.38699340820312, "rewards/margins": -18.63875961303711, "rewards/rejected": -140.74822998046875, "step": 16900 }, { "epoch": 0.98, "grad_norm": 57.9212760925293, "learning_rate": 0.0006756840435001354, "logits/chosen": -13.42944622039795, "logits/rejected": -13.679290771484375, "logps/chosen": -2580.923095703125, "logps/rejected": -2344.09912109375, "loss": 22.5729, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -148.34915161132812, "rewards/margins": -18.598011016845703, "rewards/rejected": -129.75112915039062, "step": 16910 }, { "epoch": 0.98, "grad_norm": 1.8790042216768217e-19, "learning_rate": 0.000675490537559503, "logits/chosen": -12.32433032989502, "logits/rejected": -12.485708236694336, "logps/chosen": -2778.042236328125, "logps/rejected": -2502.634033203125, "loss": 7.377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.7613525390625, "rewards/margins": 4.04323673248291, "rewards/rejected": -142.80458068847656, "step": 16920 }, { "epoch": 0.98, "grad_norm": 384.73394775390625, "learning_rate": 0.0006752970316188707, "logits/chosen": -12.885581970214844, "logits/rejected": -12.894861221313477, "logps/chosen": -2463.14404296875, "logps/rejected": -2779.758056640625, "loss": 12.3373, "rewards/accuracies": 0.5, "rewards/chosen": -138.49404907226562, "rewards/margins": -3.702371120452881, "rewards/rejected": -134.7917022705078, "step": 16930 }, { "epoch": 0.98, "grad_norm": 0.10033156722784042, "learning_rate": 0.0006751035256782383, "logits/chosen": -14.678337097167969, "logits/rejected": -14.916885375976562, "logps/chosen": -2414.7119140625, "logps/rejected": -2664.140869140625, "loss": 6.0199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.68727111816406, "rewards/margins": 6.766829490661621, "rewards/rejected": -170.45411682128906, "step": 16940 }, { "epoch": 0.98, "grad_norm": 30.770170211791992, "learning_rate": 0.0006749100197376059, "logits/chosen": -11.787627220153809, "logits/rejected": -11.871720314025879, "logps/chosen": -2069.2890625, "logps/rejected": -2496.01806640625, "loss": 7.8102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.28868103027344, "rewards/margins": 4.450967311859131, "rewards/rejected": -116.73966217041016, "step": 16950 }, { "epoch": 0.98, "grad_norm": 13.736590385437012, "learning_rate": 0.0006747165137969736, "logits/chosen": -13.734273910522461, "logits/rejected": -14.243410110473633, "logps/chosen": -2611.44287109375, "logps/rejected": -2512.47412109375, "loss": 9.7525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.79074096679688, "rewards/margins": -5.475255489349365, "rewards/rejected": -127.31547546386719, "step": 16960 }, { "epoch": 0.98, "grad_norm": 2.727203796126787e-09, "learning_rate": 0.0006745230078563412, "logits/chosen": -13.002978324890137, "logits/rejected": -13.4213285446167, "logps/chosen": -2588.57763671875, "logps/rejected": -2446.275634765625, "loss": 6.4302, "rewards/accuracies": 0.5, "rewards/chosen": -125.4709701538086, "rewards/margins": 1.3494179248809814, "rewards/rejected": -126.82039642333984, "step": 16970 }, { "epoch": 0.98, "grad_norm": 122.61830139160156, "learning_rate": 0.0006743295019157089, "logits/chosen": -14.520822525024414, "logits/rejected": -14.199688911437988, "logps/chosen": -2504.513427734375, "logps/rejected": -1864.0777587890625, "loss": 22.395, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -170.92025756835938, "rewards/margins": -16.555286407470703, "rewards/rejected": -154.36495971679688, "step": 16980 }, { "epoch": 0.98, "grad_norm": 176.92735290527344, "learning_rate": 0.0006741359959750765, "logits/chosen": -13.797389030456543, "logits/rejected": -13.750303268432617, "logps/chosen": -2938.19140625, "logps/rejected": -2618.783935546875, "loss": 8.2591, "rewards/accuracies": 0.5, "rewards/chosen": -110.71437072753906, "rewards/margins": 9.490922927856445, "rewards/rejected": -120.20528411865234, "step": 16990 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 0.0006739424900344441, "logits/chosen": -16.48208236694336, "logits/rejected": -16.60817527770996, "logps/chosen": -2681.88232421875, "logps/rejected": -2455.029541015625, "loss": 10.8433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.8107147216797, "rewards/margins": 1.262568712234497, "rewards/rejected": -165.07327270507812, "step": 17000 }, { "epoch": 0.98, "grad_norm": 8.694901225680951e-07, "learning_rate": 0.0006737489840938117, "logits/chosen": -15.084131240844727, "logits/rejected": -14.82746410369873, "logps/chosen": -2375.18896484375, "logps/rejected": -2572.1689453125, "loss": 6.9591, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -195.58352661132812, "rewards/margins": 2.7608959674835205, "rewards/rejected": -198.34442138671875, "step": 17010 }, { "epoch": 0.99, "grad_norm": 15.921225547790527, "learning_rate": 0.0006735554781531793, "logits/chosen": -17.178468704223633, "logits/rejected": -17.022071838378906, "logps/chosen": -2371.767822265625, "logps/rejected": -2296.26513671875, "loss": 5.0174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.02261352539062, "rewards/margins": 1.144768476486206, "rewards/rejected": -205.1673583984375, "step": 17020 }, { "epoch": 0.99, "grad_norm": 241.23773193359375, "learning_rate": 0.0006733619722125469, "logits/chosen": -14.320116996765137, "logits/rejected": -14.387962341308594, "logps/chosen": -2633.228515625, "logps/rejected": -2645.310546875, "loss": 9.9813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.25057983398438, "rewards/margins": 5.690296649932861, "rewards/rejected": -140.9408721923828, "step": 17030 }, { "epoch": 0.99, "grad_norm": 91.83394622802734, "learning_rate": 0.0006731684662719146, "logits/chosen": -15.805951118469238, "logits/rejected": -14.811637878417969, "logps/chosen": -2295.8896484375, "logps/rejected": -2345.537109375, "loss": 0.2787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -163.6619110107422, "rewards/margins": 21.210750579833984, "rewards/rejected": -184.87266540527344, "step": 17040 }, { "epoch": 0.99, "grad_norm": 223.47796630859375, "learning_rate": 0.0006729749603312822, "logits/chosen": -12.051717758178711, "logits/rejected": -12.419837951660156, "logps/chosen": -2776.73974609375, "logps/rejected": -2587.34765625, "loss": 3.9857, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -109.80511474609375, "rewards/margins": 14.220906257629395, "rewards/rejected": -124.0260238647461, "step": 17050 }, { "epoch": 0.99, "grad_norm": 276.5036315917969, "learning_rate": 0.0006727814543906499, "logits/chosen": -16.66776466369629, "logits/rejected": -16.461164474487305, "logps/chosen": -2128.231689453125, "logps/rejected": -2245.6943359375, "loss": 4.3502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.17355346679688, "rewards/margins": 11.453645706176758, "rewards/rejected": -152.627197265625, "step": 17060 }, { "epoch": 0.99, "grad_norm": 42.635807037353516, "learning_rate": 0.0006725879484500175, "logits/chosen": -16.060226440429688, "logits/rejected": -16.331165313720703, "logps/chosen": -2263.824951171875, "logps/rejected": -2127.158447265625, "loss": 13.5075, "rewards/accuracies": 0.5, "rewards/chosen": -155.05702209472656, "rewards/margins": -6.302333354949951, "rewards/rejected": -148.75473022460938, "step": 17070 }, { "epoch": 0.99, "grad_norm": 66.44335174560547, "learning_rate": 0.0006723944425093851, "logits/chosen": -16.770784378051758, "logits/rejected": -16.704370498657227, "logps/chosen": -2624.398193359375, "logps/rejected": -2589.827392578125, "loss": 6.0829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.53253173828125, "rewards/margins": -1.4952976703643799, "rewards/rejected": -196.0372314453125, "step": 17080 }, { "epoch": 0.99, "grad_norm": 20.02997398376465, "learning_rate": 0.0006722009365687527, "logits/chosen": -13.273820877075195, "logits/rejected": -13.245272636413574, "logps/chosen": -2697.787841796875, "logps/rejected": -2787.85986328125, "loss": 10.2711, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -109.3180923461914, "rewards/margins": -0.11983184516429901, "rewards/rejected": -109.1982650756836, "step": 17090 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 0.0006720074306281203, "logits/chosen": -13.814939498901367, "logits/rejected": -13.772236824035645, "logps/chosen": -2664.5205078125, "logps/rejected": -2764.59033203125, "loss": 1.005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -63.534507751464844, "rewards/margins": 27.302715301513672, "rewards/rejected": -90.83722686767578, "step": 17100 }, { "epoch": 0.99, "grad_norm": 0.2526407539844513, "learning_rate": 0.0006718139246874879, "logits/chosen": -14.231698989868164, "logits/rejected": -14.421737670898438, "logps/chosen": -2842.59912109375, "logps/rejected": -2683.82177734375, "loss": 3.3672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.56510925292969, "rewards/margins": 10.911974906921387, "rewards/rejected": -121.47709655761719, "step": 17110 }, { "epoch": 0.99, "grad_norm": 430.9693908691406, "learning_rate": 0.0006716204187468555, "logits/chosen": -18.398134231567383, "logits/rejected": -18.551536560058594, "logps/chosen": -2828.5205078125, "logps/rejected": -2655.76806640625, "loss": 26.8564, "rewards/accuracies": 0.5, "rewards/chosen": -185.95034790039062, "rewards/margins": -20.121265411376953, "rewards/rejected": -165.82908630371094, "step": 17120 }, { "epoch": 0.99, "grad_norm": 160.62982177734375, "learning_rate": 0.0006714269128062231, "logits/chosen": -14.76313304901123, "logits/rejected": -14.705450057983398, "logps/chosen": -2581.36474609375, "logps/rejected": -2525.303955078125, "loss": 11.5399, "rewards/accuracies": 0.5, "rewards/chosen": -130.72393798828125, "rewards/margins": 8.953987121582031, "rewards/rejected": -139.6779327392578, "step": 17130 }, { "epoch": 0.99, "grad_norm": 0.0004648693429771811, "learning_rate": 0.0006712334068655907, "logits/chosen": -12.10771369934082, "logits/rejected": -12.13582706451416, "logps/chosen": -2943.03759765625, "logps/rejected": -2921.804931640625, "loss": 9.145, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -174.79202270507812, "rewards/margins": -0.362457275390625, "rewards/rejected": -174.4295654296875, "step": 17140 }, { "epoch": 0.99, "grad_norm": 114.40019226074219, "learning_rate": 0.0006710399009249583, "logits/chosen": -12.605180740356445, "logits/rejected": -12.616312980651855, "logps/chosen": -2355.890869140625, "logps/rejected": -2097.10107421875, "loss": 35.3477, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.1212158203125, "rewards/margins": -28.7901554107666, "rewards/rejected": -143.33108520507812, "step": 17150 }, { "epoch": 0.99, "grad_norm": 143.44715881347656, "learning_rate": 0.000670846394984326, "logits/chosen": -11.237001419067383, "logits/rejected": -11.347240447998047, "logps/chosen": -2830.151123046875, "logps/rejected": -2841.0625, "loss": 5.038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.3089141845703, "rewards/margins": 2.6672489643096924, "rewards/rejected": -203.97616577148438, "step": 17160 }, { "epoch": 0.99, "grad_norm": 83.10061645507812, "learning_rate": 0.0006706528890436937, "logits/chosen": -13.498046875, "logits/rejected": -13.3994722366333, "logps/chosen": -2460.065673828125, "logps/rejected": -2304.385498046875, "loss": 17.5882, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.16941833496094, "rewards/margins": -13.156023025512695, "rewards/rejected": -169.01339721679688, "step": 17170 }, { "epoch": 0.99, "grad_norm": 77.80581665039062, "learning_rate": 0.0006704593831030613, "logits/chosen": -12.72118091583252, "logits/rejected": -12.967729568481445, "logps/chosen": -2214.336669921875, "logps/rejected": -2294.56201171875, "loss": 5.1252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.30899047851562, "rewards/margins": 9.590337753295898, "rewards/rejected": -161.8993377685547, "step": 17180 }, { "epoch": 1.0, "grad_norm": 49.20290756225586, "learning_rate": 0.0006702658771624289, "logits/chosen": -11.704977035522461, "logits/rejected": -11.918642044067383, "logps/chosen": -2778.027099609375, "logps/rejected": -2325.96875, "loss": 12.1668, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -132.0320587158203, "rewards/margins": -8.511796951293945, "rewards/rejected": -123.520263671875, "step": 17190 }, { "epoch": 1.0, "grad_norm": 3.799505918777868e-07, "learning_rate": 0.0006700723712217965, "logits/chosen": -13.112833976745605, "logits/rejected": -13.062113761901855, "logps/chosen": -2084.1923828125, "logps/rejected": -2272.439453125, "loss": 9.8879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.3115997314453, "rewards/margins": 0.11806373298168182, "rewards/rejected": -128.42965698242188, "step": 17200 }, { "epoch": 1.0, "grad_norm": 5.346635413958184e-17, "learning_rate": 0.0006698788652811642, "logits/chosen": -11.374582290649414, "logits/rejected": -11.371338844299316, "logps/chosen": -2487.5361328125, "logps/rejected": -2228.05078125, "loss": 33.1783, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -133.48660278320312, "rewards/margins": -13.8687162399292, "rewards/rejected": -119.61787414550781, "step": 17210 }, { "epoch": 1.0, "grad_norm": 44.81690979003906, "learning_rate": 0.0006696853593405318, "logits/chosen": -15.787834167480469, "logits/rejected": -15.857721328735352, "logps/chosen": -2416.640625, "logps/rejected": -2296.707275390625, "loss": 10.5215, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -204.50955200195312, "rewards/margins": -7.318617343902588, "rewards/rejected": -197.19093322753906, "step": 17220 }, { "epoch": 1.0, "grad_norm": 200.02374267578125, "learning_rate": 0.0006694918533998994, "logits/chosen": -13.347536087036133, "logits/rejected": -13.226350784301758, "logps/chosen": -2920.37548828125, "logps/rejected": -2574.378173828125, "loss": 21.875, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -186.4380340576172, "rewards/margins": -18.8193416595459, "rewards/rejected": -167.61868286132812, "step": 17230 }, { "epoch": 1.0, "grad_norm": 138.8402862548828, "learning_rate": 0.000669298347459267, "logits/chosen": -13.51280403137207, "logits/rejected": -14.49821662902832, "logps/chosen": -2950.93798828125, "logps/rejected": -2571.595947265625, "loss": 20.9342, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -147.13381958007812, "rewards/margins": -14.91718578338623, "rewards/rejected": -132.21664428710938, "step": 17240 }, { "epoch": 1.0, "grad_norm": 104.24055480957031, "learning_rate": 0.0006691048415186346, "logits/chosen": -13.386533737182617, "logits/rejected": -13.636140823364258, "logps/chosen": -2701.829345703125, "logps/rejected": -2648.36669921875, "loss": 7.652, "rewards/accuracies": 0.5, "rewards/chosen": -130.30368041992188, "rewards/margins": -1.808272123336792, "rewards/rejected": -128.49539184570312, "step": 17250 }, { "epoch": 1.0, "grad_norm": 87.15492248535156, "learning_rate": 0.0006689113355780022, "logits/chosen": -11.677017211914062, "logits/rejected": -11.47572135925293, "logps/chosen": -2831.62646484375, "logps/rejected": -2923.576904296875, "loss": 7.0688, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -150.2016143798828, "rewards/margins": -6.475190162658691, "rewards/rejected": -143.7264404296875, "step": 17260 }, { "epoch": 1.0, "grad_norm": 73.02522277832031, "learning_rate": 0.00066871782963737, "logits/chosen": -13.01305103302002, "logits/rejected": -12.996095657348633, "logps/chosen": -2752.316650390625, "logps/rejected": -2606.21435546875, "loss": 10.0736, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -197.31068420410156, "rewards/margins": -7.609986782073975, "rewards/rejected": -189.70066833496094, "step": 17270 }, { "epoch": 1.0, "grad_norm": 108.61541748046875, "learning_rate": 0.0006685243236967376, "logits/chosen": -15.525837898254395, "logits/rejected": -14.94761848449707, "logps/chosen": -2845.23046875, "logps/rejected": -2759.33544921875, "loss": 9.2637, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -221.6757354736328, "rewards/margins": -7.228728294372559, "rewards/rejected": -214.447021484375, "step": 17280 }, { "epoch": 1.0, "grad_norm": 60.41061782836914, "learning_rate": 0.0006683308177561052, "logits/chosen": -13.726919174194336, "logits/rejected": -13.706561088562012, "logps/chosen": -2657.184326171875, "logps/rejected": -2669.644775390625, "loss": 9.6216, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -196.5580291748047, "rewards/margins": -0.9277328252792358, "rewards/rejected": -195.63027954101562, "step": 17290 }, { "epoch": 1.0, "grad_norm": 5.993386776453891e-12, "learning_rate": 0.0006681373118154728, "logits/chosen": -11.81205940246582, "logits/rejected": -12.269027709960938, "logps/chosen": -2839.21240234375, "logps/rejected": -2356.6865234375, "loss": 11.1644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.48079681396484, "rewards/margins": 1.2592684030532837, "rewards/rejected": -105.74005126953125, "step": 17300 }, { "epoch": 1.0, "grad_norm": 70.86190795898438, "learning_rate": 0.0006679438058748404, "logits/chosen": -13.344609260559082, "logits/rejected": -13.363784790039062, "logps/chosen": -3096.875, "logps/rejected": -3022.44482421875, "loss": 14.3261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.8410186767578, "rewards/margins": -7.812252998352051, "rewards/rejected": -182.0287628173828, "step": 17310 }, { "epoch": 1.0, "grad_norm": 105.93169403076172, "learning_rate": 0.0006677502999342079, "logits/chosen": -13.2353515625, "logits/rejected": -13.807426452636719, "logps/chosen": -2655.204833984375, "logps/rejected": -2411.57568359375, "loss": 6.8999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.1870574951172, "rewards/margins": 13.606008529663086, "rewards/rejected": -169.79306030273438, "step": 17320 }, { "epoch": 1.0, "grad_norm": 53.96027755737305, "learning_rate": 0.0006675567939935756, "logits/chosen": -14.505186080932617, "logits/rejected": -14.570291519165039, "logps/chosen": -2896.67529296875, "logps/rejected": -2790.944580078125, "loss": 12.3435, "rewards/accuracies": 0.5, "rewards/chosen": -247.027099609375, "rewards/margins": -9.153104782104492, "rewards/rejected": -237.8739776611328, "step": 17330 }, { "epoch": 1.0, "grad_norm": 3.900956153869629, "learning_rate": 0.0006673632880529432, "logits/chosen": -16.677349090576172, "logits/rejected": -16.83564567565918, "logps/chosen": -2455.63671875, "logps/rejected": -2174.39794921875, "loss": 9.7674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.29469299316406, "rewards/margins": -0.6659210324287415, "rewards/rejected": -188.6287841796875, "step": 17340 }, { "epoch": 1.0, "grad_norm": 1.907468152789625e-18, "learning_rate": 0.0006671697821123108, "logits/chosen": -13.180944442749023, "logits/rejected": -13.413419723510742, "logps/chosen": -2710.8203125, "logps/rejected": -2735.1865234375, "loss": 1.237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.91725158691406, "rewards/margins": 13.586176872253418, "rewards/rejected": -135.50343322753906, "step": 17350 }, { "epoch": 1.0, "grad_norm": 0.010352255776524544, "learning_rate": 0.0006669762761716784, "logits/chosen": -10.403585433959961, "logits/rejected": -10.443693161010742, "logps/chosen": -3085.607666015625, "logps/rejected": -2606.045654296875, "loss": 1.8378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.54405212402344, "rewards/margins": 21.220935821533203, "rewards/rejected": -109.7649917602539, "step": 17360 }, { "epoch": 1.01, "grad_norm": 0.09411859512329102, "learning_rate": 0.000666782770231046, "logits/chosen": -15.437784194946289, "logits/rejected": -15.737855911254883, "logps/chosen": -2757.2607421875, "logps/rejected": -2671.271240234375, "loss": 8.8929, "rewards/accuracies": 0.5, "rewards/chosen": -209.3268585205078, "rewards/margins": -4.34494686126709, "rewards/rejected": -204.9818878173828, "step": 17370 }, { "epoch": 1.01, "grad_norm": 7.883125263674629e-09, "learning_rate": 0.0006665892642904138, "logits/chosen": -17.485172271728516, "logits/rejected": -18.96499252319336, "logps/chosen": -2562.97412109375, "logps/rejected": -2554.662109375, "loss": 1.166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -226.50674438476562, "rewards/margins": 8.51398754119873, "rewards/rejected": -235.02072143554688, "step": 17380 }, { "epoch": 1.01, "grad_norm": 7.229784045742482e-12, "learning_rate": 0.0006663957583497814, "logits/chosen": -14.023443222045898, "logits/rejected": -13.8378324508667, "logps/chosen": -2683.75244140625, "logps/rejected": -2711.02294921875, "loss": 4.546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.76620483398438, "rewards/margins": 10.634353637695312, "rewards/rejected": -174.4005584716797, "step": 17390 }, { "epoch": 1.01, "grad_norm": 472.00384521484375, "learning_rate": 0.000666202252409149, "logits/chosen": -13.824790954589844, "logits/rejected": -13.69348430633545, "logps/chosen": -2509.62060546875, "logps/rejected": -2756.604736328125, "loss": 8.3589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -96.08851623535156, "rewards/margins": 0.16709861159324646, "rewards/rejected": -96.25563049316406, "step": 17400 }, { "epoch": 1.01, "grad_norm": 0.0, "learning_rate": 0.0006660087464685166, "logits/chosen": -13.964184761047363, "logits/rejected": -14.251848220825195, "logps/chosen": -3058.86865234375, "logps/rejected": -2883.4033203125, "loss": 11.8776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.9281768798828, "rewards/margins": 14.518415451049805, "rewards/rejected": -210.4465789794922, "step": 17410 }, { "epoch": 1.01, "grad_norm": 47.47995376586914, "learning_rate": 0.0006658152405278842, "logits/chosen": -13.243687629699707, "logits/rejected": -13.313191413879395, "logps/chosen": -2682.03759765625, "logps/rejected": -2856.05615234375, "loss": 6.7229, "rewards/accuracies": 0.5, "rewards/chosen": -186.1696319580078, "rewards/margins": -1.3300774097442627, "rewards/rejected": -184.8395538330078, "step": 17420 }, { "epoch": 1.01, "grad_norm": 0.015443410724401474, "learning_rate": 0.0006656217345872518, "logits/chosen": -10.23155403137207, "logits/rejected": -10.669511795043945, "logps/chosen": -3283.61474609375, "logps/rejected": -3137.271484375, "loss": 4.9074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.7535171508789, "rewards/margins": 10.785618782043457, "rewards/rejected": -120.53913879394531, "step": 17430 }, { "epoch": 1.01, "grad_norm": 74.42005920410156, "learning_rate": 0.0006654282286466195, "logits/chosen": -9.628753662109375, "logits/rejected": -9.715497016906738, "logps/chosen": -3234.935302734375, "logps/rejected": -2999.59716796875, "loss": 4.8305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -96.92827606201172, "rewards/margins": 4.767801761627197, "rewards/rejected": -101.69608306884766, "step": 17440 }, { "epoch": 1.01, "grad_norm": 1.4228314082487259e-09, "learning_rate": 0.0006652347227059871, "logits/chosen": -11.48670768737793, "logits/rejected": -11.642538070678711, "logps/chosen": -3166.16796875, "logps/rejected": -3024.86328125, "loss": 4.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -92.56138610839844, "rewards/margins": 7.0547637939453125, "rewards/rejected": -99.61615753173828, "step": 17450 }, { "epoch": 1.01, "grad_norm": 46.53230285644531, "learning_rate": 0.0006650412167653547, "logits/chosen": -13.90623664855957, "logits/rejected": -14.564390182495117, "logps/chosen": -3114.500732421875, "logps/rejected": -2510.20556640625, "loss": 29.4367, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.96182250976562, "rewards/margins": -25.437366485595703, "rewards/rejected": -147.52442932128906, "step": 17460 }, { "epoch": 1.01, "grad_norm": 36.692909240722656, "learning_rate": 0.0006648477108247223, "logits/chosen": -12.211560249328613, "logits/rejected": -12.19792652130127, "logps/chosen": -2843.139404296875, "logps/rejected": -2771.014404296875, "loss": 3.9191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -74.42642974853516, "rewards/margins": 14.33134937286377, "rewards/rejected": -88.7577896118164, "step": 17470 }, { "epoch": 1.01, "grad_norm": 14.95302677154541, "learning_rate": 0.00066465420488409, "logits/chosen": -20.584680557250977, "logits/rejected": -20.73858070373535, "logps/chosen": -2603.29248046875, "logps/rejected": -2792.928466796875, "loss": 5.4417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -206.8356170654297, "rewards/margins": 9.77924919128418, "rewards/rejected": -216.61489868164062, "step": 17480 }, { "epoch": 1.01, "grad_norm": 91.05277252197266, "learning_rate": 0.0006644606989434577, "logits/chosen": -20.72182846069336, "logits/rejected": -22.717208862304688, "logps/chosen": -2780.970703125, "logps/rejected": -2551.84912109375, "loss": 13.2717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.1885986328125, "rewards/margins": 7.268230438232422, "rewards/rejected": -207.4568328857422, "step": 17490 }, { "epoch": 1.01, "grad_norm": 54.95801544189453, "learning_rate": 0.0006642671930028253, "logits/chosen": -22.16752815246582, "logits/rejected": -21.636741638183594, "logps/chosen": -2801.22509765625, "logps/rejected": -2879.72509765625, "loss": 5.8043, "rewards/accuracies": 0.5, "rewards/chosen": -215.0410919189453, "rewards/margins": 0.6061380505561829, "rewards/rejected": -215.64724731445312, "step": 17500 }, { "epoch": 1.01, "grad_norm": 79.731201171875, "learning_rate": 0.0006640736870621929, "logits/chosen": -15.934492111206055, "logits/rejected": -15.83709716796875, "logps/chosen": -2849.66357421875, "logps/rejected": -2751.96826171875, "loss": 13.4508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.78582763671875, "rewards/margins": -2.7020225524902344, "rewards/rejected": -119.08380126953125, "step": 17510 }, { "epoch": 1.01, "grad_norm": 161.7686309814453, "learning_rate": 0.0006638801811215605, "logits/chosen": -18.44345474243164, "logits/rejected": -19.036680221557617, "logps/chosen": -2597.520263671875, "logps/rejected": -2357.91064453125, "loss": 31.426, "rewards/accuracies": 0.5, "rewards/chosen": -160.41937255859375, "rewards/margins": -25.351276397705078, "rewards/rejected": -135.06808471679688, "step": 17520 }, { "epoch": 1.01, "grad_norm": 1.680837869644165, "learning_rate": 0.000663686675180928, "logits/chosen": -15.063055038452148, "logits/rejected": -15.193560600280762, "logps/chosen": -2936.155517578125, "logps/rejected": -2783.38427734375, "loss": 9.3244, "rewards/accuracies": 0.5, "rewards/chosen": -183.29830932617188, "rewards/margins": -5.043107032775879, "rewards/rejected": -178.25521850585938, "step": 17530 }, { "epoch": 1.02, "grad_norm": 94.82209014892578, "learning_rate": 0.0006634931692402956, "logits/chosen": -15.677334785461426, "logits/rejected": -15.754809379577637, "logps/chosen": -2846.2685546875, "logps/rejected": -2861.248779296875, "loss": 9.2018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.7021026611328, "rewards/margins": -1.1162160634994507, "rewards/rejected": -167.5858917236328, "step": 17540 }, { "epoch": 1.02, "grad_norm": 0.16096284985542297, "learning_rate": 0.0006632996632996632, "logits/chosen": -16.122207641601562, "logits/rejected": -16.55355453491211, "logps/chosen": -2962.698486328125, "logps/rejected": -2899.833251953125, "loss": 4.1481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -255.9418182373047, "rewards/margins": 10.254496574401855, "rewards/rejected": -266.1963195800781, "step": 17550 }, { "epoch": 1.02, "grad_norm": 99.73542022705078, "learning_rate": 0.0006631061573590309, "logits/chosen": -14.457353591918945, "logits/rejected": -14.383842468261719, "logps/chosen": -2808.69189453125, "logps/rejected": -2617.04638671875, "loss": 22.2548, "rewards/accuracies": 0.5, "rewards/chosen": -203.52206420898438, "rewards/margins": -14.830116271972656, "rewards/rejected": -188.69195556640625, "step": 17560 }, { "epoch": 1.02, "grad_norm": 1123.2705078125, "learning_rate": 0.0006629126514183985, "logits/chosen": -10.125015258789062, "logits/rejected": -10.189727783203125, "logps/chosen": -3539.895263671875, "logps/rejected": -2976.2880859375, "loss": 35.4571, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -141.73355102539062, "rewards/margins": -21.306344985961914, "rewards/rejected": -120.42720031738281, "step": 17570 }, { "epoch": 1.02, "grad_norm": 0.1952313631772995, "learning_rate": 0.0006627191454777661, "logits/chosen": -12.821592330932617, "logits/rejected": -12.9094820022583, "logps/chosen": -3062.50244140625, "logps/rejected": -2760.822021484375, "loss": 7.6953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -86.6431655883789, "rewards/margins": 8.047589302062988, "rewards/rejected": -94.69075775146484, "step": 17580 }, { "epoch": 1.02, "grad_norm": 14.039278984069824, "learning_rate": 0.0006625256395371338, "logits/chosen": -20.188617706298828, "logits/rejected": -20.13604164123535, "logps/chosen": -2487.5048828125, "logps/rejected": -2367.00244140625, "loss": 26.7322, "rewards/accuracies": 0.5, "rewards/chosen": -218.6527862548828, "rewards/margins": -7.4902143478393555, "rewards/rejected": -211.16256713867188, "step": 17590 }, { "epoch": 1.02, "grad_norm": 0.0022299839183688164, "learning_rate": 0.0006623321335965014, "logits/chosen": -22.28093147277832, "logits/rejected": -22.159683227539062, "logps/chosen": -2372.795654296875, "logps/rejected": -2094.630615234375, "loss": 30.0713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -211.3531036376953, "rewards/margins": -18.73554039001465, "rewards/rejected": -192.6175537109375, "step": 17600 }, { "epoch": 1.02, "grad_norm": 3.733696085206334e-17, "learning_rate": 0.0006621386276558691, "logits/chosen": -13.965087890625, "logits/rejected": -14.234869003295898, "logps/chosen": -2626.229248046875, "logps/rejected": -2198.068359375, "loss": 4.0881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -81.115234375, "rewards/margins": 17.36105728149414, "rewards/rejected": -98.47628021240234, "step": 17610 }, { "epoch": 1.02, "grad_norm": 71.615966796875, "learning_rate": 0.0006619451217152367, "logits/chosen": -18.85550308227539, "logits/rejected": -18.84213638305664, "logps/chosen": -2501.74951171875, "logps/rejected": -2430.758056640625, "loss": 6.7885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.19100952148438, "rewards/margins": 4.474759578704834, "rewards/rejected": -185.66578674316406, "step": 17620 }, { "epoch": 1.02, "grad_norm": 4.376558622667682e-16, "learning_rate": 0.0006617516157746043, "logits/chosen": -13.722312927246094, "logits/rejected": -14.754066467285156, "logps/chosen": -2764.142822265625, "logps/rejected": -2602.4013671875, "loss": 4.8261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.42513275146484, "rewards/margins": 14.215604782104492, "rewards/rejected": -112.64073181152344, "step": 17630 }, { "epoch": 1.02, "grad_norm": 0.06914792209863663, "learning_rate": 0.0006615581098339719, "logits/chosen": -19.59287452697754, "logits/rejected": -19.406368255615234, "logps/chosen": -2394.12255859375, "logps/rejected": -2276.4072265625, "loss": 11.581, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -169.8548583984375, "rewards/margins": -7.163933753967285, "rewards/rejected": -162.69093322753906, "step": 17640 }, { "epoch": 1.02, "grad_norm": 29.84234619140625, "learning_rate": 0.0006613646038933395, "logits/chosen": -19.744922637939453, "logits/rejected": -20.322683334350586, "logps/chosen": -2334.935791015625, "logps/rejected": -2334.316650390625, "loss": 6.5686, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -168.79823303222656, "rewards/margins": 2.514965057373047, "rewards/rejected": -171.31320190429688, "step": 17650 }, { "epoch": 1.02, "grad_norm": 106.04875946044922, "learning_rate": 0.0006611710979527071, "logits/chosen": -19.734947204589844, "logits/rejected": -19.71201515197754, "logps/chosen": -2435.571044921875, "logps/rejected": -1964.8675537109375, "loss": 33.5294, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -187.8620147705078, "rewards/margins": -24.091060638427734, "rewards/rejected": -163.77096557617188, "step": 17660 }, { "epoch": 1.02, "grad_norm": 3.379559220020947e-17, "learning_rate": 0.0006609775920120748, "logits/chosen": -12.852276802062988, "logits/rejected": -12.781783103942871, "logps/chosen": -2740.19970703125, "logps/rejected": -2564.426513671875, "loss": 10.501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.905635833740234, "rewards/margins": 7.502333641052246, "rewards/rejected": -32.40797424316406, "step": 17670 }, { "epoch": 1.02, "grad_norm": 104.05149841308594, "learning_rate": 0.0006607840860714424, "logits/chosen": -15.08366870880127, "logits/rejected": -15.498852729797363, "logps/chosen": -2483.12548828125, "logps/rejected": -2045.9918212890625, "loss": 24.2018, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -144.44674682617188, "rewards/margins": -13.153033256530762, "rewards/rejected": -131.29371643066406, "step": 17680 }, { "epoch": 1.02, "grad_norm": 0.0, "learning_rate": 0.0006605905801308101, "logits/chosen": -18.278093338012695, "logits/rejected": -18.26030731201172, "logps/chosen": -2430.2060546875, "logps/rejected": -2233.0546875, "loss": 2.6931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -169.03451538085938, "rewards/margins": 18.708450317382812, "rewards/rejected": -187.74298095703125, "step": 17690 }, { "epoch": 1.02, "grad_norm": 1.8689346688915975e-05, "learning_rate": 0.0006603970741901777, "logits/chosen": -16.257373809814453, "logits/rejected": -16.584632873535156, "logps/chosen": -2842.4580078125, "logps/rejected": -2681.01611328125, "loss": 13.8578, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -179.13821411132812, "rewards/margins": -4.554266929626465, "rewards/rejected": -174.58395385742188, "step": 17700 }, { "epoch": 1.03, "grad_norm": 0.5948460102081299, "learning_rate": 0.0006602035682495453, "logits/chosen": -16.724040985107422, "logits/rejected": -16.54261589050293, "logps/chosen": -2746.94921875, "logps/rejected": -2369.412353515625, "loss": 15.4205, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -201.00747680664062, "rewards/margins": -10.222067832946777, "rewards/rejected": -190.78543090820312, "step": 17710 }, { "epoch": 1.03, "grad_norm": 1.4543647353093547e-07, "learning_rate": 0.000660010062308913, "logits/chosen": -13.790922164916992, "logits/rejected": -13.87816333770752, "logps/chosen": -2737.460693359375, "logps/rejected": -2574.682861328125, "loss": 6.1195, "rewards/accuracies": 0.5, "rewards/chosen": -77.64704132080078, "rewards/margins": 6.565042018890381, "rewards/rejected": -84.21208190917969, "step": 17720 }, { "epoch": 1.03, "grad_norm": 0.17152923345565796, "learning_rate": 0.0006598165563682806, "logits/chosen": -18.250246047973633, "logits/rejected": -18.08517837524414, "logps/chosen": -2673.9306640625, "logps/rejected": -2737.135498046875, "loss": 5.9523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.79837036132812, "rewards/margins": 3.138958692550659, "rewards/rejected": -162.93731689453125, "step": 17730 }, { "epoch": 1.03, "grad_norm": 1.917420604513609e-06, "learning_rate": 0.0006596230504276482, "logits/chosen": -19.275712966918945, "logits/rejected": -19.96891212463379, "logps/chosen": -2738.02490234375, "logps/rejected": -2599.197998046875, "loss": 22.7811, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.2067413330078, "rewards/margins": -18.61920928955078, "rewards/rejected": -163.58753967285156, "step": 17740 }, { "epoch": 1.03, "grad_norm": 111.98210906982422, "learning_rate": 0.0006594295444870157, "logits/chosen": -14.39684009552002, "logits/rejected": -14.877850532531738, "logps/chosen": -3056.06884765625, "logps/rejected": -2764.86083984375, "loss": 10.0931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -151.6201934814453, "rewards/margins": -1.8737472295761108, "rewards/rejected": -149.74642944335938, "step": 17750 }, { "epoch": 1.03, "grad_norm": 1231.081298828125, "learning_rate": 0.0006592360385463833, "logits/chosen": -14.023541450500488, "logits/rejected": -14.012384414672852, "logps/chosen": -2482.703125, "logps/rejected": -2854.33935546875, "loss": 12.1875, "rewards/accuracies": 0.5, "rewards/chosen": -140.2369384765625, "rewards/margins": 4.780575752258301, "rewards/rejected": -145.0175323486328, "step": 17760 }, { "epoch": 1.03, "grad_norm": 2742.236572265625, "learning_rate": 0.0006590425326057509, "logits/chosen": -13.17841911315918, "logits/rejected": -13.033685684204102, "logps/chosen": -6844.54541015625, "logps/rejected": -7920.36083984375, "loss": 75.2435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -604.5906982421875, "rewards/margins": 80.44258117675781, "rewards/rejected": -685.0332641601562, "step": 17770 }, { "epoch": 1.03, "grad_norm": 0.008358840830624104, "learning_rate": 0.0006588490266651186, "logits/chosen": -10.959142684936523, "logits/rejected": -10.992681503295898, "logps/chosen": -5630.9443359375, "logps/rejected": -5435.75830078125, "loss": 60.0124, "rewards/accuracies": 0.5, "rewards/chosen": -376.5745544433594, "rewards/margins": -39.01749038696289, "rewards/rejected": -337.5570983886719, "step": 17780 }, { "epoch": 1.03, "grad_norm": 0.6539702415466309, "learning_rate": 0.0006586555207244862, "logits/chosen": -17.03015899658203, "logits/rejected": -17.508106231689453, "logps/chosen": -2421.60693359375, "logps/rejected": -2345.713623046875, "loss": 8.7089, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -144.42921447753906, "rewards/margins": -7.640832424163818, "rewards/rejected": -136.7883758544922, "step": 17790 }, { "epoch": 1.03, "grad_norm": 99.52996826171875, "learning_rate": 0.0006584620147838539, "logits/chosen": -18.989700317382812, "logits/rejected": -20.691852569580078, "logps/chosen": -2734.974853515625, "logps/rejected": -2587.062255859375, "loss": 15.129, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -173.14761352539062, "rewards/margins": -12.141853332519531, "rewards/rejected": -161.00576782226562, "step": 17800 }, { "epoch": 1.03, "grad_norm": 87.49645233154297, "learning_rate": 0.0006582685088432215, "logits/chosen": -19.157123565673828, "logits/rejected": -19.585750579833984, "logps/chosen": -2639.051025390625, "logps/rejected": -2430.7314453125, "loss": 15.2647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.78959655761719, "rewards/margins": 2.768346071243286, "rewards/rejected": -128.5579376220703, "step": 17810 }, { "epoch": 1.03, "grad_norm": 22.517663955688477, "learning_rate": 0.0006580750029025891, "logits/chosen": -17.90814971923828, "logits/rejected": -18.150981903076172, "logps/chosen": -2692.14599609375, "logps/rejected": -2566.784912109375, "loss": 1.4216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.61369323730469, "rewards/margins": 10.973971366882324, "rewards/rejected": -115.5876693725586, "step": 17820 }, { "epoch": 1.03, "grad_norm": 33.71894454956055, "learning_rate": 0.0006578814969619567, "logits/chosen": -14.572375297546387, "logits/rejected": -14.237896919250488, "logps/chosen": -2476.408203125, "logps/rejected": -2882.01416015625, "loss": 12.2924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -38.06282043457031, "rewards/margins": 7.985507965087891, "rewards/rejected": -46.04833221435547, "step": 17830 }, { "epoch": 1.03, "grad_norm": 6.470327207352966e-05, "learning_rate": 0.0006576879910213244, "logits/chosen": -13.920690536499023, "logits/rejected": -13.934762954711914, "logps/chosen": -2839.69482421875, "logps/rejected": -2763.318359375, "loss": 7.4091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -79.15858459472656, "rewards/margins": 5.098099231719971, "rewards/rejected": -84.25666809082031, "step": 17840 }, { "epoch": 1.03, "grad_norm": 0.003500445745885372, "learning_rate": 0.000657494485080692, "logits/chosen": -16.798830032348633, "logits/rejected": -16.833303451538086, "logps/chosen": -2848.56787109375, "logps/rejected": -2879.74658203125, "loss": 5.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.31631469726562, "rewards/margins": 7.444512367248535, "rewards/rejected": -175.76083374023438, "step": 17850 }, { "epoch": 1.03, "grad_norm": 34.23456573486328, "learning_rate": 0.0006573009791400596, "logits/chosen": -17.76957893371582, "logits/rejected": -18.002023696899414, "logps/chosen": -2636.969482421875, "logps/rejected": -2622.617431640625, "loss": 4.9274, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -162.1955108642578, "rewards/margins": -0.8667789697647095, "rewards/rejected": -161.32870483398438, "step": 17860 }, { "epoch": 1.03, "grad_norm": 50.334537506103516, "learning_rate": 0.0006571074731994272, "logits/chosen": -18.403362274169922, "logits/rejected": -18.182315826416016, "logps/chosen": -2908.449951171875, "logps/rejected": -2755.63671875, "loss": 3.8419, "rewards/accuracies": 0.5, "rewards/chosen": -170.08297729492188, "rewards/margins": 10.978769302368164, "rewards/rejected": -181.06173706054688, "step": 17870 }, { "epoch": 1.03, "grad_norm": 34.82969284057617, "learning_rate": 0.0006569139672587948, "logits/chosen": -19.983449935913086, "logits/rejected": -19.845388412475586, "logps/chosen": -2041.8818359375, "logps/rejected": -2079.51611328125, "loss": 17.4871, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -176.94451904296875, "rewards/margins": 2.113325595855713, "rewards/rejected": -179.05784606933594, "step": 17880 }, { "epoch": 1.04, "grad_norm": 0.0, "learning_rate": 0.0006567204613181624, "logits/chosen": -20.142324447631836, "logits/rejected": -20.2545166015625, "logps/chosen": -2598.561279296875, "logps/rejected": -2543.634765625, "loss": 19.8208, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -225.25338745117188, "rewards/margins": -1.9515670537948608, "rewards/rejected": -223.3018035888672, "step": 17890 }, { "epoch": 1.04, "grad_norm": 0.0003966184158343822, "learning_rate": 0.0006565269553775302, "logits/chosen": -18.155921936035156, "logits/rejected": -18.140499114990234, "logps/chosen": -2440.546630859375, "logps/rejected": -2511.336669921875, "loss": 4.3088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -138.8961944580078, "rewards/margins": 10.01477336883545, "rewards/rejected": -148.91098022460938, "step": 17900 }, { "epoch": 1.04, "grad_norm": 102.61849212646484, "learning_rate": 0.0006563334494368978, "logits/chosen": -13.245721817016602, "logits/rejected": -13.261129379272461, "logps/chosen": -2803.1552734375, "logps/rejected": -2627.22607421875, "loss": 15.2831, "rewards/accuracies": 0.5, "rewards/chosen": -78.31920623779297, "rewards/margins": 0.27148762345314026, "rewards/rejected": -78.59069061279297, "step": 17910 }, { "epoch": 1.04, "grad_norm": 24.68962287902832, "learning_rate": 0.0006561399434962654, "logits/chosen": -19.664623260498047, "logits/rejected": -19.711889266967773, "logps/chosen": -2628.63427734375, "logps/rejected": -2670.437744140625, "loss": 6.3763, "rewards/accuracies": 0.5, "rewards/chosen": -192.92843627929688, "rewards/margins": 6.498021125793457, "rewards/rejected": -199.42648315429688, "step": 17920 }, { "epoch": 1.04, "grad_norm": 24.502281188964844, "learning_rate": 0.000655946437555633, "logits/chosen": -16.269582748413086, "logits/rejected": -16.387666702270508, "logps/chosen": -2891.2626953125, "logps/rejected": -2453.90087890625, "loss": 0.9863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -116.16729736328125, "rewards/margins": 14.064977645874023, "rewards/rejected": -130.23226928710938, "step": 17930 }, { "epoch": 1.04, "grad_norm": 1.545037865638733, "learning_rate": 0.0006557529316150006, "logits/chosen": -16.012393951416016, "logits/rejected": -16.103450775146484, "logps/chosen": -2668.552734375, "logps/rejected": -2426.701904296875, "loss": 17.4004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.01263427734375, "rewards/margins": -9.724967956542969, "rewards/rejected": -142.2876739501953, "step": 17940 }, { "epoch": 1.04, "grad_norm": 0.00018300407100468874, "learning_rate": 0.0006555594256743683, "logits/chosen": -17.818279266357422, "logits/rejected": -18.136028289794922, "logps/chosen": -2534.72412109375, "logps/rejected": -2360.069580078125, "loss": 8.7872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.36056518554688, "rewards/margins": 0.28185349702835083, "rewards/rejected": -170.6424102783203, "step": 17950 }, { "epoch": 1.04, "grad_norm": 0.09127259254455566, "learning_rate": 0.0006553659197337359, "logits/chosen": -12.784310340881348, "logits/rejected": -12.809232711791992, "logps/chosen": -3067.268798828125, "logps/rejected": -2913.534912109375, "loss": 11.7261, "rewards/accuracies": 0.5, "rewards/chosen": -72.84140014648438, "rewards/margins": -3.8006834983825684, "rewards/rejected": -69.04072570800781, "step": 17960 }, { "epoch": 1.04, "grad_norm": 2.2679953417537035e-06, "learning_rate": 0.0006551724137931034, "logits/chosen": -15.233316421508789, "logits/rejected": -14.990638732910156, "logps/chosen": -2369.342041015625, "logps/rejected": -2391.49365234375, "loss": 20.5517, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.10045623779297, "rewards/margins": -12.905674934387207, "rewards/rejected": -86.19478607177734, "step": 17970 }, { "epoch": 1.04, "grad_norm": 10.870464324951172, "learning_rate": 0.000654978907852471, "logits/chosen": -17.121173858642578, "logits/rejected": -17.62735939025879, "logps/chosen": -2334.56591796875, "logps/rejected": -2091.011962890625, "loss": 20.4746, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -160.8229217529297, "rewards/margins": -17.832597732543945, "rewards/rejected": -142.99032592773438, "step": 17980 }, { "epoch": 1.04, "grad_norm": 29.614803314208984, "learning_rate": 0.0006547854019118386, "logits/chosen": -17.747955322265625, "logits/rejected": -16.922176361083984, "logps/chosen": -2849.87255859375, "logps/rejected": -2793.995849609375, "loss": 2.2778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.3944320678711, "rewards/margins": 11.033716201782227, "rewards/rejected": -132.4281463623047, "step": 17990 }, { "epoch": 1.04, "grad_norm": 4.143230744791835e-15, "learning_rate": 0.0006545918959712062, "logits/chosen": -19.875152587890625, "logits/rejected": -19.82231330871582, "logps/chosen": -2333.53955078125, "logps/rejected": -2321.361083984375, "loss": 2.5431, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -169.7103271484375, "rewards/margins": 27.190567016601562, "rewards/rejected": -196.90090942382812, "step": 18000 }, { "epoch": 1.04, "grad_norm": 6.957010191399604e-05, "learning_rate": 0.000654398390030574, "logits/chosen": -18.670856475830078, "logits/rejected": -19.091552734375, "logps/chosen": -2573.2890625, "logps/rejected": -2379.01513671875, "loss": 11.27, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.80691528320312, "rewards/margins": 10.67328929901123, "rewards/rejected": -161.48019409179688, "step": 18010 }, { "epoch": 1.04, "grad_norm": 0.6652073860168457, "learning_rate": 0.0006542048840899416, "logits/chosen": -20.28523063659668, "logits/rejected": -20.9559326171875, "logps/chosen": -2901.73876953125, "logps/rejected": -2613.048095703125, "loss": 1.4205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -116.46449279785156, "rewards/margins": 17.209009170532227, "rewards/rejected": -133.67349243164062, "step": 18020 }, { "epoch": 1.04, "grad_norm": 8.862825779942796e-06, "learning_rate": 0.0006540113781493092, "logits/chosen": -22.228694915771484, "logits/rejected": -22.43842887878418, "logps/chosen": -2648.78857421875, "logps/rejected": -2457.345703125, "loss": 5.8735, "rewards/accuracies": 0.5, "rewards/chosen": -193.37796020507812, "rewards/margins": 6.278349876403809, "rewards/rejected": -199.65631103515625, "step": 18030 }, { "epoch": 1.04, "grad_norm": 1.6859407966918205e-11, "learning_rate": 0.0006538178722086768, "logits/chosen": -15.237470626831055, "logits/rejected": -15.499791145324707, "logps/chosen": -2640.771484375, "logps/rejected": -2570.847900390625, "loss": 8.5085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.07308197021484, "rewards/margins": 2.007110595703125, "rewards/rejected": -90.08019256591797, "step": 18040 }, { "epoch": 1.04, "grad_norm": 3.493600606918335, "learning_rate": 0.0006536243662680444, "logits/chosen": -13.844995498657227, "logits/rejected": -13.91840934753418, "logps/chosen": -2371.8173828125, "logps/rejected": -2442.843505859375, "loss": 5.9767, "rewards/accuracies": 0.5, "rewards/chosen": -168.07701110839844, "rewards/margins": -0.10077209770679474, "rewards/rejected": -167.9762420654297, "step": 18050 }, { "epoch": 1.05, "grad_norm": 1.3376915894425245e-11, "learning_rate": 0.000653430860327412, "logits/chosen": -17.15746307373047, "logits/rejected": -16.268930435180664, "logps/chosen": -2460.80615234375, "logps/rejected": -2409.262939453125, "loss": 18.7636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.71829223632812, "rewards/margins": 3.0829224586486816, "rewards/rejected": -176.80120849609375, "step": 18060 }, { "epoch": 1.05, "grad_norm": 1.1506161691834937e-12, "learning_rate": 0.0006532373543867797, "logits/chosen": -13.249155044555664, "logits/rejected": -13.247228622436523, "logps/chosen": -2594.248291015625, "logps/rejected": -2640.721923828125, "loss": 9.9994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -114.76307678222656, "rewards/margins": 4.191125392913818, "rewards/rejected": -118.9542007446289, "step": 18070 }, { "epoch": 1.05, "grad_norm": 1.1549410733335819e-10, "learning_rate": 0.0006530438484461473, "logits/chosen": -14.266204833984375, "logits/rejected": -14.280769348144531, "logps/chosen": -2655.161376953125, "logps/rejected": -2585.276123046875, "loss": 3.1355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -133.6451416015625, "rewards/margins": 5.528804779052734, "rewards/rejected": -139.17396545410156, "step": 18080 }, { "epoch": 1.05, "grad_norm": 65.55862426757812, "learning_rate": 0.0006528503425055149, "logits/chosen": -17.749900817871094, "logits/rejected": -17.884082794189453, "logps/chosen": -2475.465576171875, "logps/rejected": -2540.885009765625, "loss": 11.8471, "rewards/accuracies": 0.5, "rewards/chosen": -217.9957275390625, "rewards/margins": 4.879603385925293, "rewards/rejected": -222.87533569335938, "step": 18090 }, { "epoch": 1.05, "grad_norm": 0.0, "learning_rate": 0.0006526568365648825, "logits/chosen": -9.66442584991455, "logits/rejected": -9.562416076660156, "logps/chosen": -3665.371826171875, "logps/rejected": -3289.214111328125, "loss": 9.5408, "rewards/accuracies": 0.5, "rewards/chosen": -126.91878509521484, "rewards/margins": 3.802445888519287, "rewards/rejected": -130.7212371826172, "step": 18100 }, { "epoch": 1.05, "grad_norm": 5.593033165496308e-07, "learning_rate": 0.0006524633306242502, "logits/chosen": -15.12066650390625, "logits/rejected": -14.829816818237305, "logps/chosen": -2721.202880859375, "logps/rejected": -2207.58935546875, "loss": 8.133, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.20855712890625, "rewards/margins": 3.564960479736328, "rewards/rejected": -167.7735137939453, "step": 18110 }, { "epoch": 1.05, "grad_norm": 49.8522834777832, "learning_rate": 0.0006522698246836179, "logits/chosen": -13.312726020812988, "logits/rejected": -13.645004272460938, "logps/chosen": -2399.109375, "logps/rejected": -2407.430908203125, "loss": 5.1544, "rewards/accuracies": 0.5, "rewards/chosen": -91.56727600097656, "rewards/margins": 10.811846733093262, "rewards/rejected": -102.3791275024414, "step": 18120 }, { "epoch": 1.05, "grad_norm": 1.935260174334985e-09, "learning_rate": 0.0006520763187429855, "logits/chosen": -14.404935836791992, "logits/rejected": -14.400016784667969, "logps/chosen": -2579.782958984375, "logps/rejected": -2568.54541015625, "loss": 4.1486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -74.46351623535156, "rewards/margins": 9.194433212280273, "rewards/rejected": -83.65794372558594, "step": 18130 }, { "epoch": 1.05, "grad_norm": 97.90447235107422, "learning_rate": 0.0006518828128023531, "logits/chosen": -11.712196350097656, "logits/rejected": -11.836206436157227, "logps/chosen": -2751.72509765625, "logps/rejected": -2755.717041015625, "loss": 3.5322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -74.82429504394531, "rewards/margins": 0.8046678304672241, "rewards/rejected": -75.62895202636719, "step": 18140 }, { "epoch": 1.05, "grad_norm": 16.948911666870117, "learning_rate": 0.0006516893068617207, "logits/chosen": -13.425773620605469, "logits/rejected": -13.379196166992188, "logps/chosen": -2491.244873046875, "logps/rejected": -2467.28564453125, "loss": 4.6037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -143.30734252929688, "rewards/margins": 0.8458770513534546, "rewards/rejected": -144.15322875976562, "step": 18150 }, { "epoch": 1.05, "grad_norm": 0.0030910870991647243, "learning_rate": 0.0006514958009210883, "logits/chosen": -11.093270301818848, "logits/rejected": -11.11808967590332, "logps/chosen": -2555.7626953125, "logps/rejected": -2471.859130859375, "loss": 4.5886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.03903198242188, "rewards/margins": 2.3036694526672363, "rewards/rejected": -171.34271240234375, "step": 18160 }, { "epoch": 1.05, "grad_norm": 2.488131030986551e-06, "learning_rate": 0.000651302294980456, "logits/chosen": -10.985884666442871, "logits/rejected": -10.8902006149292, "logps/chosen": -2798.58056640625, "logps/rejected": -2354.966064453125, "loss": 1.9293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.7798614501953, "rewards/margins": 4.129740238189697, "rewards/rejected": -136.90957641601562, "step": 18170 }, { "epoch": 1.05, "grad_norm": 110.12316131591797, "learning_rate": 0.0006511087890398236, "logits/chosen": -13.087597846984863, "logits/rejected": -13.423629760742188, "logps/chosen": -2754.46337890625, "logps/rejected": -2621.26806640625, "loss": 3.7163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.28707885742188, "rewards/margins": 8.414031028747559, "rewards/rejected": -202.70111083984375, "step": 18180 }, { "epoch": 1.05, "grad_norm": 11.934343338012695, "learning_rate": 0.0006509152830991911, "logits/chosen": -13.275197982788086, "logits/rejected": -13.403196334838867, "logps/chosen": -2465.97412109375, "logps/rejected": -2390.68798828125, "loss": 5.7225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.36654663085938, "rewards/margins": 0.17589111626148224, "rewards/rejected": -205.54244995117188, "step": 18190 }, { "epoch": 1.05, "grad_norm": 44.05063247680664, "learning_rate": 0.0006507217771585587, "logits/chosen": -11.828393936157227, "logits/rejected": -11.90717887878418, "logps/chosen": -2831.72119140625, "logps/rejected": -2782.240234375, "loss": 9.7441, "rewards/accuracies": 0.5, "rewards/chosen": -188.10421752929688, "rewards/margins": -6.121962547302246, "rewards/rejected": -181.98223876953125, "step": 18200 }, { "epoch": 1.05, "grad_norm": 0.00036709161940962076, "learning_rate": 0.0006505282712179263, "logits/chosen": -12.121580123901367, "logits/rejected": -12.189722061157227, "logps/chosen": -2910.908447265625, "logps/rejected": -2778.807373046875, "loss": 26.6681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.85791015625, "rewards/margins": -14.101049423217773, "rewards/rejected": -118.75687408447266, "step": 18210 }, { "epoch": 1.05, "grad_norm": 1.1078535067099438e-07, "learning_rate": 0.000650334765277294, "logits/chosen": -12.542414665222168, "logits/rejected": -12.639643669128418, "logps/chosen": -3255.946044921875, "logps/rejected": -2763.612060546875, "loss": 2.6874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.6297607421875, "rewards/margins": 17.55990982055664, "rewards/rejected": -214.18966674804688, "step": 18220 }, { "epoch": 1.06, "grad_norm": 0.012088737450540066, "learning_rate": 0.0006501412593366617, "logits/chosen": -12.920709609985352, "logits/rejected": -12.868507385253906, "logps/chosen": -2984.66064453125, "logps/rejected": -2799.221435546875, "loss": 15.9431, "rewards/accuracies": 0.5, "rewards/chosen": -119.96952819824219, "rewards/margins": -12.50279426574707, "rewards/rejected": -107.46673583984375, "step": 18230 }, { "epoch": 1.06, "grad_norm": 8.048547897487879e-06, "learning_rate": 0.0006499477533960293, "logits/chosen": -15.946377754211426, "logits/rejected": -15.864021301269531, "logps/chosen": -2955.722412109375, "logps/rejected": -2728.6376953125, "loss": 31.5158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.56182861328125, "rewards/margins": -9.619417190551758, "rewards/rejected": -152.94241333007812, "step": 18240 }, { "epoch": 1.06, "grad_norm": 90.1219482421875, "learning_rate": 0.0006497542474553969, "logits/chosen": -19.484867095947266, "logits/rejected": -19.545333862304688, "logps/chosen": -2381.725341796875, "logps/rejected": -2255.910400390625, "loss": 13.8706, "rewards/accuracies": 0.5, "rewards/chosen": -140.5354766845703, "rewards/margins": -1.9426681995391846, "rewards/rejected": -138.59280395507812, "step": 18250 }, { "epoch": 1.06, "grad_norm": 11.078644752502441, "learning_rate": 0.0006495607415147645, "logits/chosen": -21.191478729248047, "logits/rejected": -21.708206176757812, "logps/chosen": -2555.10107421875, "logps/rejected": -2589.739013671875, "loss": 6.0559, "rewards/accuracies": 0.5, "rewards/chosen": -220.2880096435547, "rewards/margins": 15.934168815612793, "rewards/rejected": -236.22219848632812, "step": 18260 }, { "epoch": 1.06, "grad_norm": 0.0019464632496237755, "learning_rate": 0.0006493672355741321, "logits/chosen": -14.1913480758667, "logits/rejected": -14.470649719238281, "logps/chosen": -2839.57373046875, "logps/rejected": -2501.06298828125, "loss": 26.2031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -76.06132507324219, "rewards/margins": -21.349987030029297, "rewards/rejected": -54.711341857910156, "step": 18270 }, { "epoch": 1.06, "grad_norm": 1.1746955408398208e-07, "learning_rate": 0.0006491737296334997, "logits/chosen": -13.802218437194824, "logits/rejected": -13.852333068847656, "logps/chosen": -2730.520263671875, "logps/rejected": -2575.9912109375, "loss": 10.5681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.41239929199219, "rewards/margins": 1.1684051752090454, "rewards/rejected": -111.58082580566406, "step": 18280 }, { "epoch": 1.06, "grad_norm": 63.06666946411133, "learning_rate": 0.0006489802236928674, "logits/chosen": -14.077341079711914, "logits/rejected": -14.034856796264648, "logps/chosen": -2490.669921875, "logps/rejected": -2464.925537109375, "loss": 4.3986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -116.8666763305664, "rewards/margins": 9.18848991394043, "rewards/rejected": -126.05517578125, "step": 18290 }, { "epoch": 1.06, "grad_norm": 59.89677810668945, "learning_rate": 0.000648786717752235, "logits/chosen": -13.705610275268555, "logits/rejected": -14.067889213562012, "logps/chosen": -2913.98681640625, "logps/rejected": -2703.021240234375, "loss": 6.4356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.9784393310547, "rewards/margins": 6.599660396575928, "rewards/rejected": -147.57809448242188, "step": 18300 }, { "epoch": 1.06, "grad_norm": 0.0012730387970805168, "learning_rate": 0.0006485932118116026, "logits/chosen": -12.942420959472656, "logits/rejected": -12.87464427947998, "logps/chosen": -2698.73486328125, "logps/rejected": -2414.763671875, "loss": 7.6117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -107.99678802490234, "rewards/margins": 3.3577027320861816, "rewards/rejected": -111.35450744628906, "step": 18310 }, { "epoch": 1.06, "grad_norm": 115.6467514038086, "learning_rate": 0.0006483997058709703, "logits/chosen": -15.907655715942383, "logits/rejected": -15.855707168579102, "logps/chosen": -2773.44921875, "logps/rejected": -2542.77294921875, "loss": 21.713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.76930236816406, "rewards/margins": -14.200357437133789, "rewards/rejected": -143.56893920898438, "step": 18320 }, { "epoch": 1.06, "grad_norm": 9.073420524597168, "learning_rate": 0.0006482061999303379, "logits/chosen": -17.764970779418945, "logits/rejected": -17.74069595336914, "logps/chosen": -2569.608154296875, "logps/rejected": -2589.33056640625, "loss": 2.5761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -216.0121612548828, "rewards/margins": 15.244134902954102, "rewards/rejected": -231.2562713623047, "step": 18330 }, { "epoch": 1.06, "grad_norm": 0.031765297055244446, "learning_rate": 0.0006480126939897055, "logits/chosen": -14.99738883972168, "logits/rejected": -14.941308975219727, "logps/chosen": -2597.35791015625, "logps/rejected": -2677.241455078125, "loss": 5.108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.87643432617188, "rewards/margins": 5.657392978668213, "rewards/rejected": -167.5338134765625, "step": 18340 }, { "epoch": 1.06, "grad_norm": 0.010680228471755981, "learning_rate": 0.0006478191880490732, "logits/chosen": -16.579036712646484, "logits/rejected": -16.56783676147461, "logps/chosen": -2664.482177734375, "logps/rejected": -2509.15966796875, "loss": 7.4447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.68185424804688, "rewards/margins": 1.2084167003631592, "rewards/rejected": -156.89024353027344, "step": 18350 }, { "epoch": 1.06, "grad_norm": 30.593324661254883, "learning_rate": 0.0006476256821084408, "logits/chosen": -18.12723159790039, "logits/rejected": -18.103565216064453, "logps/chosen": -2681.145263671875, "logps/rejected": -2709.48486328125, "loss": 4.0834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -234.8867950439453, "rewards/margins": 6.491602897644043, "rewards/rejected": -241.37838745117188, "step": 18360 }, { "epoch": 1.06, "grad_norm": 1.6122471196630683e-13, "learning_rate": 0.0006474321761678084, "logits/chosen": -17.73172378540039, "logits/rejected": -18.06329917907715, "logps/chosen": -2754.20556640625, "logps/rejected": -2749.8232421875, "loss": 9.035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.24398803710938, "rewards/margins": 13.687644958496094, "rewards/rejected": -213.93161010742188, "step": 18370 }, { "epoch": 1.06, "grad_norm": 10.267723083496094, "learning_rate": 0.000647238670227176, "logits/chosen": -17.083011627197266, "logits/rejected": -17.074676513671875, "logps/chosen": -2491.38720703125, "logps/rejected": -2711.22802734375, "loss": 6.2867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.68580627441406, "rewards/margins": 3.981966495513916, "rewards/rejected": -174.66778564453125, "step": 18380 }, { "epoch": 1.06, "grad_norm": 2.0267880256052972e-13, "learning_rate": 0.0006470451642865436, "logits/chosen": -15.574933052062988, "logits/rejected": -15.632780075073242, "logps/chosen": -2587.67333984375, "logps/rejected": -2715.318359375, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -214.32632446289062, "rewards/margins": 30.114736557006836, "rewards/rejected": -244.44107055664062, "step": 18390 }, { "epoch": 1.07, "grad_norm": 0.015496517531573772, "learning_rate": 0.0006468516583459113, "logits/chosen": -16.571914672851562, "logits/rejected": -17.10669708251953, "logps/chosen": -2989.75146484375, "logps/rejected": -2377.63671875, "loss": 21.0468, "rewards/accuracies": 0.5, "rewards/chosen": -226.9674530029297, "rewards/margins": -16.65578269958496, "rewards/rejected": -210.31167602539062, "step": 18400 }, { "epoch": 1.07, "grad_norm": 0.5162507891654968, "learning_rate": 0.0006466581524052788, "logits/chosen": -16.868824005126953, "logits/rejected": -16.866168975830078, "logps/chosen": -2732.99365234375, "logps/rejected": -2519.905029296875, "loss": 6.8268, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -188.1628875732422, "rewards/margins": -5.314640045166016, "rewards/rejected": -182.84823608398438, "step": 18410 }, { "epoch": 1.07, "grad_norm": 0.0, "learning_rate": 0.0006464646464646465, "logits/chosen": -14.064114570617676, "logits/rejected": -14.126386642456055, "logps/chosen": -2813.98095703125, "logps/rejected": -2484.885986328125, "loss": 6.6432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.3629913330078, "rewards/margins": 12.448031425476074, "rewards/rejected": -150.81100463867188, "step": 18420 }, { "epoch": 1.07, "grad_norm": 0.0, "learning_rate": 0.0006462711405240141, "logits/chosen": -15.334185600280762, "logits/rejected": -14.840217590332031, "logps/chosen": -2985.962890625, "logps/rejected": -2620.7548828125, "loss": 6.6954, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.41586303710938, "rewards/margins": 12.079202651977539, "rewards/rejected": -216.4950714111328, "step": 18430 }, { "epoch": 1.07, "grad_norm": 489.8996276855469, "learning_rate": 0.0006460776345833817, "logits/chosen": -12.9453763961792, "logits/rejected": -12.999053955078125, "logps/chosen": -2734.45068359375, "logps/rejected": -2751.5283203125, "loss": 6.4207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -128.08795166015625, "rewards/margins": 0.3099174499511719, "rewards/rejected": -128.39785766601562, "step": 18440 }, { "epoch": 1.07, "grad_norm": 8.488736057188362e-06, "learning_rate": 0.0006458841286427493, "logits/chosen": -14.465617179870605, "logits/rejected": -14.861513137817383, "logps/chosen": -2656.832763671875, "logps/rejected": -2708.439697265625, "loss": 40.5077, "rewards/accuracies": 0.5, "rewards/chosen": -220.5743865966797, "rewards/margins": -11.806196212768555, "rewards/rejected": -208.7681884765625, "step": 18450 }, { "epoch": 1.07, "grad_norm": 0.9897474050521851, "learning_rate": 0.000645690622702117, "logits/chosen": -13.678378105163574, "logits/rejected": -13.877462387084961, "logps/chosen": -2720.913818359375, "logps/rejected": -2973.906494140625, "loss": 15.2709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -114.55789947509766, "rewards/margins": 2.5900871753692627, "rewards/rejected": -117.14798736572266, "step": 18460 }, { "epoch": 1.07, "grad_norm": 53.03737258911133, "learning_rate": 0.0006454971167614846, "logits/chosen": -16.345170974731445, "logits/rejected": -16.369159698486328, "logps/chosen": -2707.34716796875, "logps/rejected": -2262.95556640625, "loss": 41.3322, "rewards/accuracies": 0.5, "rewards/chosen": -234.85952758789062, "rewards/margins": -36.04633331298828, "rewards/rejected": -198.81314086914062, "step": 18470 }, { "epoch": 1.07, "grad_norm": 3.390065103303641e-05, "learning_rate": 0.0006453036108208522, "logits/chosen": -14.550959587097168, "logits/rejected": -14.540814399719238, "logps/chosen": -2558.34326171875, "logps/rejected": -2621.507568359375, "loss": 5.4155, "rewards/accuracies": 0.5, "rewards/chosen": -161.03294372558594, "rewards/margins": 3.1486213207244873, "rewards/rejected": -164.1815643310547, "step": 18480 }, { "epoch": 1.07, "grad_norm": 6.256654739379883, "learning_rate": 0.0006451101048802198, "logits/chosen": -15.327921867370605, "logits/rejected": -15.159469604492188, "logps/chosen": -2581.81396484375, "logps/rejected": -2405.64013671875, "loss": 6.0628, "rewards/accuracies": 0.5, "rewards/chosen": -150.5410614013672, "rewards/margins": 3.9623312950134277, "rewards/rejected": -154.50338745117188, "step": 18490 }, { "epoch": 1.07, "grad_norm": 0.015210007317364216, "learning_rate": 0.0006449165989395874, "logits/chosen": -17.18783950805664, "logits/rejected": -17.027297973632812, "logps/chosen": -2735.6826171875, "logps/rejected": -2740.43701171875, "loss": 2.6772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -203.3050079345703, "rewards/margins": 8.989025115966797, "rewards/rejected": -212.2940216064453, "step": 18500 }, { "epoch": 1.07, "grad_norm": 0.003910095430910587, "learning_rate": 0.000644723092998955, "logits/chosen": -14.071126937866211, "logits/rejected": -14.107185363769531, "logps/chosen": -2586.05712890625, "logps/rejected": -2386.455810546875, "loss": 21.129, "rewards/accuracies": 0.5, "rewards/chosen": -151.72073364257812, "rewards/margins": -17.330713272094727, "rewards/rejected": -134.3900146484375, "step": 18510 }, { "epoch": 1.07, "grad_norm": 48.673152923583984, "learning_rate": 0.0006445295870583227, "logits/chosen": -14.354853630065918, "logits/rejected": -14.359591484069824, "logps/chosen": -2813.942626953125, "logps/rejected": -2759.3759765625, "loss": 15.5464, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -144.94818115234375, "rewards/margins": -6.633845329284668, "rewards/rejected": -138.3143310546875, "step": 18520 }, { "epoch": 1.07, "grad_norm": 0.0, "learning_rate": 0.0006443360811176904, "logits/chosen": -13.126893997192383, "logits/rejected": -13.20947265625, "logps/chosen": -2779.88818359375, "logps/rejected": -2306.0244140625, "loss": 3.6827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -100.90190124511719, "rewards/margins": 12.569501876831055, "rewards/rejected": -113.47139739990234, "step": 18530 }, { "epoch": 1.07, "grad_norm": 94.47711181640625, "learning_rate": 0.000644142575177058, "logits/chosen": -15.215921401977539, "logits/rejected": -15.471264839172363, "logps/chosen": -2764.672607421875, "logps/rejected": -2600.560791015625, "loss": 21.0462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.83578491210938, "rewards/margins": -12.872329711914062, "rewards/rejected": -167.96347045898438, "step": 18540 }, { "epoch": 1.07, "grad_norm": 58.31540298461914, "learning_rate": 0.0006439490692364256, "logits/chosen": -16.171031951904297, "logits/rejected": -16.2322998046875, "logps/chosen": -2438.81787109375, "logps/rejected": -2350.939697265625, "loss": 5.8847, "rewards/accuracies": 0.5, "rewards/chosen": -182.5393524169922, "rewards/margins": -0.04955101013183594, "rewards/rejected": -182.48980712890625, "step": 18550 }, { "epoch": 1.07, "grad_norm": 6.520372020091258e-10, "learning_rate": 0.0006437555632957932, "logits/chosen": -13.236478805541992, "logits/rejected": -13.41456413269043, "logps/chosen": -2787.65869140625, "logps/rejected": -2471.39208984375, "loss": 4.2021, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.49668884277344, "rewards/margins": 10.128719329833984, "rewards/rejected": -129.6254119873047, "step": 18560 }, { "epoch": 1.07, "grad_norm": 64.639892578125, "learning_rate": 0.0006435620573551609, "logits/chosen": -15.481752395629883, "logits/rejected": -15.459635734558105, "logps/chosen": -2606.776123046875, "logps/rejected": -2511.28076171875, "loss": 16.5094, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -158.55747985839844, "rewards/margins": -6.494751930236816, "rewards/rejected": -152.06271362304688, "step": 18570 }, { "epoch": 1.08, "grad_norm": 0.0011564791202545166, "learning_rate": 0.0006433685514145285, "logits/chosen": -15.735422134399414, "logits/rejected": -16.29483985900879, "logps/chosen": -2469.67431640625, "logps/rejected": -2735.911865234375, "loss": 1.8292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.3331298828125, "rewards/margins": 28.41147804260254, "rewards/rejected": -212.74459838867188, "step": 18580 }, { "epoch": 1.08, "grad_norm": 46.59956741333008, "learning_rate": 0.0006431750454738961, "logits/chosen": -12.322563171386719, "logits/rejected": -12.516687393188477, "logps/chosen": -2657.18115234375, "logps/rejected": -2429.076904296875, "loss": 29.1942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -99.33233642578125, "rewards/margins": -16.800006866455078, "rewards/rejected": -82.53231811523438, "step": 18590 }, { "epoch": 1.08, "grad_norm": 0.00010116126213688403, "learning_rate": 0.0006429815395332637, "logits/chosen": -11.049848556518555, "logits/rejected": -11.189860343933105, "logps/chosen": -2631.807861328125, "logps/rejected": -2546.81494140625, "loss": 6.7714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -87.0969009399414, "rewards/margins": -2.430284023284912, "rewards/rejected": -84.66661071777344, "step": 18600 }, { "epoch": 1.08, "grad_norm": 1.6425647666562342e-14, "learning_rate": 0.0006427880335926313, "logits/chosen": -13.178567886352539, "logits/rejected": -13.328947067260742, "logps/chosen": -2437.9677734375, "logps/rejected": -2344.59033203125, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": -186.88922119140625, "rewards/margins": 13.88330364227295, "rewards/rejected": -200.77255249023438, "step": 18610 }, { "epoch": 1.08, "grad_norm": 0.010728963650763035, "learning_rate": 0.0006425945276519988, "logits/chosen": -11.857630729675293, "logits/rejected": -12.19336986541748, "logps/chosen": -2530.31298828125, "logps/rejected": -2584.834716796875, "loss": 8.2841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.8941650390625, "rewards/margins": 0.14087677001953125, "rewards/rejected": -110.03504943847656, "step": 18620 }, { "epoch": 1.08, "grad_norm": 1.4304753541946411, "learning_rate": 0.0006424010217113666, "logits/chosen": -14.107526779174805, "logits/rejected": -14.481622695922852, "logps/chosen": -2631.67431640625, "logps/rejected": -2343.370849609375, "loss": 10.9704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.00204467773438, "rewards/margins": -1.3627326488494873, "rewards/rejected": -199.6393280029297, "step": 18630 }, { "epoch": 1.08, "grad_norm": 140.9226531982422, "learning_rate": 0.0006422075157707342, "logits/chosen": -13.261438369750977, "logits/rejected": -13.404703140258789, "logps/chosen": -2848.9091796875, "logps/rejected": -2521.93115234375, "loss": 37.9997, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -135.58926391601562, "rewards/margins": -30.288049697875977, "rewards/rejected": -105.30122375488281, "step": 18640 }, { "epoch": 1.08, "grad_norm": 0.0010857509914785624, "learning_rate": 0.0006420140098301018, "logits/chosen": -16.233978271484375, "logits/rejected": -16.63918685913086, "logps/chosen": -2556.46142578125, "logps/rejected": -2590.917236328125, "loss": 14.5068, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.91500854492188, "rewards/margins": -9.242159843444824, "rewards/rejected": -161.67286682128906, "step": 18650 }, { "epoch": 1.08, "grad_norm": 0.0006157975294627249, "learning_rate": 0.0006418205038894694, "logits/chosen": -17.104524612426758, "logits/rejected": -18.10073471069336, "logps/chosen": -2544.732666015625, "logps/rejected": -2438.190185546875, "loss": 16.5451, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -145.85467529296875, "rewards/margins": -6.743910312652588, "rewards/rejected": -139.11077880859375, "step": 18660 }, { "epoch": 1.08, "grad_norm": 266.5450134277344, "learning_rate": 0.000641626997948837, "logits/chosen": -12.482576370239258, "logits/rejected": -12.531208038330078, "logps/chosen": -2534.276123046875, "logps/rejected": -2483.88818359375, "loss": 11.505, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -102.26289367675781, "rewards/margins": -2.066711902618408, "rewards/rejected": -100.1961669921875, "step": 18670 }, { "epoch": 1.08, "grad_norm": 286.1163024902344, "learning_rate": 0.0006414334920082046, "logits/chosen": -12.83008861541748, "logits/rejected": -13.228860855102539, "logps/chosen": -2670.96728515625, "logps/rejected": -2283.58935546875, "loss": 9.5993, "rewards/accuracies": 0.5, "rewards/chosen": -74.38529968261719, "rewards/margins": 6.408027648925781, "rewards/rejected": -80.79332733154297, "step": 18680 }, { "epoch": 1.08, "grad_norm": 1.2494499358750577e-11, "learning_rate": 0.0006412399860675723, "logits/chosen": -13.391357421875, "logits/rejected": -13.376322746276855, "logps/chosen": -2622.1064453125, "logps/rejected": -2531.30810546875, "loss": 2.581, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -148.55484008789062, "rewards/margins": 15.869850158691406, "rewards/rejected": -164.42466735839844, "step": 18690 }, { "epoch": 1.08, "grad_norm": 28.59694480895996, "learning_rate": 0.0006410464801269399, "logits/chosen": -13.791908264160156, "logits/rejected": -13.781512260437012, "logps/chosen": -2311.4482421875, "logps/rejected": -1838.058349609375, "loss": 6.1962, "rewards/accuracies": 0.5, "rewards/chosen": -133.84320068359375, "rewards/margins": 4.039305210113525, "rewards/rejected": -137.8824920654297, "step": 18700 }, { "epoch": 1.08, "grad_norm": 78.86100006103516, "learning_rate": 0.0006408529741863075, "logits/chosen": -11.758400917053223, "logits/rejected": -11.708978652954102, "logps/chosen": -2931.28857421875, "logps/rejected": -3219.963623046875, "loss": 7.2331, "rewards/accuracies": 0.5, "rewards/chosen": -117.36280822753906, "rewards/margins": 0.4836254119873047, "rewards/rejected": -117.84642028808594, "step": 18710 }, { "epoch": 1.08, "grad_norm": 5.5575379519723356e-05, "learning_rate": 0.0006406594682456751, "logits/chosen": -18.590206146240234, "logits/rejected": -18.60792350769043, "logps/chosen": -2426.788818359375, "logps/rejected": -2474.84423828125, "loss": 1.1893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.4740753173828, "rewards/margins": 9.244760513305664, "rewards/rejected": -218.7188262939453, "step": 18720 }, { "epoch": 1.08, "grad_norm": 5.21488520859755e-19, "learning_rate": 0.0006404659623050427, "logits/chosen": -16.339996337890625, "logits/rejected": -17.22274398803711, "logps/chosen": -2911.31396484375, "logps/rejected": -2585.45556640625, "loss": 7.0461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -218.56161499023438, "rewards/margins": 10.060446739196777, "rewards/rejected": -228.6220703125, "step": 18730 }, { "epoch": 1.08, "grad_norm": 3.894953614193633e-14, "learning_rate": 0.0006402724563644105, "logits/chosen": -16.373870849609375, "logits/rejected": -16.539104461669922, "logps/chosen": -2409.0927734375, "logps/rejected": -2230.962158203125, "loss": 4.924, "rewards/accuracies": 0.5, "rewards/chosen": -134.57736206054688, "rewards/margins": 11.708487510681152, "rewards/rejected": -146.28585815429688, "step": 18740 }, { "epoch": 1.09, "grad_norm": 11.50810718536377, "learning_rate": 0.0006400789504237781, "logits/chosen": -15.475316047668457, "logits/rejected": -15.602508544921875, "logps/chosen": -2421.437255859375, "logps/rejected": -2350.72509765625, "loss": 13.8446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.25357055664062, "rewards/margins": -1.7098610401153564, "rewards/rejected": -172.543701171875, "step": 18750 }, { "epoch": 1.09, "grad_norm": 40.4423713684082, "learning_rate": 0.0006398854444831457, "logits/chosen": -14.482098579406738, "logits/rejected": -14.54973316192627, "logps/chosen": -2447.87109375, "logps/rejected": -2332.670166015625, "loss": 11.5972, "rewards/accuracies": 0.5, "rewards/chosen": -170.46456909179688, "rewards/margins": -6.604469299316406, "rewards/rejected": -163.86009216308594, "step": 18760 }, { "epoch": 1.09, "grad_norm": 56.909610748291016, "learning_rate": 0.0006396919385425133, "logits/chosen": -16.093204498291016, "logits/rejected": -15.873682975769043, "logps/chosen": -2671.48779296875, "logps/rejected": -2811.310791015625, "loss": 4.6581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.943115234375, "rewards/margins": 1.5637578964233398, "rewards/rejected": -109.50687408447266, "step": 18770 }, { "epoch": 1.09, "grad_norm": 14.179445266723633, "learning_rate": 0.0006394984326018809, "logits/chosen": -16.984975814819336, "logits/rejected": -18.18759536743164, "logps/chosen": -2799.709716796875, "logps/rejected": -2208.045166015625, "loss": 10.1632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.4470977783203, "rewards/margins": 17.554218292236328, "rewards/rejected": -180.00131225585938, "step": 18780 }, { "epoch": 1.09, "grad_norm": 0.0, "learning_rate": 0.0006393049266612485, "logits/chosen": -11.93360710144043, "logits/rejected": -11.984992027282715, "logps/chosen": -3178.48291015625, "logps/rejected": -2784.57080078125, "loss": 5.7452, "rewards/accuracies": 0.5, "rewards/chosen": -84.55751037597656, "rewards/margins": 11.742630004882812, "rewards/rejected": -96.30013275146484, "step": 18790 }, { "epoch": 1.09, "grad_norm": 23.72662353515625, "learning_rate": 0.0006391114207206162, "logits/chosen": -14.825119018554688, "logits/rejected": -14.799077987670898, "logps/chosen": -2847.20263671875, "logps/rejected": -2494.70068359375, "loss": 16.3985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.40655517578125, "rewards/margins": -0.4384002685546875, "rewards/rejected": -172.96817016601562, "step": 18800 }, { "epoch": 1.09, "grad_norm": 0.0, "learning_rate": 0.0006389179147799838, "logits/chosen": -16.783916473388672, "logits/rejected": -17.865957260131836, "logps/chosen": -2578.155029296875, "logps/rejected": -2691.337890625, "loss": 2.1564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.87631225585938, "rewards/margins": 12.654867172241211, "rewards/rejected": -201.5311737060547, "step": 18810 }, { "epoch": 1.09, "grad_norm": 34.71656799316406, "learning_rate": 0.0006387244088393514, "logits/chosen": -16.91122055053711, "logits/rejected": -17.25710678100586, "logps/chosen": -2603.72021484375, "logps/rejected": -2514.8115234375, "loss": 8.9631, "rewards/accuracies": 0.5, "rewards/chosen": -136.82723999023438, "rewards/margins": -2.0385398864746094, "rewards/rejected": -134.7886962890625, "step": 18820 }, { "epoch": 1.09, "grad_norm": 19.649837493896484, "learning_rate": 0.000638530902898719, "logits/chosen": -13.528434753417969, "logits/rejected": -13.252342224121094, "logps/chosen": -2617.801513671875, "logps/rejected": -2375.80810546875, "loss": 10.8879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.02696228027344, "rewards/margins": 2.9109864234924316, "rewards/rejected": -155.9379425048828, "step": 18830 }, { "epoch": 1.09, "grad_norm": 1.4542387053247164e-19, "learning_rate": 0.0006383373969580867, "logits/chosen": -15.140657424926758, "logits/rejected": -15.154635429382324, "logps/chosen": -1910.202880859375, "logps/rejected": -1901.6802978515625, "loss": 13.2211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.80665588378906, "rewards/margins": 9.380895614624023, "rewards/rejected": -164.1875457763672, "step": 18840 }, { "epoch": 1.09, "grad_norm": 66.05923461914062, "learning_rate": 0.0006381438910174542, "logits/chosen": -10.824610710144043, "logits/rejected": -10.97273063659668, "logps/chosen": -2405.11572265625, "logps/rejected": -2439.01318359375, "loss": 7.7618, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -108.53975677490234, "rewards/margins": -4.8840131759643555, "rewards/rejected": -103.6557388305664, "step": 18850 }, { "epoch": 1.09, "grad_norm": 0.012790413573384285, "learning_rate": 0.0006379503850768219, "logits/chosen": -13.043855667114258, "logits/rejected": -12.960538864135742, "logps/chosen": -2682.28076171875, "logps/rejected": -2702.103515625, "loss": 10.6591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.50253295898438, "rewards/margins": -2.9002654552459717, "rewards/rejected": -166.60227966308594, "step": 18860 }, { "epoch": 1.09, "grad_norm": 1.3828233263318834e-12, "learning_rate": 0.0006377568791361895, "logits/chosen": -14.880714416503906, "logits/rejected": -15.12080192565918, "logps/chosen": -2542.48486328125, "logps/rejected": -2566.07958984375, "loss": 1.5208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -183.84591674804688, "rewards/margins": 11.118715286254883, "rewards/rejected": -194.96461486816406, "step": 18870 }, { "epoch": 1.09, "grad_norm": 4.960706064593978e-06, "learning_rate": 0.0006375633731955571, "logits/chosen": -11.030685424804688, "logits/rejected": -11.132460594177246, "logps/chosen": -2557.551513671875, "logps/rejected": -2200.0068359375, "loss": 15.6527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.73094177246094, "rewards/margins": -2.2571094036102295, "rewards/rejected": -108.47383880615234, "step": 18880 }, { "epoch": 1.09, "grad_norm": 2.2820476885954122e-07, "learning_rate": 0.0006373698672549247, "logits/chosen": -13.183090209960938, "logits/rejected": -13.139045715332031, "logps/chosen": -2529.93994140625, "logps/rejected": -2149.87255859375, "loss": 3.0795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -103.02647399902344, "rewards/margins": 12.886552810668945, "rewards/rejected": -115.91304016113281, "step": 18890 }, { "epoch": 1.09, "grad_norm": 1.441044569015503, "learning_rate": 0.0006371763613142923, "logits/chosen": -14.546331405639648, "logits/rejected": -14.778053283691406, "logps/chosen": -2345.91259765625, "logps/rejected": -2166.329345703125, "loss": 18.844, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.43905639648438, "rewards/margins": -15.836921691894531, "rewards/rejected": -121.60212707519531, "step": 18900 }, { "epoch": 1.09, "grad_norm": 3.3843672309075146e-10, "learning_rate": 0.0006369828553736599, "logits/chosen": -13.638799667358398, "logits/rejected": -13.656659126281738, "logps/chosen": -2833.658935546875, "logps/rejected": -2214.235107421875, "loss": 12.3274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.5411376953125, "rewards/margins": 8.746155738830566, "rewards/rejected": -147.28729248046875, "step": 18910 }, { "epoch": 1.1, "grad_norm": 104.84061431884766, "learning_rate": 0.0006367893494330276, "logits/chosen": -16.191059112548828, "logits/rejected": -17.133018493652344, "logps/chosen": -2933.07421875, "logps/rejected": -2684.655029296875, "loss": 11.0495, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -188.97122192382812, "rewards/margins": -9.937211990356445, "rewards/rejected": -179.03402709960938, "step": 18920 }, { "epoch": 1.1, "grad_norm": 19.573427200317383, "learning_rate": 0.0006365958434923952, "logits/chosen": -16.29667854309082, "logits/rejected": -16.56948471069336, "logps/chosen": -2521.08642578125, "logps/rejected": -2310.067138671875, "loss": 13.4835, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -148.0331268310547, "rewards/margins": -8.470131874084473, "rewards/rejected": -139.56301879882812, "step": 18930 }, { "epoch": 1.1, "grad_norm": 176.9124755859375, "learning_rate": 0.0006364023375517628, "logits/chosen": -19.37432289123535, "logits/rejected": -20.708492279052734, "logps/chosen": -2656.14501953125, "logps/rejected": -2755.55078125, "loss": 2.9417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.67758178710938, "rewards/margins": 15.805374145507812, "rewards/rejected": -176.48294067382812, "step": 18940 }, { "epoch": 1.1, "grad_norm": 0.008695689029991627, "learning_rate": 0.0006362088316111305, "logits/chosen": -12.512141227722168, "logits/rejected": -13.047345161437988, "logps/chosen": -2337.86669921875, "logps/rejected": -2640.65869140625, "loss": 12.7303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -69.88206481933594, "rewards/margins": -3.09578275680542, "rewards/rejected": -66.7862777709961, "step": 18950 }, { "epoch": 1.1, "grad_norm": 20.926698684692383, "learning_rate": 0.0006360153256704981, "logits/chosen": -14.180719375610352, "logits/rejected": -14.132040023803711, "logps/chosen": -2664.052734375, "logps/rejected": -2369.3134765625, "loss": 5.5756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.77558898925781, "rewards/margins": 13.426844596862793, "rewards/rejected": -107.20243835449219, "step": 18960 }, { "epoch": 1.1, "grad_norm": 0.0066204858012497425, "learning_rate": 0.0006358218197298658, "logits/chosen": -16.5673885345459, "logits/rejected": -16.94641876220703, "logps/chosen": -2228.561767578125, "logps/rejected": -2238.927734375, "loss": 6.2651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.86276245117188, "rewards/margins": 6.485212802886963, "rewards/rejected": -202.3479766845703, "step": 18970 }, { "epoch": 1.1, "grad_norm": 7.62041711807251, "learning_rate": 0.0006356283137892334, "logits/chosen": -14.957595825195312, "logits/rejected": -14.708475112915039, "logps/chosen": -2316.362548828125, "logps/rejected": -2029.2021484375, "loss": 5.2129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.20339965820312, "rewards/margins": 15.30212116241455, "rewards/rejected": -146.5055389404297, "step": 18980 }, { "epoch": 1.1, "grad_norm": 0.0008885202114470303, "learning_rate": 0.000635434807848601, "logits/chosen": -12.575508117675781, "logits/rejected": -12.481321334838867, "logps/chosen": -2518.669677734375, "logps/rejected": -2620.14501953125, "loss": 4.2517, "rewards/accuracies": 0.5, "rewards/chosen": -84.6268310546875, "rewards/margins": -0.7556654214859009, "rewards/rejected": -83.87117004394531, "step": 18990 }, { "epoch": 1.1, "grad_norm": 2.655186414718628, "learning_rate": 0.0006352413019079686, "logits/chosen": -13.232877731323242, "logits/rejected": -13.271661758422852, "logps/chosen": -2887.74658203125, "logps/rejected": -2627.570556640625, "loss": 3.3014, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -116.96942138671875, "rewards/margins": -2.4943442344665527, "rewards/rejected": -114.4750747680664, "step": 19000 }, { "epoch": 1.1, "grad_norm": 2.509296734842792e-07, "learning_rate": 0.0006350477959673362, "logits/chosen": -16.791608810424805, "logits/rejected": -16.665348052978516, "logps/chosen": -2328.99365234375, "logps/rejected": -1916.3359375, "loss": 29.6027, "rewards/accuracies": 0.5, "rewards/chosen": -159.98448181152344, "rewards/margins": -17.786495208740234, "rewards/rejected": -142.19798278808594, "step": 19010 }, { "epoch": 1.1, "grad_norm": 1.1942981048562729e-09, "learning_rate": 0.0006348542900267038, "logits/chosen": -15.362385749816895, "logits/rejected": -15.344103813171387, "logps/chosen": -2756.014892578125, "logps/rejected": -2181.813232421875, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": -105.69993591308594, "rewards/margins": 30.159961700439453, "rewards/rejected": -135.85989379882812, "step": 19020 }, { "epoch": 1.1, "grad_norm": 25.226364135742188, "learning_rate": 0.0006346607840860715, "logits/chosen": -17.681018829345703, "logits/rejected": -18.585514068603516, "logps/chosen": -2684.51220703125, "logps/rejected": -2426.870361328125, "loss": 1.6524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -155.68099975585938, "rewards/margins": 15.476966857910156, "rewards/rejected": -171.15797424316406, "step": 19030 }, { "epoch": 1.1, "grad_norm": 0.6884913444519043, "learning_rate": 0.0006344672781454391, "logits/chosen": -20.76845359802246, "logits/rejected": -20.93770980834961, "logps/chosen": -2372.113525390625, "logps/rejected": -2402.6396484375, "loss": 2.3456, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -177.16725158691406, "rewards/margins": 5.0173845291137695, "rewards/rejected": -182.18463134765625, "step": 19040 }, { "epoch": 1.1, "grad_norm": 2.1698936009458493e-17, "learning_rate": 0.0006342737722048068, "logits/chosen": -16.923538208007812, "logits/rejected": -17.3807315826416, "logps/chosen": -2530.644287109375, "logps/rejected": -2310.533203125, "loss": 27.1576, "rewards/accuracies": 0.5, "rewards/chosen": -142.75137329101562, "rewards/margins": -18.25326919555664, "rewards/rejected": -124.49809265136719, "step": 19050 }, { "epoch": 1.1, "grad_norm": 2.8539221333390152e-11, "learning_rate": 0.0006340802662641744, "logits/chosen": -14.693998336791992, "logits/rejected": -15.006998062133789, "logps/chosen": -2729.03564453125, "logps/rejected": -2318.7353515625, "loss": 34.2618, "rewards/accuracies": 0.5, "rewards/chosen": -140.1939697265625, "rewards/margins": -29.474590301513672, "rewards/rejected": -110.71937561035156, "step": 19060 }, { "epoch": 1.1, "grad_norm": 145.40493774414062, "learning_rate": 0.0006338867603235419, "logits/chosen": -12.548174858093262, "logits/rejected": -12.605488777160645, "logps/chosen": -3044.05322265625, "logps/rejected": -2702.05126953125, "loss": 13.2031, "rewards/accuracies": 0.5, "rewards/chosen": -148.3280487060547, "rewards/margins": -5.030269145965576, "rewards/rejected": -143.2977752685547, "step": 19070 }, { "epoch": 1.1, "grad_norm": 82.36003875732422, "learning_rate": 0.0006336932543829095, "logits/chosen": -12.337336540222168, "logits/rejected": -12.217978477478027, "logps/chosen": -3013.01708984375, "logps/rejected": -2517.33642578125, "loss": 8.1885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -150.9998016357422, "rewards/margins": -3.395965576171875, "rewards/rejected": -147.6038360595703, "step": 19080 }, { "epoch": 1.11, "grad_norm": 1.9141250504617346e-06, "learning_rate": 0.0006334997484422772, "logits/chosen": -13.829282760620117, "logits/rejected": -13.4330472946167, "logps/chosen": -2882.770263671875, "logps/rejected": -2874.483154296875, "loss": 15.5389, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.11630249023438, "rewards/margins": -11.602126121520996, "rewards/rejected": -170.51416015625, "step": 19090 }, { "epoch": 1.11, "grad_norm": 102.61868286132812, "learning_rate": 0.0006333062425016448, "logits/chosen": -14.566232681274414, "logits/rejected": -14.512929916381836, "logps/chosen": -2976.300537109375, "logps/rejected": -2604.41650390625, "loss": 21.8764, "rewards/accuracies": 0.5, "rewards/chosen": -182.89794921875, "rewards/margins": -15.435750961303711, "rewards/rejected": -167.46218872070312, "step": 19100 }, { "epoch": 1.11, "grad_norm": 0.28902631998062134, "learning_rate": 0.0006331127365610124, "logits/chosen": -15.116874694824219, "logits/rejected": -15.038782119750977, "logps/chosen": -3121.625244140625, "logps/rejected": -2954.15478515625, "loss": 10.0624, "rewards/accuracies": 0.5, "rewards/chosen": -166.3004150390625, "rewards/margins": -5.1813812255859375, "rewards/rejected": -161.11903381347656, "step": 19110 }, { "epoch": 1.11, "grad_norm": 0.0005553787341341376, "learning_rate": 0.00063291923062038, "logits/chosen": -15.759027481079102, "logits/rejected": -16.879701614379883, "logps/chosen": -2926.26171875, "logps/rejected": -2632.264892578125, "loss": 7.7482, "rewards/accuracies": 0.5, "rewards/chosen": -143.973388671875, "rewards/margins": 3.387188673019409, "rewards/rejected": -147.36058044433594, "step": 19120 }, { "epoch": 1.11, "grad_norm": 237.58224487304688, "learning_rate": 0.0006327257246797476, "logits/chosen": -15.494132995605469, "logits/rejected": -16.305965423583984, "logps/chosen": -2802.964599609375, "logps/rejected": -2288.951171875, "loss": 9.6027, "rewards/accuracies": 0.5, "rewards/chosen": -127.44439697265625, "rewards/margins": 26.628192901611328, "rewards/rejected": -154.07260131835938, "step": 19130 }, { "epoch": 1.11, "grad_norm": 2.8500376429534124e-13, "learning_rate": 0.0006325322187391152, "logits/chosen": -13.532464981079102, "logits/rejected": -13.553136825561523, "logps/chosen": -2687.1806640625, "logps/rejected": -2598.836181640625, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": -121.70982360839844, "rewards/margins": 16.107568740844727, "rewards/rejected": -137.81739807128906, "step": 19140 }, { "epoch": 1.11, "grad_norm": 1.7821313475518137e-17, "learning_rate": 0.0006323387127984829, "logits/chosen": -17.28138542175293, "logits/rejected": -17.552915573120117, "logps/chosen": -2754.655517578125, "logps/rejected": -2669.62646484375, "loss": 13.8685, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -211.02896118164062, "rewards/margins": -6.79870080947876, "rewards/rejected": -204.23025512695312, "step": 19150 }, { "epoch": 1.11, "grad_norm": 3.97255784108483e-15, "learning_rate": 0.0006321452068578506, "logits/chosen": -15.81383991241455, "logits/rejected": -16.23069190979004, "logps/chosen": -2530.734619140625, "logps/rejected": -2595.52099609375, "loss": 3.9397, "rewards/accuracies": 0.5, "rewards/chosen": -164.46694946289062, "rewards/margins": 10.005033493041992, "rewards/rejected": -174.47198486328125, "step": 19160 }, { "epoch": 1.11, "grad_norm": 0.0015546600334346294, "learning_rate": 0.0006319517009172182, "logits/chosen": -13.89789867401123, "logits/rejected": -14.119112014770508, "logps/chosen": -2763.287353515625, "logps/rejected": -2921.88720703125, "loss": 8.0281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -117.2865219116211, "rewards/margins": 4.696813106536865, "rewards/rejected": -121.98332214355469, "step": 19170 }, { "epoch": 1.11, "grad_norm": 2.011852628935884e-10, "learning_rate": 0.0006317581949765858, "logits/chosen": -14.500981330871582, "logits/rejected": -14.669360160827637, "logps/chosen": -2876.4169921875, "logps/rejected": -2884.684814453125, "loss": 3.2884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.16122436523438, "rewards/margins": 5.446791648864746, "rewards/rejected": -186.60800170898438, "step": 19180 }, { "epoch": 1.11, "grad_norm": 38.62531280517578, "learning_rate": 0.0006315646890359534, "logits/chosen": -15.306180000305176, "logits/rejected": -15.856396675109863, "logps/chosen": -2807.61767578125, "logps/rejected": -2718.30517578125, "loss": 6.8488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -243.0048065185547, "rewards/margins": 1.3927596807479858, "rewards/rejected": -244.3975372314453, "step": 19190 }, { "epoch": 1.11, "grad_norm": 0.0, "learning_rate": 0.0006313711830953211, "logits/chosen": -12.200918197631836, "logits/rejected": -12.269386291503906, "logps/chosen": -2755.948486328125, "logps/rejected": -2520.93994140625, "loss": 1.9283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.306182861328125, "rewards/margins": 14.709749221801758, "rewards/rejected": -69.01593017578125, "step": 19200 }, { "epoch": 1.11, "grad_norm": 0.0027858465909957886, "learning_rate": 0.0006311776771546887, "logits/chosen": -12.598379135131836, "logits/rejected": -12.724326133728027, "logps/chosen": -2707.037841796875, "logps/rejected": -2678.20166015625, "loss": 9.6015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -63.9962158203125, "rewards/margins": -0.33704453706741333, "rewards/rejected": -63.6591682434082, "step": 19210 }, { "epoch": 1.11, "grad_norm": 15.867057800292969, "learning_rate": 0.0006309841712140563, "logits/chosen": -12.675150871276855, "logits/rejected": -13.178794860839844, "logps/chosen": -2636.25341796875, "logps/rejected": -2590.59521484375, "loss": 29.6166, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -37.46825408935547, "rewards/margins": -22.346994400024414, "rewards/rejected": -15.121261596679688, "step": 19220 }, { "epoch": 1.11, "grad_norm": 0.04478229209780693, "learning_rate": 0.0006307906652734239, "logits/chosen": -13.52739429473877, "logits/rejected": -13.956077575683594, "logps/chosen": -3040.41064453125, "logps/rejected": -2687.823486328125, "loss": 18.8635, "rewards/accuracies": 0.5, "rewards/chosen": -151.68768310546875, "rewards/margins": -9.370272636413574, "rewards/rejected": -142.31741333007812, "step": 19230 }, { "epoch": 1.11, "grad_norm": 8.673196792602539, "learning_rate": 0.0006305971593327915, "logits/chosen": -15.736318588256836, "logits/rejected": -16.36471176147461, "logps/chosen": -3040.1015625, "logps/rejected": -2916.971435546875, "loss": 1.5403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.43557739257812, "rewards/margins": 5.164031982421875, "rewards/rejected": -195.59959411621094, "step": 19240 }, { "epoch": 1.11, "grad_norm": 0.8512486815452576, "learning_rate": 0.0006304036533921591, "logits/chosen": -14.03247356414795, "logits/rejected": -14.260393142700195, "logps/chosen": -2586.707763671875, "logps/rejected": -2593.3564453125, "loss": 2.4399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.58737182617188, "rewards/margins": 14.792716979980469, "rewards/rejected": -160.38009643554688, "step": 19250 }, { "epoch": 1.11, "grad_norm": 42.80944061279297, "learning_rate": 0.0006302101474515269, "logits/chosen": -13.180234909057617, "logits/rejected": -13.29516315460205, "logps/chosen": -2786.33447265625, "logps/rejected": -2111.410888671875, "loss": 13.3744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -137.254638671875, "rewards/margins": 3.794295072555542, "rewards/rejected": -141.04893493652344, "step": 19260 }, { "epoch": 1.12, "grad_norm": 3.267291307449341, "learning_rate": 0.0006300166415108945, "logits/chosen": -12.805707931518555, "logits/rejected": -13.155412673950195, "logps/chosen": -2550.201904296875, "logps/rejected": -2494.468505859375, "loss": 3.5412, "rewards/accuracies": 0.5, "rewards/chosen": -135.66549682617188, "rewards/margins": 0.7081781625747681, "rewards/rejected": -136.37368774414062, "step": 19270 }, { "epoch": 1.12, "grad_norm": 83.01951599121094, "learning_rate": 0.000629823135570262, "logits/chosen": -11.276800155639648, "logits/rejected": -11.47711181640625, "logps/chosen": -2805.231201171875, "logps/rejected": -2676.158203125, "loss": 1.9189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -152.53538513183594, "rewards/margins": 6.488310813903809, "rewards/rejected": -159.02369689941406, "step": 19280 }, { "epoch": 1.12, "grad_norm": 0.06542842835187912, "learning_rate": 0.0006296296296296296, "logits/chosen": -12.742895126342773, "logits/rejected": -13.146052360534668, "logps/chosen": -2861.725830078125, "logps/rejected": -2767.17724609375, "loss": 2.679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.3155059814453, "rewards/margins": 6.306870937347412, "rewards/rejected": -142.62234497070312, "step": 19290 }, { "epoch": 1.12, "grad_norm": 57.3027229309082, "learning_rate": 0.0006294361236889972, "logits/chosen": -17.753347396850586, "logits/rejected": -18.353832244873047, "logps/chosen": -2174.50830078125, "logps/rejected": -2224.87158203125, "loss": 7.5806, "rewards/accuracies": 0.5, "rewards/chosen": -171.9303436279297, "rewards/margins": -0.6250280141830444, "rewards/rejected": -171.30532836914062, "step": 19300 }, { "epoch": 1.12, "grad_norm": 45.88935852050781, "learning_rate": 0.0006292426177483648, "logits/chosen": -14.34782600402832, "logits/rejected": -14.458340644836426, "logps/chosen": -2840.78271484375, "logps/rejected": -2624.683349609375, "loss": 17.5503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -210.2712860107422, "rewards/margins": -7.758893013000488, "rewards/rejected": -202.5124053955078, "step": 19310 }, { "epoch": 1.12, "grad_norm": 3.3198866844177246, "learning_rate": 0.0006290491118077325, "logits/chosen": -14.563748359680176, "logits/rejected": -14.705549240112305, "logps/chosen": -2898.9345703125, "logps/rejected": -2733.474609375, "loss": 7.3183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.78660583496094, "rewards/margins": 4.049895286560059, "rewards/rejected": -137.83651733398438, "step": 19320 }, { "epoch": 1.12, "grad_norm": 5.474048137664795, "learning_rate": 0.0006288556058671001, "logits/chosen": -14.140698432922363, "logits/rejected": -14.43018627166748, "logps/chosen": -2745.296875, "logps/rejected": -2510.781982421875, "loss": 15.3251, "rewards/accuracies": 0.5, "rewards/chosen": -157.630126953125, "rewards/margins": -10.395654678344727, "rewards/rejected": -147.23448181152344, "step": 19330 }, { "epoch": 1.12, "grad_norm": 2.1182146072387695, "learning_rate": 0.0006286620999264677, "logits/chosen": -15.082473754882812, "logits/rejected": -15.304651260375977, "logps/chosen": -2777.77490234375, "logps/rejected": -2760.09765625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -90.6736068725586, "rewards/margins": 17.98607063293457, "rewards/rejected": -108.65968322753906, "step": 19340 }, { "epoch": 1.12, "grad_norm": 94.4244384765625, "learning_rate": 0.0006284685939858353, "logits/chosen": -10.622354507446289, "logits/rejected": -10.591663360595703, "logps/chosen": -2482.04150390625, "logps/rejected": -2432.91064453125, "loss": 6.6018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -46.363548278808594, "rewards/margins": 3.5788357257843018, "rewards/rejected": -49.9423828125, "step": 19350 }, { "epoch": 1.12, "grad_norm": 4.8537538532400504e-05, "learning_rate": 0.0006282750880452029, "logits/chosen": -13.315977096557617, "logits/rejected": -13.284589767456055, "logps/chosen": -2622.13330078125, "logps/rejected": -2414.587158203125, "loss": 25.0128, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.95379638671875, "rewards/margins": -17.828262329101562, "rewards/rejected": -120.12553405761719, "step": 19360 }, { "epoch": 1.12, "grad_norm": 140.5040740966797, "learning_rate": 0.0006280815821045707, "logits/chosen": -16.159740447998047, "logits/rejected": -16.093290328979492, "logps/chosen": -2722.93115234375, "logps/rejected": -2486.173583984375, "loss": 14.2275, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -165.2968292236328, "rewards/margins": 2.752591609954834, "rewards/rejected": -168.0494384765625, "step": 19370 }, { "epoch": 1.12, "grad_norm": 3.1745844353281427e-06, "learning_rate": 0.0006278880761639383, "logits/chosen": -14.205159187316895, "logits/rejected": -14.439480781555176, "logps/chosen": -2682.63916015625, "logps/rejected": -2462.609375, "loss": 3.0944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.95884704589844, "rewards/margins": 7.7285475730896, "rewards/rejected": -177.68740844726562, "step": 19380 }, { "epoch": 1.12, "grad_norm": 78.8459701538086, "learning_rate": 0.0006276945702233059, "logits/chosen": -12.34683895111084, "logits/rejected": -12.436210632324219, "logps/chosen": -2497.523193359375, "logps/rejected": -2500.91162109375, "loss": 7.2561, "rewards/accuracies": 0.5, "rewards/chosen": -106.44621276855469, "rewards/margins": -1.7646820545196533, "rewards/rejected": -104.68153381347656, "step": 19390 }, { "epoch": 1.12, "grad_norm": 55.75485610961914, "learning_rate": 0.0006275010642826735, "logits/chosen": -20.967960357666016, "logits/rejected": -21.510982513427734, "logps/chosen": -2180.215087890625, "logps/rejected": -2270.02783203125, "loss": 1.4561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -191.6199493408203, "rewards/margins": 15.978497505187988, "rewards/rejected": -207.5984344482422, "step": 19400 }, { "epoch": 1.12, "grad_norm": 47.21528244018555, "learning_rate": 0.0006273075583420411, "logits/chosen": -13.820161819458008, "logits/rejected": -13.973386764526367, "logps/chosen": -2617.89501953125, "logps/rejected": -2164.485595703125, "loss": 6.9917, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.5231704711914, "rewards/margins": -3.5759308338165283, "rewards/rejected": -109.9472427368164, "step": 19410 }, { "epoch": 1.12, "grad_norm": 106.64588928222656, "learning_rate": 0.0006271140524014087, "logits/chosen": -17.24288558959961, "logits/rejected": -17.669048309326172, "logps/chosen": -2347.7177734375, "logps/rejected": -2435.04345703125, "loss": 4.5273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.9317626953125, "rewards/margins": 15.090255737304688, "rewards/rejected": -161.02200317382812, "step": 19420 }, { "epoch": 1.12, "grad_norm": 0.1460665911436081, "learning_rate": 0.0006269205464607764, "logits/chosen": -16.253292083740234, "logits/rejected": -16.492340087890625, "logps/chosen": -2557.147216796875, "logps/rejected": -2740.31884765625, "loss": 5.1201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.69522094726562, "rewards/margins": 9.935577392578125, "rewards/rejected": -165.63079833984375, "step": 19430 }, { "epoch": 1.13, "grad_norm": 2.3401637077331543, "learning_rate": 0.000626727040520144, "logits/chosen": -21.565025329589844, "logits/rejected": -21.251720428466797, "logps/chosen": -2667.57177734375, "logps/rejected": -2429.85498046875, "loss": 3.1933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -166.6869354248047, "rewards/margins": 12.030401229858398, "rewards/rejected": -178.7173614501953, "step": 19440 }, { "epoch": 1.13, "grad_norm": 75.53313446044922, "learning_rate": 0.0006265335345795116, "logits/chosen": -14.68385124206543, "logits/rejected": -14.284799575805664, "logps/chosen": -2957.99072265625, "logps/rejected": -2657.02880859375, "loss": 9.6323, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -164.78680419921875, "rewards/margins": -4.064880847930908, "rewards/rejected": -160.721923828125, "step": 19450 }, { "epoch": 1.13, "grad_norm": 11.480693817138672, "learning_rate": 0.0006263400286388792, "logits/chosen": -15.491783142089844, "logits/rejected": -15.970993041992188, "logps/chosen": -2874.22998046875, "logps/rejected": -2593.79150390625, "loss": 10.0604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.19113159179688, "rewards/margins": 4.152002334594727, "rewards/rejected": -178.3431396484375, "step": 19460 }, { "epoch": 1.13, "grad_norm": 36.773624420166016, "learning_rate": 0.0006261465226982469, "logits/chosen": -16.5140323638916, "logits/rejected": -16.750843048095703, "logps/chosen": -2763.96923828125, "logps/rejected": -2699.44677734375, "loss": 5.1447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.62100219726562, "rewards/margins": 4.510872840881348, "rewards/rejected": -137.1318817138672, "step": 19470 }, { "epoch": 1.13, "grad_norm": 183.4235382080078, "learning_rate": 0.0006259530167576146, "logits/chosen": -15.515497207641602, "logits/rejected": -16.04322052001953, "logps/chosen": -2841.53955078125, "logps/rejected": -2504.586669921875, "loss": 6.7439, "rewards/accuracies": 0.5, "rewards/chosen": -175.8154754638672, "rewards/margins": 3.5575408935546875, "rewards/rejected": -179.37301635742188, "step": 19480 }, { "epoch": 1.13, "grad_norm": 17.62044906616211, "learning_rate": 0.0006257595108169822, "logits/chosen": -15.282984733581543, "logits/rejected": -15.42530632019043, "logps/chosen": -2515.30126953125, "logps/rejected": -2415.899169921875, "loss": 14.6574, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -166.10067749023438, "rewards/margins": -13.387290954589844, "rewards/rejected": -152.71339416503906, "step": 19490 }, { "epoch": 1.13, "grad_norm": 19.355802536010742, "learning_rate": 0.0006255660048763497, "logits/chosen": -16.45159912109375, "logits/rejected": -16.71609878540039, "logps/chosen": -2895.394775390625, "logps/rejected": -2807.540283203125, "loss": 5.2206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.01260375976562, "rewards/margins": 8.107523918151855, "rewards/rejected": -187.12013244628906, "step": 19500 }, { "epoch": 1.13, "grad_norm": 2.1128425942151807e-07, "learning_rate": 0.0006253724989357173, "logits/chosen": -18.420644760131836, "logits/rejected": -20.578533172607422, "logps/chosen": -2610.8115234375, "logps/rejected": -2400.20263671875, "loss": 18.3663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.93826293945312, "rewards/margins": -8.610776901245117, "rewards/rejected": -182.32748413085938, "step": 19510 }, { "epoch": 1.13, "grad_norm": 22.25389862060547, "learning_rate": 0.0006251789929950849, "logits/chosen": -17.115005493164062, "logits/rejected": -17.01926040649414, "logps/chosen": -2910.776611328125, "logps/rejected": -2894.355712890625, "loss": 0.66, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.58029174804688, "rewards/margins": 18.498998641967773, "rewards/rejected": -173.0792999267578, "step": 19520 }, { "epoch": 1.13, "grad_norm": 0.0046583497896790504, "learning_rate": 0.0006249854870544525, "logits/chosen": -15.095239639282227, "logits/rejected": -15.121915817260742, "logps/chosen": -2876.569580078125, "logps/rejected": -2633.6162109375, "loss": 9.4888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.28182983398438, "rewards/margins": 7.27727746963501, "rewards/rejected": -192.55911254882812, "step": 19530 }, { "epoch": 1.13, "grad_norm": 40.48994064331055, "learning_rate": 0.0006247919811138202, "logits/chosen": -17.23662567138672, "logits/rejected": -17.08550262451172, "logps/chosen": -2760.3828125, "logps/rejected": -2581.943359375, "loss": 15.9894, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -238.5589599609375, "rewards/margins": -7.4201860427856445, "rewards/rejected": -231.13876342773438, "step": 19540 }, { "epoch": 1.13, "grad_norm": 4.371961040305905e-06, "learning_rate": 0.0006245984751731878, "logits/chosen": -12.544698715209961, "logits/rejected": -13.017370223999023, "logps/chosen": -3091.31396484375, "logps/rejected": -3085.56982421875, "loss": 7.3935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.18197631835938, "rewards/margins": 4.401679039001465, "rewards/rejected": -166.58363342285156, "step": 19550 }, { "epoch": 1.13, "grad_norm": 88.56646728515625, "learning_rate": 0.0006244049692325554, "logits/chosen": -10.044931411743164, "logits/rejected": -9.45326042175293, "logps/chosen": -3078.5224609375, "logps/rejected": -2638.32421875, "loss": 15.9246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -67.81315612792969, "rewards/margins": -4.803783893585205, "rewards/rejected": -63.009376525878906, "step": 19560 }, { "epoch": 1.13, "grad_norm": 46.60211181640625, "learning_rate": 0.000624211463291923, "logits/chosen": -11.807806015014648, "logits/rejected": -11.693695068359375, "logps/chosen": -2918.14306640625, "logps/rejected": -2902.70166015625, "loss": 1.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.35101318359375, "rewards/margins": 8.807394027709961, "rewards/rejected": -127.15840148925781, "step": 19570 }, { "epoch": 1.13, "grad_norm": 1.2923531077349603e-16, "learning_rate": 0.0006240179573512907, "logits/chosen": -11.745552062988281, "logits/rejected": -12.976590156555176, "logps/chosen": -2793.16259765625, "logps/rejected": -3071.03955078125, "loss": 8.2966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.93690490722656, "rewards/margins": 5.049465656280518, "rewards/rejected": -159.98635864257812, "step": 19580 }, { "epoch": 1.13, "grad_norm": 0.00017286604270339012, "learning_rate": 0.0006238244514106583, "logits/chosen": -11.815695762634277, "logits/rejected": -11.966728210449219, "logps/chosen": -3374.848876953125, "logps/rejected": -3177.6953125, "loss": 5.6802, "rewards/accuracies": 0.5, "rewards/chosen": -140.54867553710938, "rewards/margins": 2.2381255626678467, "rewards/rejected": -142.7867889404297, "step": 19590 }, { "epoch": 1.13, "grad_norm": 2.350482482260663e-11, "learning_rate": 0.000623630945470026, "logits/chosen": -11.714215278625488, "logits/rejected": -11.953948020935059, "logps/chosen": -3110.521240234375, "logps/rejected": -3070.3896484375, "loss": 0.5547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -124.7431411743164, "rewards/margins": 16.718639373779297, "rewards/rejected": -141.46176147460938, "step": 19600 }, { "epoch": 1.14, "grad_norm": 4.394644737243652, "learning_rate": 0.0006234374395293936, "logits/chosen": -15.77540111541748, "logits/rejected": -15.807843208312988, "logps/chosen": -2835.657958984375, "logps/rejected": -2872.884033203125, "loss": 4.3045, "rewards/accuracies": 0.5, "rewards/chosen": -207.2449951171875, "rewards/margins": 3.811746120452881, "rewards/rejected": -211.0567626953125, "step": 19610 }, { "epoch": 1.14, "grad_norm": 0.00027117342688143253, "learning_rate": 0.0006232439335887612, "logits/chosen": -16.862661361694336, "logits/rejected": -16.7922420501709, "logps/chosen": -2554.833740234375, "logps/rejected": -2653.023193359375, "loss": 10.2036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.5516357421875, "rewards/margins": 5.151959419250488, "rewards/rejected": -205.70361328125, "step": 19620 }, { "epoch": 1.14, "grad_norm": 0.0, "learning_rate": 0.0006230504276481288, "logits/chosen": -15.001005172729492, "logits/rejected": -15.397730827331543, "logps/chosen": -2584.983642578125, "logps/rejected": -2209.90478515625, "loss": 3.175, "rewards/accuracies": 0.5, "rewards/chosen": -155.70498657226562, "rewards/margins": 6.682043552398682, "rewards/rejected": -162.3870391845703, "step": 19630 }, { "epoch": 1.14, "grad_norm": 1.5000466146997127e-11, "learning_rate": 0.0006228569217074964, "logits/chosen": -13.550073623657227, "logits/rejected": -13.651281356811523, "logps/chosen": -3084.353515625, "logps/rejected": -2719.6337890625, "loss": 2.5783, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.2141876220703, "rewards/margins": 17.142139434814453, "rewards/rejected": -149.35633850097656, "step": 19640 }, { "epoch": 1.14, "grad_norm": 63.93633270263672, "learning_rate": 0.000622663415766864, "logits/chosen": -14.389155387878418, "logits/rejected": -14.693142890930176, "logps/chosen": -3012.537353515625, "logps/rejected": -2725.93994140625, "loss": 9.7766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.3124542236328, "rewards/margins": 8.995356559753418, "rewards/rejected": -155.30780029296875, "step": 19650 }, { "epoch": 1.14, "grad_norm": 8.714898314110542e-08, "learning_rate": 0.0006224699098262317, "logits/chosen": -19.85348129272461, "logits/rejected": -19.88248062133789, "logps/chosen": -2463.520263671875, "logps/rejected": -2486.387451171875, "loss": 6.2056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -217.53073120117188, "rewards/margins": 3.9833457469940186, "rewards/rejected": -221.51406860351562, "step": 19660 }, { "epoch": 1.14, "grad_norm": 0.0, "learning_rate": 0.0006222764038855993, "logits/chosen": -20.453411102294922, "logits/rejected": -20.851125717163086, "logps/chosen": -2655.94482421875, "logps/rejected": -2248.94677734375, "loss": 37.6356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.56704711914062, "rewards/margins": -19.251140594482422, "rewards/rejected": -162.31590270996094, "step": 19670 }, { "epoch": 1.14, "grad_norm": 34.66673278808594, "learning_rate": 0.000622082897944967, "logits/chosen": -16.956066131591797, "logits/rejected": -17.05416488647461, "logps/chosen": -2844.82373046875, "logps/rejected": -2936.86767578125, "loss": 2.5097, "rewards/accuracies": 0.5, "rewards/chosen": -150.36273193359375, "rewards/margins": 2.414205551147461, "rewards/rejected": -152.77694702148438, "step": 19680 }, { "epoch": 1.14, "grad_norm": 0.00020828229025937617, "learning_rate": 0.0006218893920043346, "logits/chosen": -15.545659065246582, "logits/rejected": -15.445856094360352, "logps/chosen": -2981.844970703125, "logps/rejected": -2890.6484375, "loss": 2.1974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.9605255126953, "rewards/margins": 6.6658196449279785, "rewards/rejected": -138.6263427734375, "step": 19690 }, { "epoch": 1.14, "grad_norm": 7.415769188061366e-20, "learning_rate": 0.0006216958860637022, "logits/chosen": -16.458457946777344, "logits/rejected": -16.380109786987305, "logps/chosen": -3054.701904296875, "logps/rejected": -2687.05859375, "loss": 7.2926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.33645629882812, "rewards/margins": 7.4609503746032715, "rewards/rejected": -196.79737854003906, "step": 19700 }, { "epoch": 1.14, "grad_norm": 118.32626342773438, "learning_rate": 0.0006215023801230699, "logits/chosen": -19.386295318603516, "logits/rejected": -19.969470977783203, "logps/chosen": -2675.273193359375, "logps/rejected": -2724.322021484375, "loss": 8.9005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -202.70997619628906, "rewards/margins": 21.322526931762695, "rewards/rejected": -224.03250122070312, "step": 19710 }, { "epoch": 1.14, "grad_norm": 3.8271628000075e-06, "learning_rate": 0.0006213088741824374, "logits/chosen": -19.4138240814209, "logits/rejected": -20.58106231689453, "logps/chosen": -2779.5869140625, "logps/rejected": -2660.256103515625, "loss": 4.5552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.5440216064453, "rewards/margins": 17.74907875061035, "rewards/rejected": -171.29307556152344, "step": 19720 }, { "epoch": 1.14, "grad_norm": 40.93009948730469, "learning_rate": 0.000621115368241805, "logits/chosen": -18.116369247436523, "logits/rejected": -18.490148544311523, "logps/chosen": -2974.66357421875, "logps/rejected": -2368.3671875, "loss": 2.3077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.29777526855469, "rewards/margins": 10.587071418762207, "rewards/rejected": -129.88485717773438, "step": 19730 }, { "epoch": 1.14, "grad_norm": 144.18658447265625, "learning_rate": 0.0006209218623011726, "logits/chosen": -19.00188446044922, "logits/rejected": -19.72186279296875, "logps/chosen": -2714.67529296875, "logps/rejected": -2897.604736328125, "loss": 11.144, "rewards/accuracies": 0.5, "rewards/chosen": -192.58755493164062, "rewards/margins": -6.433676719665527, "rewards/rejected": -186.15390014648438, "step": 19740 }, { "epoch": 1.14, "grad_norm": 102.32373809814453, "learning_rate": 0.0006207283563605402, "logits/chosen": -17.095256805419922, "logits/rejected": -17.580575942993164, "logps/chosen": -3220.395263671875, "logps/rejected": -3029.084716796875, "loss": 11.4751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -218.6730499267578, "rewards/margins": -1.5084556341171265, "rewards/rejected": -217.1645965576172, "step": 19750 }, { "epoch": 1.14, "grad_norm": 42.502567291259766, "learning_rate": 0.0006205348504199078, "logits/chosen": -13.75421142578125, "logits/rejected": -14.297955513000488, "logps/chosen": -3272.26611328125, "logps/rejected": -3317.401611328125, "loss": 5.4023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -218.6024932861328, "rewards/margins": 3.3348934650421143, "rewards/rejected": -221.93734741210938, "step": 19760 }, { "epoch": 1.14, "grad_norm": 55.408416748046875, "learning_rate": 0.0006203413444792755, "logits/chosen": -13.693445205688477, "logits/rejected": -13.542327880859375, "logps/chosen": -3278.43896484375, "logps/rejected": -2561.609130859375, "loss": 46.6664, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -184.37649536132812, "rewards/margins": -39.74439239501953, "rewards/rejected": -144.63206481933594, "step": 19770 }, { "epoch": 1.14, "grad_norm": 4.825416867297594e-13, "learning_rate": 0.0006201478385386431, "logits/chosen": -12.993520736694336, "logits/rejected": -12.83044147491455, "logps/chosen": -2758.0966796875, "logps/rejected": -2764.573974609375, "loss": 6.7085, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -163.46975708007812, "rewards/margins": 0.8814998865127563, "rewards/rejected": -164.35125732421875, "step": 19780 }, { "epoch": 1.15, "grad_norm": 0.06911731511354446, "learning_rate": 0.0006199543325980108, "logits/chosen": -15.214869499206543, "logits/rejected": -15.286233901977539, "logps/chosen": -2340.47802734375, "logps/rejected": -2404.06689453125, "loss": 3.0783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.16775512695312, "rewards/margins": 7.4605865478515625, "rewards/rejected": -174.62835693359375, "step": 19790 }, { "epoch": 1.15, "grad_norm": 2.35551215155283e-05, "learning_rate": 0.0006197608266573784, "logits/chosen": -14.513033866882324, "logits/rejected": -14.98468017578125, "logps/chosen": -2591.433837890625, "logps/rejected": -2256.78955078125, "loss": 13.3711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.3653564453125, "rewards/margins": 6.603140354156494, "rewards/rejected": -185.968505859375, "step": 19800 }, { "epoch": 1.15, "grad_norm": 61.29496383666992, "learning_rate": 0.000619567320716746, "logits/chosen": -13.267271041870117, "logits/rejected": -13.66491413116455, "logps/chosen": -2333.473876953125, "logps/rejected": -1924.7808837890625, "loss": 3.4936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -60.46259307861328, "rewards/margins": 29.929576873779297, "rewards/rejected": -90.39216613769531, "step": 19810 }, { "epoch": 1.15, "grad_norm": 2.036829948425293, "learning_rate": 0.0006193738147761137, "logits/chosen": -13.067667007446289, "logits/rejected": -13.893949508666992, "logps/chosen": -2842.249755859375, "logps/rejected": -2232.703369140625, "loss": 24.4274, "rewards/accuracies": 0.5, "rewards/chosen": -141.47503662109375, "rewards/margins": -8.790369987487793, "rewards/rejected": -132.68467712402344, "step": 19820 }, { "epoch": 1.15, "grad_norm": 8.372680895263329e-06, "learning_rate": 0.0006191803088354813, "logits/chosen": -15.180795669555664, "logits/rejected": -15.299310684204102, "logps/chosen": -2570.895751953125, "logps/rejected": -2512.12548828125, "loss": 5.2187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.57882690429688, "rewards/margins": 21.943466186523438, "rewards/rejected": -165.5222930908203, "step": 19830 }, { "epoch": 1.15, "grad_norm": 350.7220458984375, "learning_rate": 0.0006189868028948489, "logits/chosen": -14.018875122070312, "logits/rejected": -14.1709566116333, "logps/chosen": -2427.014892578125, "logps/rejected": -2265.755126953125, "loss": 4.2001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -110.80721282958984, "rewards/margins": 9.493544578552246, "rewards/rejected": -120.3007583618164, "step": 19840 }, { "epoch": 1.15, "grad_norm": 385.5422058105469, "learning_rate": 0.0006187932969542165, "logits/chosen": -12.545807838439941, "logits/rejected": -12.799861907958984, "logps/chosen": -2581.45263671875, "logps/rejected": -2684.99560546875, "loss": 23.6338, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -107.65052795410156, "rewards/margins": -16.940784454345703, "rewards/rejected": -90.7097396850586, "step": 19850 }, { "epoch": 1.15, "grad_norm": 11.956576347351074, "learning_rate": 0.0006185997910135841, "logits/chosen": -16.500944137573242, "logits/rejected": -16.295204162597656, "logps/chosen": -2609.193359375, "logps/rejected": -2469.54248046875, "loss": 11.4365, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -155.41432189941406, "rewards/margins": -6.907958030700684, "rewards/rejected": -148.50637817382812, "step": 19860 }, { "epoch": 1.15, "grad_norm": 2.974218887175084e-06, "learning_rate": 0.0006184062850729517, "logits/chosen": -16.597579956054688, "logits/rejected": -17.21363639831543, "logps/chosen": -2954.570068359375, "logps/rejected": -2605.559326171875, "loss": 17.7842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.91195678710938, "rewards/margins": -10.195880889892578, "rewards/rejected": -144.716064453125, "step": 19870 }, { "epoch": 1.15, "grad_norm": 62.77283477783203, "learning_rate": 0.0006182127791323194, "logits/chosen": -19.2283878326416, "logits/rejected": -20.02178955078125, "logps/chosen": -2239.52978515625, "logps/rejected": -2151.20947265625, "loss": 13.0029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.38076782226562, "rewards/margins": 3.7871384620666504, "rewards/rejected": -198.16790771484375, "step": 19880 }, { "epoch": 1.15, "grad_norm": 2.9809418822246414e-12, "learning_rate": 0.0006180192731916871, "logits/chosen": -17.39337921142578, "logits/rejected": -17.216197967529297, "logps/chosen": -1987.574462890625, "logps/rejected": -1565.190673828125, "loss": 33.2935, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -152.4235076904297, "rewards/margins": -24.666780471801758, "rewards/rejected": -127.75672912597656, "step": 19890 }, { "epoch": 1.15, "grad_norm": 83.8868637084961, "learning_rate": 0.0006178257672510547, "logits/chosen": -14.736494064331055, "logits/rejected": -14.536578178405762, "logps/chosen": -2919.948974609375, "logps/rejected": -2351.359375, "loss": 11.2664, "rewards/accuracies": 0.5, "rewards/chosen": -135.27749633789062, "rewards/margins": -5.200392246246338, "rewards/rejected": -130.07708740234375, "step": 19900 }, { "epoch": 1.15, "grad_norm": 0.002568154828622937, "learning_rate": 0.0006176322613104223, "logits/chosen": -25.861431121826172, "logits/rejected": -28.08322525024414, "logps/chosen": -2563.76220703125, "logps/rejected": -2616.15869140625, "loss": 8.9777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -230.9813232421875, "rewards/margins": 10.196632385253906, "rewards/rejected": -241.177978515625, "step": 19910 }, { "epoch": 1.15, "grad_norm": 4.2255536084119744e-20, "learning_rate": 0.0006174387553697899, "logits/chosen": -16.006683349609375, "logits/rejected": -16.31097412109375, "logps/chosen": -3332.211669921875, "logps/rejected": -2844.42529296875, "loss": 4.2412, "rewards/accuracies": 0.5, "rewards/chosen": -76.37052154541016, "rewards/margins": 8.509567260742188, "rewards/rejected": -84.88008117675781, "step": 19920 }, { "epoch": 1.15, "grad_norm": 0.002390842651948333, "learning_rate": 0.0006172452494291575, "logits/chosen": -17.254146575927734, "logits/rejected": -17.49107551574707, "logps/chosen": -2290.20654296875, "logps/rejected": -2359.667236328125, "loss": 3.0516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -156.63937377929688, "rewards/margins": 15.176325798034668, "rewards/rejected": -171.81570434570312, "step": 19930 }, { "epoch": 1.15, "grad_norm": 27.226673126220703, "learning_rate": 0.0006170517434885251, "logits/chosen": -15.071316719055176, "logits/rejected": -15.84705638885498, "logps/chosen": -3029.069580078125, "logps/rejected": -2497.54052734375, "loss": 6.1936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.10794830322266, "rewards/margins": 5.076468467712402, "rewards/rejected": -126.18440246582031, "step": 19940 }, { "epoch": 1.15, "grad_norm": 0.0, "learning_rate": 0.0006168582375478927, "logits/chosen": -17.209449768066406, "logits/rejected": -17.542686462402344, "logps/chosen": -2877.793701171875, "logps/rejected": -2343.64111328125, "loss": 17.0016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -137.70034790039062, "rewards/margins": 0.7019516229629517, "rewards/rejected": -138.4022979736328, "step": 19950 }, { "epoch": 1.16, "grad_norm": 1.3981392612549826e-06, "learning_rate": 0.0006166647316072603, "logits/chosen": -18.19814109802246, "logits/rejected": -18.558931350708008, "logps/chosen": -2596.57666015625, "logps/rejected": -2453.3681640625, "loss": 3.729, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.10043334960938, "rewards/margins": 10.570229530334473, "rewards/rejected": -177.67068481445312, "step": 19960 }, { "epoch": 1.16, "grad_norm": 123.90950775146484, "learning_rate": 0.0006164712256666279, "logits/chosen": -15.148755073547363, "logits/rejected": -15.611207962036133, "logps/chosen": -2538.32763671875, "logps/rejected": -2469.8466796875, "loss": 7.2509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.91769409179688, "rewards/margins": 9.091774940490723, "rewards/rejected": -150.00946044921875, "step": 19970 }, { "epoch": 1.16, "grad_norm": 0.0028655182104557753, "learning_rate": 0.0006162777197259955, "logits/chosen": -13.270441055297852, "logits/rejected": -13.422805786132812, "logps/chosen": -2539.37060546875, "logps/rejected": -2692.590576171875, "loss": 4.849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -72.40985107421875, "rewards/margins": 7.63655948638916, "rewards/rejected": -80.0464096069336, "step": 19980 }, { "epoch": 1.16, "grad_norm": 0.005117921624332666, "learning_rate": 0.0006160842137853633, "logits/chosen": -15.59038257598877, "logits/rejected": -15.309591293334961, "logps/chosen": -2857.763427734375, "logps/rejected": -2894.41259765625, "loss": 9.6602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.96511840820312, "rewards/margins": 2.1138949394226074, "rewards/rejected": -141.07899475097656, "step": 19990 }, { "epoch": 1.16, "grad_norm": 68.65511322021484, "learning_rate": 0.0006158907078447309, "logits/chosen": -16.036449432373047, "logits/rejected": -16.265151977539062, "logps/chosen": -2346.26416015625, "logps/rejected": -2279.1875, "loss": 12.0948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.01260375976562, "rewards/margins": -1.6135371923446655, "rewards/rejected": -169.39907836914062, "step": 20000 }, { "epoch": 1.16, "grad_norm": 1.9108176275040023e-05, "learning_rate": 0.0006156972019040985, "logits/chosen": -17.491724014282227, "logits/rejected": -17.12764549255371, "logps/chosen": -2691.24560546875, "logps/rejected": -2716.040283203125, "loss": 6.6714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -209.55252075195312, "rewards/margins": 2.245281934738159, "rewards/rejected": -211.7978057861328, "step": 20010 }, { "epoch": 1.16, "grad_norm": 95.08837127685547, "learning_rate": 0.0006155036959634661, "logits/chosen": -14.629284858703613, "logits/rejected": -14.64678955078125, "logps/chosen": -3215.821044921875, "logps/rejected": -2917.45849609375, "loss": 11.591, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.38943481445312, "rewards/margins": -1.1255238056182861, "rewards/rejected": -198.26388549804688, "step": 20020 }, { "epoch": 1.16, "grad_norm": 170.10643005371094, "learning_rate": 0.0006153101900228337, "logits/chosen": -12.469456672668457, "logits/rejected": -12.387757301330566, "logps/chosen": -2915.409423828125, "logps/rejected": -2519.754150390625, "loss": 9.7413, "rewards/accuracies": 0.5, "rewards/chosen": -93.42688751220703, "rewards/margins": -3.5413124561309814, "rewards/rejected": -89.88557434082031, "step": 20030 }, { "epoch": 1.16, "grad_norm": 65.03350830078125, "learning_rate": 0.0006151166840822013, "logits/chosen": -10.835196495056152, "logits/rejected": -10.820402145385742, "logps/chosen": -2616.473388671875, "logps/rejected": -2147.89892578125, "loss": 5.698, "rewards/accuracies": 0.5, "rewards/chosen": -24.055919647216797, "rewards/margins": 14.480610847473145, "rewards/rejected": -38.536529541015625, "step": 20040 }, { "epoch": 1.16, "grad_norm": 110.29104614257812, "learning_rate": 0.000614923178141569, "logits/chosen": -12.317069053649902, "logits/rejected": -12.38090705871582, "logps/chosen": -2765.935546875, "logps/rejected": -2748.26123046875, "loss": 16.8108, "rewards/accuracies": 0.5, "rewards/chosen": -112.86199951171875, "rewards/margins": -12.058117866516113, "rewards/rejected": -100.80388641357422, "step": 20050 }, { "epoch": 1.16, "grad_norm": 1.815095990176374e-11, "learning_rate": 0.0006147296722009366, "logits/chosen": -10.463661193847656, "logits/rejected": -10.532508850097656, "logps/chosen": -3077.832763671875, "logps/rejected": -2900.785888671875, "loss": 7.0418, "rewards/accuracies": 0.5, "rewards/chosen": -63.952484130859375, "rewards/margins": 3.753079891204834, "rewards/rejected": -67.70555877685547, "step": 20060 }, { "epoch": 1.16, "grad_norm": 0.00018911912047769874, "learning_rate": 0.0006145361662603042, "logits/chosen": -15.353672981262207, "logits/rejected": -15.722122192382812, "logps/chosen": -2570.6513671875, "logps/rejected": -2511.00634765625, "loss": 0.5906, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -195.47323608398438, "rewards/margins": 22.390342712402344, "rewards/rejected": -217.8636016845703, "step": 20070 }, { "epoch": 1.16, "grad_norm": 6.915413450769847e-07, "learning_rate": 0.0006143426603196718, "logits/chosen": -13.620956420898438, "logits/rejected": -13.593233108520508, "logps/chosen": -2635.151123046875, "logps/rejected": -2491.076171875, "loss": 20.3119, "rewards/accuracies": 0.5, "rewards/chosen": -232.6372833251953, "rewards/margins": -16.205583572387695, "rewards/rejected": -216.4317169189453, "step": 20080 }, { "epoch": 1.16, "grad_norm": 144.28793334960938, "learning_rate": 0.0006141491543790394, "logits/chosen": -12.679971694946289, "logits/rejected": -12.746530532836914, "logps/chosen": -2980.584716796875, "logps/rejected": -2364.044677734375, "loss": 15.7615, "rewards/accuracies": 0.5, "rewards/chosen": -121.88780212402344, "rewards/margins": -3.4790923595428467, "rewards/rejected": -118.4087142944336, "step": 20090 }, { "epoch": 1.16, "grad_norm": 0.0040819901041686535, "learning_rate": 0.0006139556484384071, "logits/chosen": -14.22210693359375, "logits/rejected": -14.388445854187012, "logps/chosen": -2820.94970703125, "logps/rejected": -2233.957275390625, "loss": 11.4792, "rewards/accuracies": 0.5, "rewards/chosen": -105.7323989868164, "rewards/margins": 5.022300720214844, "rewards/rejected": -110.75468444824219, "step": 20100 }, { "epoch": 1.16, "grad_norm": 0.013222110457718372, "learning_rate": 0.0006137621424977748, "logits/chosen": -15.503179550170898, "logits/rejected": -15.355018615722656, "logps/chosen": -2305.55419921875, "logps/rejected": -2278.848388671875, "loss": 32.9273, "rewards/accuracies": 0.5, "rewards/chosen": -189.89703369140625, "rewards/margins": -25.016971588134766, "rewards/rejected": -164.88006591796875, "step": 20110 }, { "epoch": 1.16, "grad_norm": 60.764549255371094, "learning_rate": 0.0006135686365571424, "logits/chosen": -16.37885093688965, "logits/rejected": -16.406604766845703, "logps/chosen": -2677.92626953125, "logps/rejected": -2665.311767578125, "loss": 7.3595, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -197.5130615234375, "rewards/margins": -2.0060336589813232, "rewards/rejected": -195.5070343017578, "step": 20120 }, { "epoch": 1.17, "grad_norm": 29.929832458496094, "learning_rate": 0.00061337513061651, "logits/chosen": -14.806381225585938, "logits/rejected": -14.697613716125488, "logps/chosen": -3015.19921875, "logps/rejected": -3003.931640625, "loss": 5.7499, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -194.9720916748047, "rewards/margins": -1.1083520650863647, "rewards/rejected": -193.86373901367188, "step": 20130 }, { "epoch": 1.17, "grad_norm": 11.588482856750488, "learning_rate": 0.0006131816246758776, "logits/chosen": -15.727998733520508, "logits/rejected": -15.972386360168457, "logps/chosen": -3414.561767578125, "logps/rejected": -3214.98779296875, "loss": 3.8772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.57888793945312, "rewards/margins": 18.277647018432617, "rewards/rejected": -147.85653686523438, "step": 20140 }, { "epoch": 1.17, "grad_norm": 5.219742433837382e-06, "learning_rate": 0.0006129881187352452, "logits/chosen": -15.658291816711426, "logits/rejected": -15.827792167663574, "logps/chosen": -2930.553466796875, "logps/rejected": -2667.119140625, "loss": 22.9939, "rewards/accuracies": 0.5, "rewards/chosen": -183.8375701904297, "rewards/margins": -15.680910110473633, "rewards/rejected": -168.15664672851562, "step": 20150 }, { "epoch": 1.17, "grad_norm": 0.003738041967153549, "learning_rate": 0.0006127946127946127, "logits/chosen": -17.202030181884766, "logits/rejected": -17.241161346435547, "logps/chosen": -2889.200927734375, "logps/rejected": -2765.58349609375, "loss": 14.1785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -196.6731719970703, "rewards/margins": -4.9071760177612305, "rewards/rejected": -191.76597595214844, "step": 20160 }, { "epoch": 1.17, "grad_norm": 54.505313873291016, "learning_rate": 0.0006126011068539804, "logits/chosen": -17.229900360107422, "logits/rejected": -17.55476951599121, "logps/chosen": -2556.81591796875, "logps/rejected": -2488.37158203125, "loss": 10.9461, "rewards/accuracies": 0.5, "rewards/chosen": -188.19813537597656, "rewards/margins": -3.200833797454834, "rewards/rejected": -184.99729919433594, "step": 20170 }, { "epoch": 1.17, "grad_norm": 1.2530064582824707, "learning_rate": 0.000612407600913348, "logits/chosen": -15.745837211608887, "logits/rejected": -15.920193672180176, "logps/chosen": -2530.3583984375, "logps/rejected": -2149.14599609375, "loss": 18.9318, "rewards/accuracies": 0.5, "rewards/chosen": -184.76055908203125, "rewards/margins": -15.516975402832031, "rewards/rejected": -169.24359130859375, "step": 20180 }, { "epoch": 1.17, "grad_norm": 46.62128448486328, "learning_rate": 0.0006122140949727156, "logits/chosen": -14.489463806152344, "logits/rejected": -15.178335189819336, "logps/chosen": -2466.8720703125, "logps/rejected": -2182.20458984375, "loss": 18.4531, "rewards/accuracies": 0.5, "rewards/chosen": -104.46406555175781, "rewards/margins": -13.110818862915039, "rewards/rejected": -91.3532485961914, "step": 20190 }, { "epoch": 1.17, "grad_norm": 1135.0618896484375, "learning_rate": 0.0006120205890320833, "logits/chosen": -12.934709548950195, "logits/rejected": -13.077760696411133, "logps/chosen": -2406.5439453125, "logps/rejected": -2311.789306640625, "loss": 5.9388, "rewards/accuracies": 0.5, "rewards/chosen": -113.379638671875, "rewards/margins": 7.584500789642334, "rewards/rejected": -120.9641342163086, "step": 20200 }, { "epoch": 1.17, "grad_norm": 192.14100646972656, "learning_rate": 0.0006118270830914509, "logits/chosen": -11.777300834655762, "logits/rejected": -11.979267120361328, "logps/chosen": -3071.34326171875, "logps/rejected": -2801.788818359375, "loss": 14.9258, "rewards/accuracies": 0.5, "rewards/chosen": -161.09669494628906, "rewards/margins": -5.339669704437256, "rewards/rejected": -155.75701904296875, "step": 20210 }, { "epoch": 1.17, "grad_norm": 26.7023868560791, "learning_rate": 0.0006116335771508186, "logits/chosen": -16.10049057006836, "logits/rejected": -16.042695999145508, "logps/chosen": -2147.556640625, "logps/rejected": -2215.195556640625, "loss": 5.6047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.53541564941406, "rewards/margins": 11.764396667480469, "rewards/rejected": -128.29978942871094, "step": 20220 }, { "epoch": 1.17, "grad_norm": 0.0017848144052550197, "learning_rate": 0.0006114400712101862, "logits/chosen": -15.464391708374023, "logits/rejected": -16.04399299621582, "logps/chosen": -2584.283935546875, "logps/rejected": -2639.47314453125, "loss": 22.2455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.21507263183594, "rewards/margins": -14.352335929870605, "rewards/rejected": -104.86273193359375, "step": 20230 }, { "epoch": 1.17, "grad_norm": 1.3141905341740312e-09, "learning_rate": 0.0006112465652695538, "logits/chosen": -15.319307327270508, "logits/rejected": -15.247509956359863, "logps/chosen": -2342.95166015625, "logps/rejected": -2353.02490234375, "loss": 8.0289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -130.9525604248047, "rewards/margins": 1.7664365768432617, "rewards/rejected": -132.71902465820312, "step": 20240 }, { "epoch": 1.17, "grad_norm": 86.20774841308594, "learning_rate": 0.0006110530593289214, "logits/chosen": -17.48652458190918, "logits/rejected": -17.382038116455078, "logps/chosen": -3008.14697265625, "logps/rejected": -2683.576416015625, "loss": 18.6825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.36184692382812, "rewards/margins": -6.594300746917725, "rewards/rejected": -161.7675323486328, "step": 20250 }, { "epoch": 1.17, "grad_norm": 1.4496090412139893, "learning_rate": 0.000610859553388289, "logits/chosen": -17.766849517822266, "logits/rejected": -18.18083381652832, "logps/chosen": -2771.329833984375, "logps/rejected": -2610.018310546875, "loss": 11.2211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.02978515625, "rewards/margins": 1.322749376296997, "rewards/rejected": -172.3525390625, "step": 20260 }, { "epoch": 1.17, "grad_norm": 23.309608459472656, "learning_rate": 0.0006106660474476566, "logits/chosen": -17.55304527282715, "logits/rejected": -18.032108306884766, "logps/chosen": -2847.683837890625, "logps/rejected": -2669.450927734375, "loss": 24.6022, "rewards/accuracies": 0.5, "rewards/chosen": -182.34762573242188, "rewards/margins": -17.55270767211914, "rewards/rejected": -164.794921875, "step": 20270 }, { "epoch": 1.17, "grad_norm": 0.02450907602906227, "learning_rate": 0.0006104725415070243, "logits/chosen": -15.923624992370605, "logits/rejected": -16.03415870666504, "logps/chosen": -2463.193603515625, "logps/rejected": -2515.3466796875, "loss": 2.5425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.1979217529297, "rewards/margins": 7.610678672790527, "rewards/rejected": -151.80862426757812, "step": 20280 }, { "epoch": 1.17, "grad_norm": 99.29967498779297, "learning_rate": 0.0006102790355663919, "logits/chosen": -14.00629711151123, "logits/rejected": -13.91053581237793, "logps/chosen": -2482.18896484375, "logps/rejected": -2636.169189453125, "loss": 6.3845, "rewards/accuracies": 0.5, "rewards/chosen": -127.15281677246094, "rewards/margins": 10.534791946411133, "rewards/rejected": -137.68760681152344, "step": 20290 }, { "epoch": 1.18, "grad_norm": 368.4283752441406, "learning_rate": 0.0006100855296257595, "logits/chosen": -13.94196891784668, "logits/rejected": -14.345159530639648, "logps/chosen": -2532.34619140625, "logps/rejected": -2634.89892578125, "loss": 3.6289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.140625, "rewards/margins": 15.305740356445312, "rewards/rejected": -124.44636535644531, "step": 20300 }, { "epoch": 1.18, "grad_norm": 0.005488655995577574, "learning_rate": 0.0006098920236851272, "logits/chosen": -14.184576034545898, "logits/rejected": -14.466160774230957, "logps/chosen": -2613.32666015625, "logps/rejected": -2581.485107421875, "loss": 7.8007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.63763427734375, "rewards/margins": 4.716236114501953, "rewards/rejected": -155.35386657714844, "step": 20310 }, { "epoch": 1.18, "grad_norm": 3.451641896390356e-05, "learning_rate": 0.0006096985177444948, "logits/chosen": -14.848527908325195, "logits/rejected": -14.979507446289062, "logps/chosen": -2317.21533203125, "logps/rejected": -2270.646484375, "loss": 15.7633, "rewards/accuracies": 0.5, "rewards/chosen": -149.57728576660156, "rewards/margins": -11.373104095458984, "rewards/rejected": -138.2041778564453, "step": 20320 }, { "epoch": 1.18, "grad_norm": 0.0, "learning_rate": 0.0006095050118038625, "logits/chosen": -12.825784683227539, "logits/rejected": -12.625944137573242, "logps/chosen": -3008.97265625, "logps/rejected": -2581.166748046875, "loss": 8.1975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -112.37947845458984, "rewards/margins": 11.739474296569824, "rewards/rejected": -124.11894226074219, "step": 20330 }, { "epoch": 1.18, "grad_norm": 4.108738037858591e-18, "learning_rate": 0.0006093115058632301, "logits/chosen": -14.466525077819824, "logits/rejected": -14.83228588104248, "logps/chosen": -2779.436767578125, "logps/rejected": -2871.136474609375, "loss": 3.253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.69070434570312, "rewards/margins": 5.375007152557373, "rewards/rejected": -138.06570434570312, "step": 20340 }, { "epoch": 1.18, "grad_norm": 124.64979553222656, "learning_rate": 0.0006091179999225977, "logits/chosen": -14.415120124816895, "logits/rejected": -14.138415336608887, "logps/chosen": -2946.731201171875, "logps/rejected": -2694.539794921875, "loss": 24.1751, "rewards/accuracies": 0.5, "rewards/chosen": -155.3147735595703, "rewards/margins": -10.138100624084473, "rewards/rejected": -145.1766815185547, "step": 20350 }, { "epoch": 1.18, "grad_norm": 2.850882474515726e-22, "learning_rate": 0.0006089244939819653, "logits/chosen": -14.244474411010742, "logits/rejected": -14.76732063293457, "logps/chosen": -2953.82177734375, "logps/rejected": -2622.37158203125, "loss": 0.8205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -175.79627990722656, "rewards/margins": 25.520145416259766, "rewards/rejected": -201.31643676757812, "step": 20360 }, { "epoch": 1.18, "grad_norm": 0.5408190488815308, "learning_rate": 0.0006087309880413329, "logits/chosen": -16.026487350463867, "logits/rejected": -15.858613967895508, "logps/chosen": -2439.90869140625, "logps/rejected": -2286.325439453125, "loss": 13.8493, "rewards/accuracies": 0.5, "rewards/chosen": -181.373291015625, "rewards/margins": -8.963744163513184, "rewards/rejected": -172.40956115722656, "step": 20370 }, { "epoch": 1.18, "grad_norm": 1.3820607591696898e-06, "learning_rate": 0.0006085374821007004, "logits/chosen": -12.399168968200684, "logits/rejected": -12.63713264465332, "logps/chosen": -2305.125732421875, "logps/rejected": -2294.802490234375, "loss": 7.9917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.56986236572266, "rewards/margins": -2.2276711463928223, "rewards/rejected": -106.3421859741211, "step": 20380 }, { "epoch": 1.18, "grad_norm": 0.3975921869277954, "learning_rate": 0.000608343976160068, "logits/chosen": -14.740669250488281, "logits/rejected": -14.967275619506836, "logps/chosen": -2630.77392578125, "logps/rejected": -2396.66162109375, "loss": 7.1789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.2316436767578, "rewards/margins": 13.15771770477295, "rewards/rejected": -207.3893585205078, "step": 20390 }, { "epoch": 1.18, "grad_norm": 79.04006958007812, "learning_rate": 0.0006081504702194357, "logits/chosen": -15.598482131958008, "logits/rejected": -15.760541915893555, "logps/chosen": -2415.22265625, "logps/rejected": -2371.97216796875, "loss": 5.1733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.87571716308594, "rewards/margins": 8.261999130249023, "rewards/rejected": -177.13772583007812, "step": 20400 }, { "epoch": 1.18, "grad_norm": 5.8794426918029785, "learning_rate": 0.0006079569642788034, "logits/chosen": -14.442916870117188, "logits/rejected": -14.811723709106445, "logps/chosen": -2565.7685546875, "logps/rejected": -2560.43505859375, "loss": 3.8101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -102.9447250366211, "rewards/margins": 14.286497116088867, "rewards/rejected": -117.2312240600586, "step": 20410 }, { "epoch": 1.18, "grad_norm": 112.20378875732422, "learning_rate": 0.000607763458338171, "logits/chosen": -14.132705688476562, "logits/rejected": -14.320155143737793, "logps/chosen": -2909.8388671875, "logps/rejected": -2915.369384765625, "loss": 9.018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.49383544921875, "rewards/margins": -0.21378478407859802, "rewards/rejected": -190.28005981445312, "step": 20420 }, { "epoch": 1.18, "grad_norm": 49.972171783447266, "learning_rate": 0.0006075699523975386, "logits/chosen": -13.772947311401367, "logits/rejected": -14.416071891784668, "logps/chosen": -3013.033935546875, "logps/rejected": -2449.635009765625, "loss": 5.292, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.3426971435547, "rewards/margins": 5.196287631988525, "rewards/rejected": -161.53897094726562, "step": 20430 }, { "epoch": 1.18, "grad_norm": 3.904823824996129e-05, "learning_rate": 0.0006073764464569062, "logits/chosen": -15.168660163879395, "logits/rejected": -15.372461318969727, "logps/chosen": -2530.102294921875, "logps/rejected": -2444.021484375, "loss": 4.0368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.9924774169922, "rewards/margins": 21.95795249938965, "rewards/rejected": -163.950439453125, "step": 20440 }, { "epoch": 1.18, "grad_norm": 75.86434936523438, "learning_rate": 0.0006071829405162739, "logits/chosen": -14.970492362976074, "logits/rejected": -14.820920944213867, "logps/chosen": -2684.509765625, "logps/rejected": -2547.353515625, "loss": 6.1595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -161.02133178710938, "rewards/margins": 3.7745602130889893, "rewards/rejected": -164.7958984375, "step": 20450 }, { "epoch": 1.18, "grad_norm": 2.4375291118872155e-08, "learning_rate": 0.0006069894345756415, "logits/chosen": -13.469320297241211, "logits/rejected": -13.57164192199707, "logps/chosen": -2456.550537109375, "logps/rejected": -2324.421630859375, "loss": 4.9718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -108.71730041503906, "rewards/margins": 0.666333019733429, "rewards/rejected": -109.38362121582031, "step": 20460 }, { "epoch": 1.18, "grad_norm": 149.34298706054688, "learning_rate": 0.0006067959286350091, "logits/chosen": -14.370013236999512, "logits/rejected": -14.693077087402344, "logps/chosen": -2578.72021484375, "logps/rejected": -2403.39599609375, "loss": 17.5064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.97830963134766, "rewards/margins": -5.260636329650879, "rewards/rejected": -98.71766662597656, "step": 20470 }, { "epoch": 1.19, "grad_norm": 20.56548500061035, "learning_rate": 0.0006066024226943767, "logits/chosen": -15.31591510772705, "logits/rejected": -16.134845733642578, "logps/chosen": -3339.61962890625, "logps/rejected": -2705.360595703125, "loss": 0.1709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -181.16336059570312, "rewards/margins": 21.045930862426758, "rewards/rejected": -202.2092742919922, "step": 20480 }, { "epoch": 1.19, "grad_norm": 3.9098613235921675e-09, "learning_rate": 0.0006064089167537443, "logits/chosen": -18.11231231689453, "logits/rejected": -18.984594345092773, "logps/chosen": -2867.3203125, "logps/rejected": -2496.274169921875, "loss": 3.1528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.69778442382812, "rewards/margins": 17.56821632385254, "rewards/rejected": -188.26602172851562, "step": 20490 }, { "epoch": 1.19, "grad_norm": 0.015430997125804424, "learning_rate": 0.0006062154108131119, "logits/chosen": -14.75244426727295, "logits/rejected": -15.084246635437012, "logps/chosen": -2625.98876953125, "logps/rejected": -2515.595458984375, "loss": 1.1337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -116.9081039428711, "rewards/margins": 13.562515258789062, "rewards/rejected": -130.47061157226562, "step": 20500 }, { "epoch": 1.19, "grad_norm": 2.063068276327365e-13, "learning_rate": 0.0006060219048724796, "logits/chosen": -15.758123397827148, "logits/rejected": -15.901809692382812, "logps/chosen": -2367.231201171875, "logps/rejected": -2120.515869140625, "loss": 4.1925, "rewards/accuracies": 0.5, "rewards/chosen": -134.92153930664062, "rewards/margins": 7.273937225341797, "rewards/rejected": -142.1954803466797, "step": 20510 }, { "epoch": 1.19, "grad_norm": 69.88970947265625, "learning_rate": 0.0006058283989318473, "logits/chosen": -14.541964530944824, "logits/rejected": -15.22307014465332, "logps/chosen": -2736.585205078125, "logps/rejected": -2527.490966796875, "loss": 2.3979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -124.3523941040039, "rewards/margins": 8.103612899780273, "rewards/rejected": -132.45599365234375, "step": 20520 }, { "epoch": 1.19, "grad_norm": 1.4920034345777822e-06, "learning_rate": 0.0006056348929912149, "logits/chosen": -13.861211776733398, "logits/rejected": -14.394048690795898, "logps/chosen": -2783.00634765625, "logps/rejected": -2436.39306640625, "loss": 4.7363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -95.77287292480469, "rewards/margins": 24.79384422302246, "rewards/rejected": -120.56672668457031, "step": 20530 }, { "epoch": 1.19, "grad_norm": 47.44184875488281, "learning_rate": 0.0006054413870505825, "logits/chosen": -16.048328399658203, "logits/rejected": -16.69339370727539, "logps/chosen": -2540.300048828125, "logps/rejected": -2142.21826171875, "loss": 15.9432, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -127.14017486572266, "rewards/margins": -6.2005181312561035, "rewards/rejected": -120.93965911865234, "step": 20540 }, { "epoch": 1.19, "grad_norm": 1.9348397254943848, "learning_rate": 0.0006052478811099501, "logits/chosen": -15.30384349822998, "logits/rejected": -15.852903366088867, "logps/chosen": -2272.591796875, "logps/rejected": -2227.09765625, "loss": 17.7242, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -132.99789428710938, "rewards/margins": -7.5629730224609375, "rewards/rejected": -125.4349365234375, "step": 20550 }, { "epoch": 1.19, "grad_norm": 33.49622344970703, "learning_rate": 0.0006050543751693178, "logits/chosen": -15.252583503723145, "logits/rejected": -15.152257919311523, "logps/chosen": -2296.975830078125, "logps/rejected": -2421.36767578125, "loss": 10.2828, "rewards/accuracies": 0.5, "rewards/chosen": -136.92156982421875, "rewards/margins": -5.498383045196533, "rewards/rejected": -131.4231719970703, "step": 20560 }, { "epoch": 1.19, "grad_norm": 112.92533874511719, "learning_rate": 0.0006048608692286854, "logits/chosen": -13.930088996887207, "logits/rejected": -14.689321517944336, "logps/chosen": -3076.81591796875, "logps/rejected": -2342.021484375, "loss": 18.6758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.2140884399414, "rewards/margins": -10.415746688842773, "rewards/rejected": -110.79832458496094, "step": 20570 }, { "epoch": 1.19, "grad_norm": 1.70345000327643e-08, "learning_rate": 0.000604667363288053, "logits/chosen": -19.128347396850586, "logits/rejected": -20.174726486206055, "logps/chosen": -2622.86572265625, "logps/rejected": -2529.31005859375, "loss": 6.5281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -196.453125, "rewards/margins": 4.625092506408691, "rewards/rejected": -201.07823181152344, "step": 20580 }, { "epoch": 1.19, "grad_norm": 1.0372163217198249e-07, "learning_rate": 0.0006044738573474205, "logits/chosen": -13.253071784973145, "logits/rejected": -13.307286262512207, "logps/chosen": -2814.56103515625, "logps/rejected": -2442.787109375, "loss": 11.9898, "rewards/accuracies": 0.5, "rewards/chosen": -139.14523315429688, "rewards/margins": -1.7481266260147095, "rewards/rejected": -137.39710998535156, "step": 20590 }, { "epoch": 1.19, "grad_norm": 0.008240093477070332, "learning_rate": 0.0006042803514067881, "logits/chosen": -13.757902145385742, "logits/rejected": -14.307188034057617, "logps/chosen": -2549.01318359375, "logps/rejected": -2342.6259765625, "loss": 1.2788, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -107.48941802978516, "rewards/margins": 16.231809616088867, "rewards/rejected": -123.72122955322266, "step": 20600 }, { "epoch": 1.19, "grad_norm": 0.23205772042274475, "learning_rate": 0.0006040868454661557, "logits/chosen": -15.068263053894043, "logits/rejected": -15.469927787780762, "logps/chosen": -2741.863037109375, "logps/rejected": -2524.142578125, "loss": 2.3882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.59230041503906, "rewards/margins": 6.760870456695557, "rewards/rejected": -115.35316467285156, "step": 20610 }, { "epoch": 1.19, "grad_norm": 2.0743587003538375e-15, "learning_rate": 0.0006038933395255235, "logits/chosen": -15.279583930969238, "logits/rejected": -16.02867889404297, "logps/chosen": -2610.51806640625, "logps/rejected": -2234.245361328125, "loss": 2.6152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.41256713867188, "rewards/margins": 16.780628204345703, "rewards/rejected": -165.19320678710938, "step": 20620 }, { "epoch": 1.19, "grad_norm": 1.971444785198173e-09, "learning_rate": 0.0006036998335848911, "logits/chosen": -14.975679397583008, "logits/rejected": -15.58317756652832, "logps/chosen": -3122.20068359375, "logps/rejected": -3084.10986328125, "loss": 6.1498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.310791015625, "rewards/margins": 1.7257496118545532, "rewards/rejected": -135.03656005859375, "step": 20630 }, { "epoch": 1.19, "grad_norm": 80.2703857421875, "learning_rate": 0.0006035063276442587, "logits/chosen": -17.671194076538086, "logits/rejected": -18.750694274902344, "logps/chosen": -2731.361083984375, "logps/rejected": -2688.186279296875, "loss": 4.515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -198.4969940185547, "rewards/margins": 16.10757064819336, "rewards/rejected": -214.6045684814453, "step": 20640 }, { "epoch": 1.2, "grad_norm": 10.518254280090332, "learning_rate": 0.0006033128217036263, "logits/chosen": -17.18172836303711, "logits/rejected": -16.152942657470703, "logps/chosen": -2712.503662109375, "logps/rejected": -3007.982421875, "loss": 3.4754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.3842010498047, "rewards/margins": 29.758560180664062, "rewards/rejected": -165.14276123046875, "step": 20650 }, { "epoch": 1.2, "grad_norm": 0.003143050940707326, "learning_rate": 0.0006031193157629939, "logits/chosen": -18.556324005126953, "logits/rejected": -20.022871017456055, "logps/chosen": -2616.6279296875, "logps/rejected": -2497.794677734375, "loss": 13.4126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.9849853515625, "rewards/margins": -7.935675144195557, "rewards/rejected": -194.04931640625, "step": 20660 }, { "epoch": 1.2, "grad_norm": 133.7657470703125, "learning_rate": 0.0006029258098223615, "logits/chosen": -12.930903434753418, "logits/rejected": -12.865324020385742, "logps/chosen": -2341.67919921875, "logps/rejected": -2533.707763671875, "loss": 4.9424, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -58.83948516845703, "rewards/margins": -0.6962593197822571, "rewards/rejected": -58.143226623535156, "step": 20670 }, { "epoch": 1.2, "grad_norm": 81.6644287109375, "learning_rate": 0.0006027323038817292, "logits/chosen": -18.69537353515625, "logits/rejected": -20.074071884155273, "logps/chosen": -2598.432861328125, "logps/rejected": -2218.85498046875, "loss": 15.6518, "rewards/accuracies": 0.5, "rewards/chosen": -202.05801391601562, "rewards/margins": -5.560763359069824, "rewards/rejected": -196.49728393554688, "step": 20680 }, { "epoch": 1.2, "grad_norm": 88.07842254638672, "learning_rate": 0.0006025387979410968, "logits/chosen": -16.45220947265625, "logits/rejected": -16.5891170501709, "logps/chosen": -2992.4228515625, "logps/rejected": -2717.290771484375, "loss": 9.8853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.3934783935547, "rewards/margins": 0.7215240597724915, "rewards/rejected": -145.11502075195312, "step": 20690 }, { "epoch": 1.2, "grad_norm": 10.542865753173828, "learning_rate": 0.0006023452920004644, "logits/chosen": -17.78722381591797, "logits/rejected": -18.17084312438965, "logps/chosen": -2472.903076171875, "logps/rejected": -2383.304443359375, "loss": 10.8005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.84976196289062, "rewards/margins": 2.153977155685425, "rewards/rejected": -183.00375366210938, "step": 20700 }, { "epoch": 1.2, "grad_norm": 0.0, "learning_rate": 0.000602151786059832, "logits/chosen": -17.387914657592773, "logits/rejected": -17.35373306274414, "logps/chosen": -2747.6953125, "logps/rejected": -2668.7265625, "loss": 2.0697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -164.95103454589844, "rewards/margins": 15.563133239746094, "rewards/rejected": -180.51419067382812, "step": 20710 }, { "epoch": 1.2, "grad_norm": 25.657442092895508, "learning_rate": 0.0006019582801191996, "logits/chosen": -17.042388916015625, "logits/rejected": -17.245948791503906, "logps/chosen": -2597.32470703125, "logps/rejected": -2559.66650390625, "loss": 5.1156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.92930603027344, "rewards/margins": 1.4012298583984375, "rewards/rejected": -189.33053588867188, "step": 20720 }, { "epoch": 1.2, "grad_norm": 4.561028958960378e-07, "learning_rate": 0.0006017647741785674, "logits/chosen": -15.640609741210938, "logits/rejected": -15.946271896362305, "logps/chosen": -2715.153564453125, "logps/rejected": -2440.256103515625, "loss": 20.2071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.72239685058594, "rewards/margins": -8.519149780273438, "rewards/rejected": -150.2032470703125, "step": 20730 }, { "epoch": 1.2, "grad_norm": 2.585575930424966e-05, "learning_rate": 0.000601571268237935, "logits/chosen": -18.640148162841797, "logits/rejected": -20.38991928100586, "logps/chosen": -2675.081787109375, "logps/rejected": -2648.36572265625, "loss": 3.561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -169.44834899902344, "rewards/margins": 18.452342987060547, "rewards/rejected": -187.90069580078125, "step": 20740 }, { "epoch": 1.2, "grad_norm": 62.59940719604492, "learning_rate": 0.0006013777622973026, "logits/chosen": -16.709814071655273, "logits/rejected": -17.175140380859375, "logps/chosen": -2838.393798828125, "logps/rejected": -2652.7998046875, "loss": 4.1807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -101.56647491455078, "rewards/margins": 14.704302787780762, "rewards/rejected": -116.27079010009766, "step": 20750 }, { "epoch": 1.2, "grad_norm": 0.0002790464786812663, "learning_rate": 0.0006011842563566702, "logits/chosen": -14.917645454406738, "logits/rejected": -15.613685607910156, "logps/chosen": -2596.13525390625, "logps/rejected": -2533.681884765625, "loss": 2.7567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -115.18443298339844, "rewards/margins": 14.591962814331055, "rewards/rejected": -129.77639770507812, "step": 20760 }, { "epoch": 1.2, "grad_norm": 8.249157920358575e-19, "learning_rate": 0.0006009907504160378, "logits/chosen": -14.969663619995117, "logits/rejected": -15.65429401397705, "logps/chosen": -2866.093994140625, "logps/rejected": -2577.847412109375, "loss": 8.2369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.94686126708984, "rewards/margins": 20.53862953186035, "rewards/rejected": -123.48548889160156, "step": 20770 }, { "epoch": 1.2, "grad_norm": 7.793120038179779e-10, "learning_rate": 0.0006007972444754054, "logits/chosen": -15.641677856445312, "logits/rejected": -15.788839340209961, "logps/chosen": -2570.2646484375, "logps/rejected": -2612.224853515625, "loss": 10.7346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.77291870117188, "rewards/margins": 12.849154472351074, "rewards/rejected": -155.6220703125, "step": 20780 }, { "epoch": 1.2, "grad_norm": 0.3643394112586975, "learning_rate": 0.0006006037385347731, "logits/chosen": -17.869674682617188, "logits/rejected": -19.574535369873047, "logps/chosen": -2761.6982421875, "logps/rejected": -2667.348876953125, "loss": 20.26, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.44168090820312, "rewards/margins": 6.438597679138184, "rewards/rejected": -157.88027954101562, "step": 20790 }, { "epoch": 1.2, "grad_norm": 0.8297719359397888, "learning_rate": 0.0006004102325941407, "logits/chosen": -18.780109405517578, "logits/rejected": -19.596012115478516, "logps/chosen": -2302.08642578125, "logps/rejected": -2154.681640625, "loss": 20.1109, "rewards/accuracies": 0.5, "rewards/chosen": -207.1601104736328, "rewards/margins": -10.984359741210938, "rewards/rejected": -196.17575073242188, "step": 20800 }, { "epoch": 1.2, "grad_norm": 52.86249542236328, "learning_rate": 0.0006002167266535082, "logits/chosen": -15.994955062866211, "logits/rejected": -16.226757049560547, "logps/chosen": -2634.907470703125, "logps/rejected": -2246.637451171875, "loss": 21.429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.76747131347656, "rewards/margins": -18.779321670532227, "rewards/rejected": -110.98814392089844, "step": 20810 }, { "epoch": 1.21, "grad_norm": 0.0, "learning_rate": 0.0006000232207128758, "logits/chosen": -14.19511604309082, "logits/rejected": -15.750493049621582, "logps/chosen": -2851.3486328125, "logps/rejected": -2709.4638671875, "loss": 8.1081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -133.3219451904297, "rewards/margins": 19.29287338256836, "rewards/rejected": -152.6148223876953, "step": 20820 }, { "epoch": 1.21, "grad_norm": 129.49935913085938, "learning_rate": 0.0005998297147722435, "logits/chosen": -15.069279670715332, "logits/rejected": -16.95928192138672, "logps/chosen": -2569.05615234375, "logps/rejected": -2216.99072265625, "loss": 9.8696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.76181030273438, "rewards/margins": 1.9745391607284546, "rewards/rejected": -146.73634338378906, "step": 20830 }, { "epoch": 1.21, "grad_norm": 13.006842613220215, "learning_rate": 0.0005996362088316111, "logits/chosen": -19.748199462890625, "logits/rejected": -21.551902770996094, "logps/chosen": -2218.139892578125, "logps/rejected": -2069.8388671875, "loss": 14.4189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -125.67767333984375, "rewards/margins": -2.6743998527526855, "rewards/rejected": -123.00328063964844, "step": 20840 }, { "epoch": 1.21, "grad_norm": 0.012210004031658173, "learning_rate": 0.0005994427028909788, "logits/chosen": -16.92049789428711, "logits/rejected": -17.392925262451172, "logps/chosen": -2495.0791015625, "logps/rejected": -2256.12939453125, "loss": 6.516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.8045654296875, "rewards/margins": 13.838678359985352, "rewards/rejected": -137.64324951171875, "step": 20850 }, { "epoch": 1.21, "grad_norm": 2.6387232242991443e-12, "learning_rate": 0.0005992491969503464, "logits/chosen": -16.939868927001953, "logits/rejected": -17.019067764282227, "logps/chosen": -2773.19677734375, "logps/rejected": -2642.696044921875, "loss": 7.7318, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -164.60507202148438, "rewards/margins": 3.164492130279541, "rewards/rejected": -167.76956176757812, "step": 20860 }, { "epoch": 1.21, "grad_norm": 2.2910635379957967e-06, "learning_rate": 0.000599055691009714, "logits/chosen": -18.74062156677246, "logits/rejected": -20.248750686645508, "logps/chosen": -2898.610595703125, "logps/rejected": -2747.166015625, "loss": 2.6951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -211.931884765625, "rewards/margins": 10.38691520690918, "rewards/rejected": -222.3188018798828, "step": 20870 }, { "epoch": 1.21, "grad_norm": 1.2717446968665896e-18, "learning_rate": 0.0005988621850690816, "logits/chosen": -12.469688415527344, "logits/rejected": -12.930102348327637, "logps/chosen": -2850.755615234375, "logps/rejected": -2811.70068359375, "loss": 8.2986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -61.045982360839844, "rewards/margins": 8.098901748657227, "rewards/rejected": -69.14488220214844, "step": 20880 }, { "epoch": 1.21, "grad_norm": 0.001004837336950004, "learning_rate": 0.0005986686791284492, "logits/chosen": -15.271380424499512, "logits/rejected": -17.3074951171875, "logps/chosen": -3114.665283203125, "logps/rejected": -2749.60986328125, "loss": 10.0425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.22522735595703, "rewards/margins": 5.952467441558838, "rewards/rejected": -94.17770385742188, "step": 20890 }, { "epoch": 1.21, "grad_norm": 142.10531616210938, "learning_rate": 0.0005984751731878168, "logits/chosen": -20.686216354370117, "logits/rejected": -23.06389617919922, "logps/chosen": -2869.39013671875, "logps/rejected": -2545.478759765625, "loss": 7.7668, "rewards/accuracies": 0.5, "rewards/chosen": -175.30702209472656, "rewards/margins": 3.14469838142395, "rewards/rejected": -178.4517364501953, "step": 20900 }, { "epoch": 1.21, "grad_norm": 0.28078392148017883, "learning_rate": 0.0005982816672471845, "logits/chosen": -23.22141456604004, "logits/rejected": -23.023027420043945, "logps/chosen": -2597.582275390625, "logps/rejected": -2465.020751953125, "loss": 5.0117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.3900909423828, "rewards/margins": 3.3505985736846924, "rewards/rejected": -217.74069213867188, "step": 20910 }, { "epoch": 1.21, "grad_norm": 76.66239929199219, "learning_rate": 0.0005980881613065521, "logits/chosen": -20.87324333190918, "logits/rejected": -25.0755672454834, "logps/chosen": -2830.158935546875, "logps/rejected": -2734.0966796875, "loss": 3.1182, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.8379364013672, "rewards/margins": 12.363790512084961, "rewards/rejected": -213.2017364501953, "step": 20920 }, { "epoch": 1.21, "grad_norm": 23.79494285583496, "learning_rate": 0.0005978946553659197, "logits/chosen": -17.001354217529297, "logits/rejected": -17.805580139160156, "logps/chosen": -2589.57861328125, "logps/rejected": -2634.295654296875, "loss": 3.3618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.11805725097656, "rewards/margins": 10.628253936767578, "rewards/rejected": -201.74627685546875, "step": 20930 }, { "epoch": 1.21, "grad_norm": 0.00026487145805731416, "learning_rate": 0.0005977011494252874, "logits/chosen": -14.440286636352539, "logits/rejected": -15.949902534484863, "logps/chosen": -2620.16943359375, "logps/rejected": -2197.69921875, "loss": 16.4655, "rewards/accuracies": 0.5, "rewards/chosen": -149.0666961669922, "rewards/margins": -1.9666671752929688, "rewards/rejected": -147.1000213623047, "step": 20940 }, { "epoch": 1.21, "grad_norm": 217.88685607910156, "learning_rate": 0.000597507643484655, "logits/chosen": -15.359268188476562, "logits/rejected": -16.153799057006836, "logps/chosen": -2639.208740234375, "logps/rejected": -2796.265625, "loss": 3.6651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -130.26007080078125, "rewards/margins": 8.613520622253418, "rewards/rejected": -138.8735809326172, "step": 20950 }, { "epoch": 1.21, "grad_norm": 0.0, "learning_rate": 0.0005973141375440227, "logits/chosen": -16.486679077148438, "logits/rejected": -17.10340118408203, "logps/chosen": -2564.02783203125, "logps/rejected": -2580.931884765625, "loss": 1.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.74200439453125, "rewards/margins": 20.51175880432129, "rewards/rejected": -175.25375366210938, "step": 20960 }, { "epoch": 1.21, "grad_norm": 29.433656692504883, "learning_rate": 0.0005971206316033903, "logits/chosen": -16.4981746673584, "logits/rejected": -16.711902618408203, "logps/chosen": -2414.94970703125, "logps/rejected": -2401.95849609375, "loss": 4.5612, "rewards/accuracies": 0.5, "rewards/chosen": -158.82101440429688, "rewards/margins": 11.187220573425293, "rewards/rejected": -170.00820922851562, "step": 20970 }, { "epoch": 1.21, "grad_norm": 5.543584823608398, "learning_rate": 0.0005969271256627579, "logits/chosen": -14.712359428405762, "logits/rejected": -15.431175231933594, "logps/chosen": -2454.498046875, "logps/rejected": -2550.236328125, "loss": 1.3783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -172.79885864257812, "rewards/margins": 13.666460990905762, "rewards/rejected": -186.46530151367188, "step": 20980 }, { "epoch": 1.21, "grad_norm": 0.006000409834086895, "learning_rate": 0.0005967336197221255, "logits/chosen": -13.712587356567383, "logits/rejected": -15.007894515991211, "logps/chosen": -2449.392578125, "logps/rejected": -2361.788818359375, "loss": 10.4852, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -130.2833251953125, "rewards/margins": -0.9372550845146179, "rewards/rejected": -129.3460693359375, "step": 20990 }, { "epoch": 1.22, "grad_norm": 127.15393829345703, "learning_rate": 0.0005965401137814931, "logits/chosen": -13.750042915344238, "logits/rejected": -14.08752155303955, "logps/chosen": -2603.41748046875, "logps/rejected": -2502.06884765625, "loss": 16.9188, "rewards/accuracies": 0.5, "rewards/chosen": -87.78388214111328, "rewards/margins": -11.027361869812012, "rewards/rejected": -76.75651550292969, "step": 21000 }, { "epoch": 1.22, "grad_norm": 88.13213348388672, "learning_rate": 0.0005963466078408607, "logits/chosen": -14.995244979858398, "logits/rejected": -15.598089218139648, "logps/chosen": -2556.8994140625, "logps/rejected": -2527.705322265625, "loss": 10.2915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.6508331298828, "rewards/margins": 4.4664130210876465, "rewards/rejected": -150.11724853515625, "step": 21010 }, { "epoch": 1.22, "grad_norm": 95.1794662475586, "learning_rate": 0.0005961531019002284, "logits/chosen": -15.676465034484863, "logits/rejected": -15.073575973510742, "logps/chosen": -2520.99853515625, "logps/rejected": -2352.95068359375, "loss": 8.6947, "rewards/accuracies": 0.5, "rewards/chosen": -167.25955200195312, "rewards/margins": -1.212781548500061, "rewards/rejected": -166.0467529296875, "step": 21020 }, { "epoch": 1.22, "grad_norm": 3.642381264223199e-14, "learning_rate": 0.0005959595959595959, "logits/chosen": -12.828814506530762, "logits/rejected": -13.199048042297363, "logps/chosen": -2942.005126953125, "logps/rejected": -2603.383056640625, "loss": 15.1994, "rewards/accuracies": 0.5, "rewards/chosen": -115.8792724609375, "rewards/margins": -6.154016017913818, "rewards/rejected": -109.7252426147461, "step": 21030 }, { "epoch": 1.22, "grad_norm": 91.34806823730469, "learning_rate": 0.0005957660900189636, "logits/chosen": -13.585886001586914, "logits/rejected": -13.41075611114502, "logps/chosen": -2997.98681640625, "logps/rejected": -2975.72412109375, "loss": 12.9197, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -161.28201293945312, "rewards/margins": -7.2050018310546875, "rewards/rejected": -154.07699584960938, "step": 21040 }, { "epoch": 1.22, "grad_norm": 0.011490201577544212, "learning_rate": 0.0005955725840783312, "logits/chosen": -14.926307678222656, "logits/rejected": -15.805097579956055, "logps/chosen": -2782.051025390625, "logps/rejected": -2338.012451171875, "loss": 14.8726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.75723266601562, "rewards/margins": 0.5189399719238281, "rewards/rejected": -166.27618408203125, "step": 21050 }, { "epoch": 1.22, "grad_norm": 37.801612854003906, "learning_rate": 0.0005953790781376988, "logits/chosen": -12.506010055541992, "logits/rejected": -12.825662612915039, "logps/chosen": -3268.455322265625, "logps/rejected": -2902.55908203125, "loss": 14.2547, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -127.06083679199219, "rewards/margins": -12.310929298400879, "rewards/rejected": -114.74992370605469, "step": 21060 }, { "epoch": 1.22, "grad_norm": 3.079724606210194e-20, "learning_rate": 0.0005951855721970664, "logits/chosen": -15.635333061218262, "logits/rejected": -16.364559173583984, "logps/chosen": -3085.586181640625, "logps/rejected": -2462.60595703125, "loss": 15.5365, "rewards/accuracies": 0.5, "rewards/chosen": -124.99796295166016, "rewards/margins": -7.220704078674316, "rewards/rejected": -117.77725982666016, "step": 21070 }, { "epoch": 1.22, "grad_norm": 54.860679626464844, "learning_rate": 0.0005949920662564341, "logits/chosen": -19.05609703063965, "logits/rejected": -17.673297882080078, "logps/chosen": -2624.49658203125, "logps/rejected": -2698.69482421875, "loss": 16.121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -162.46304321289062, "rewards/margins": -0.7428905367851257, "rewards/rejected": -161.72015380859375, "step": 21080 }, { "epoch": 1.22, "grad_norm": 9.59953680990111e-08, "learning_rate": 0.0005947985603158017, "logits/chosen": -15.968226432800293, "logits/rejected": -16.189510345458984, "logps/chosen": -2893.56982421875, "logps/rejected": -2489.09521484375, "loss": 10.0642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.77210998535156, "rewards/margins": 3.518672466278076, "rewards/rejected": -189.290771484375, "step": 21090 }, { "epoch": 1.22, "grad_norm": 0.0, "learning_rate": 0.0005946050543751693, "logits/chosen": -14.36616325378418, "logits/rejected": -16.080326080322266, "logps/chosen": -2773.65087890625, "logps/rejected": -2485.0908203125, "loss": 14.5212, "rewards/accuracies": 0.5, "rewards/chosen": -173.5325469970703, "rewards/margins": 2.3884682655334473, "rewards/rejected": -175.92100524902344, "step": 21100 }, { "epoch": 1.22, "grad_norm": 0.1180039644241333, "learning_rate": 0.0005944115484345369, "logits/chosen": -17.04819107055664, "logits/rejected": -17.993900299072266, "logps/chosen": -2466.947021484375, "logps/rejected": -2269.67578125, "loss": 13.6268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -143.82272338867188, "rewards/margins": 2.155200958251953, "rewards/rejected": -145.97792053222656, "step": 21110 }, { "epoch": 1.22, "grad_norm": 0.017348306253552437, "learning_rate": 0.0005942180424939045, "logits/chosen": -14.515643119812012, "logits/rejected": -15.720771789550781, "logps/chosen": -2334.24560546875, "logps/rejected": -2115.5087890625, "loss": 8.8999, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -140.9021453857422, "rewards/margins": -4.948654651641846, "rewards/rejected": -135.9534912109375, "step": 21120 }, { "epoch": 1.22, "grad_norm": 14.784092903137207, "learning_rate": 0.0005940245365532722, "logits/chosen": -17.4881591796875, "logits/rejected": -20.404239654541016, "logps/chosen": -2628.74072265625, "logps/rejected": -2504.65576171875, "loss": 5.0342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.99755859375, "rewards/margins": 1.1104644536972046, "rewards/rejected": -228.10800170898438, "step": 21130 }, { "epoch": 1.22, "grad_norm": 95.16841888427734, "learning_rate": 0.0005938310306126398, "logits/chosen": -13.376063346862793, "logits/rejected": -14.430203437805176, "logps/chosen": -2770.36962890625, "logps/rejected": -2514.471923828125, "loss": 5.081, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.39585876464844, "rewards/margins": 11.845356941223145, "rewards/rejected": -137.24122619628906, "step": 21140 }, { "epoch": 1.22, "grad_norm": 137.41751098632812, "learning_rate": 0.0005936375246720075, "logits/chosen": -14.055444717407227, "logits/rejected": -13.662317276000977, "logps/chosen": -2478.6865234375, "logps/rejected": -2596.43115234375, "loss": 8.8602, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -151.4849853515625, "rewards/margins": 1.5368893146514893, "rewards/rejected": -153.02186584472656, "step": 21150 }, { "epoch": 1.22, "grad_norm": 44.537109375, "learning_rate": 0.0005934440187313751, "logits/chosen": -11.971719741821289, "logits/rejected": -12.052886009216309, "logps/chosen": -2466.46142578125, "logps/rejected": -2460.6162109375, "loss": 12.4565, "rewards/accuracies": 0.5, "rewards/chosen": -149.34027099609375, "rewards/margins": -8.631390571594238, "rewards/rejected": -140.70889282226562, "step": 21160 }, { "epoch": 1.23, "grad_norm": 1.0374234079790767e-05, "learning_rate": 0.0005932505127907427, "logits/chosen": -12.810264587402344, "logits/rejected": -13.169013977050781, "logps/chosen": -2723.45849609375, "logps/rejected": -2546.024169921875, "loss": 4.3448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -122.4815673828125, "rewards/margins": 6.084539890289307, "rewards/rejected": -128.56610107421875, "step": 21170 }, { "epoch": 1.23, "grad_norm": 0.0, "learning_rate": 0.0005930570068501103, "logits/chosen": -14.757534980773926, "logits/rejected": -15.423296928405762, "logps/chosen": -2645.79736328125, "logps/rejected": -2786.01806640625, "loss": 3.1111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.14791870117188, "rewards/margins": 15.59205150604248, "rewards/rejected": -151.73995971679688, "step": 21180 }, { "epoch": 1.23, "grad_norm": 1.0718876686851425e-14, "learning_rate": 0.000592863500909478, "logits/chosen": -16.709178924560547, "logits/rejected": -16.976089477539062, "logps/chosen": -2798.7802734375, "logps/rejected": -2495.299072265625, "loss": 9.3141, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -140.17259216308594, "rewards/margins": -3.367976665496826, "rewards/rejected": -136.8046112060547, "step": 21190 }, { "epoch": 1.23, "grad_norm": 87.10427856445312, "learning_rate": 0.0005926699949688456, "logits/chosen": -15.046918869018555, "logits/rejected": -15.459497451782227, "logps/chosen": -2573.064208984375, "logps/rejected": -2435.981689453125, "loss": 24.7631, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.73883056640625, "rewards/margins": -18.31360626220703, "rewards/rejected": -146.42523193359375, "step": 21200 }, { "epoch": 1.23, "grad_norm": 245.23959350585938, "learning_rate": 0.0005924764890282132, "logits/chosen": -15.594907760620117, "logits/rejected": -15.943598747253418, "logps/chosen": -2774.712646484375, "logps/rejected": -3041.1171875, "loss": 4.5292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -143.84515380859375, "rewards/margins": 11.88848876953125, "rewards/rejected": -155.733642578125, "step": 21210 }, { "epoch": 1.23, "grad_norm": 59.391754150390625, "learning_rate": 0.0005922829830875808, "logits/chosen": -14.238639831542969, "logits/rejected": -14.532464981079102, "logps/chosen": -3026.77197265625, "logps/rejected": -2737.98681640625, "loss": 9.0517, "rewards/accuracies": 0.5, "rewards/chosen": -125.16302490234375, "rewards/margins": 11.922297477722168, "rewards/rejected": -137.0853271484375, "step": 21220 }, { "epoch": 1.23, "grad_norm": 80.42317199707031, "learning_rate": 0.0005920894771469484, "logits/chosen": -13.421455383300781, "logits/rejected": -14.260129928588867, "logps/chosen": -3155.015625, "logps/rejected": -2687.65380859375, "loss": 2.9538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -175.60354614257812, "rewards/margins": 7.6555280685424805, "rewards/rejected": -183.25906372070312, "step": 21230 }, { "epoch": 1.23, "grad_norm": 48.16497039794922, "learning_rate": 0.000591895971206316, "logits/chosen": -13.66277027130127, "logits/rejected": -14.097498893737793, "logps/chosen": -2234.23974609375, "logps/rejected": -2074.717529296875, "loss": 7.295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -106.73587799072266, "rewards/margins": 0.3722241520881653, "rewards/rejected": -107.10810852050781, "step": 21240 }, { "epoch": 1.23, "grad_norm": 0.0, "learning_rate": 0.0005917024652656837, "logits/chosen": -15.589401245117188, "logits/rejected": -15.767694473266602, "logps/chosen": -2559.586669921875, "logps/rejected": -2184.9609375, "loss": 13.2984, "rewards/accuracies": 0.5, "rewards/chosen": -154.1412811279297, "rewards/margins": 1.5269629955291748, "rewards/rejected": -155.6682586669922, "step": 21250 }, { "epoch": 1.23, "grad_norm": 7.844677448272705, "learning_rate": 0.0005915089593250513, "logits/chosen": -15.772770881652832, "logits/rejected": -16.276472091674805, "logps/chosen": -2323.32421875, "logps/rejected": -2406.874755859375, "loss": 1.8585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.0025634765625, "rewards/margins": 17.033138275146484, "rewards/rejected": -188.0356903076172, "step": 21260 }, { "epoch": 1.23, "grad_norm": 1.0832025554297076e-10, "learning_rate": 0.0005913154533844189, "logits/chosen": -14.619836807250977, "logits/rejected": -15.208292007446289, "logps/chosen": -2525.790283203125, "logps/rejected": -2378.104736328125, "loss": 5.2709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.8968505859375, "rewards/margins": 0.6564838290214539, "rewards/rejected": -150.55332946777344, "step": 21270 }, { "epoch": 1.23, "grad_norm": 56.644989013671875, "learning_rate": 0.0005911219474437865, "logits/chosen": -12.365216255187988, "logits/rejected": -12.753055572509766, "logps/chosen": -2642.9755859375, "logps/rejected": -1901.976318359375, "loss": 20.9498, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -111.45637512207031, "rewards/margins": -2.7275214195251465, "rewards/rejected": -108.7288589477539, "step": 21280 }, { "epoch": 1.23, "grad_norm": 1.2572334839866528e-10, "learning_rate": 0.0005909284415031541, "logits/chosen": -16.96882438659668, "logits/rejected": -17.54752540588379, "logps/chosen": -2653.55126953125, "logps/rejected": -2454.6787109375, "loss": 15.1372, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -162.4203338623047, "rewards/margins": -7.661106109619141, "rewards/rejected": -154.75921630859375, "step": 21290 }, { "epoch": 1.23, "grad_norm": 151.88548278808594, "learning_rate": 0.0005907349355625218, "logits/chosen": -14.91362476348877, "logits/rejected": -15.562304496765137, "logps/chosen": -2930.481689453125, "logps/rejected": -2596.908447265625, "loss": 3.1976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.80422973632812, "rewards/margins": 11.713617324829102, "rewards/rejected": -163.517822265625, "step": 21300 }, { "epoch": 1.23, "grad_norm": 54.4367561340332, "learning_rate": 0.0005905414296218894, "logits/chosen": -13.596890449523926, "logits/rejected": -14.626179695129395, "logps/chosen": -2778.107666015625, "logps/rejected": -2661.288818359375, "loss": 7.9983, "rewards/accuracies": 0.5, "rewards/chosen": -81.80716705322266, "rewards/margins": 3.457897186279297, "rewards/rejected": -85.26506042480469, "step": 21310 }, { "epoch": 1.23, "grad_norm": 0.006453727837651968, "learning_rate": 0.000590347923681257, "logits/chosen": -14.314104080200195, "logits/rejected": -15.108636856079102, "logps/chosen": -2568.41064453125, "logps/rejected": -2243.271484375, "loss": 30.6242, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -118.55165100097656, "rewards/margins": -22.929645538330078, "rewards/rejected": -95.62199401855469, "step": 21320 }, { "epoch": 1.23, "grad_norm": 2.2403113541435906e-16, "learning_rate": 0.0005901544177406246, "logits/chosen": -12.312067031860352, "logits/rejected": -13.281786918640137, "logps/chosen": -2703.46826171875, "logps/rejected": -2800.339111328125, "loss": 3.4189, "rewards/accuracies": 0.5, "rewards/chosen": -138.0689697265625, "rewards/margins": 11.142534255981445, "rewards/rejected": -149.21148681640625, "step": 21330 }, { "epoch": 1.24, "grad_norm": 2.6060521918225277e-07, "learning_rate": 0.0005899609117999922, "logits/chosen": -12.268829345703125, "logits/rejected": -12.101603507995605, "logps/chosen": -2600.455810546875, "logps/rejected": -2375.65185546875, "loss": 12.5986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -71.18572235107422, "rewards/margins": 1.2376339435577393, "rewards/rejected": -72.42335510253906, "step": 21340 }, { "epoch": 1.24, "grad_norm": 1.1415989320084918e-05, "learning_rate": 0.0005897674058593598, "logits/chosen": -20.19001579284668, "logits/rejected": -19.63892364501953, "logps/chosen": -2525.962158203125, "logps/rejected": -2540.82861328125, "loss": 3.2741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.61280822753906, "rewards/margins": 0.9617748260498047, "rewards/rejected": -191.57455444335938, "step": 21350 }, { "epoch": 1.24, "grad_norm": 95.73678588867188, "learning_rate": 0.0005895738999187276, "logits/chosen": -17.73758316040039, "logits/rejected": -18.1773681640625, "logps/chosen": -2405.852783203125, "logps/rejected": -2401.338134765625, "loss": 9.9254, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.81423950195312, "rewards/margins": -5.275635719299316, "rewards/rejected": -173.53860473632812, "step": 21360 }, { "epoch": 1.24, "grad_norm": 0.5124104619026184, "learning_rate": 0.0005893803939780952, "logits/chosen": -16.12912368774414, "logits/rejected": -16.405101776123047, "logps/chosen": -3056.9501953125, "logps/rejected": -2704.060546875, "loss": 2.9883, "rewards/accuracies": 0.5, "rewards/chosen": -149.31761169433594, "rewards/margins": 15.012628555297852, "rewards/rejected": -164.33023071289062, "step": 21370 }, { "epoch": 1.24, "grad_norm": 120.13807678222656, "learning_rate": 0.0005891868880374628, "logits/chosen": -12.710107803344727, "logits/rejected": -13.432034492492676, "logps/chosen": -3148.948486328125, "logps/rejected": -2509.493408203125, "loss": 3.3638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -82.5210952758789, "rewards/margins": 5.788854122161865, "rewards/rejected": -88.30994415283203, "step": 21380 }, { "epoch": 1.24, "grad_norm": 4.2372378494099066e-15, "learning_rate": 0.0005889933820968304, "logits/chosen": -15.272847175598145, "logits/rejected": -15.971789360046387, "logps/chosen": -2567.715087890625, "logps/rejected": -2805.525146484375, "loss": 3.8066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.3804168701172, "rewards/margins": 6.640066623687744, "rewards/rejected": -149.02047729492188, "step": 21390 }, { "epoch": 1.24, "grad_norm": 42.79737854003906, "learning_rate": 0.000588799876156198, "logits/chosen": -13.953961372375488, "logits/rejected": -13.055630683898926, "logps/chosen": -2713.3408203125, "logps/rejected": -2375.943115234375, "loss": 17.3747, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -168.48916625976562, "rewards/margins": -3.189215898513794, "rewards/rejected": -165.29995727539062, "step": 21400 }, { "epoch": 1.24, "grad_norm": 27.338590621948242, "learning_rate": 0.0005886063702155657, "logits/chosen": -14.198224067687988, "logits/rejected": -15.284749984741211, "logps/chosen": -2579.10791015625, "logps/rejected": -2202.152587890625, "loss": 4.9556, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -186.2547149658203, "rewards/margins": -0.49846306443214417, "rewards/rejected": -185.75624084472656, "step": 21410 }, { "epoch": 1.24, "grad_norm": 613.3084716796875, "learning_rate": 0.0005884128642749333, "logits/chosen": -16.068927764892578, "logits/rejected": -16.826095581054688, "logps/chosen": -2534.863525390625, "logps/rejected": -2588.194580078125, "loss": 10.5762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -130.0944366455078, "rewards/margins": 4.334963798522949, "rewards/rejected": -134.4293975830078, "step": 21420 }, { "epoch": 1.24, "grad_norm": 17.44706153869629, "learning_rate": 0.0005882193583343009, "logits/chosen": -15.812841415405273, "logits/rejected": -16.114376068115234, "logps/chosen": -2379.509765625, "logps/rejected": -2226.887451171875, "loss": 14.0399, "rewards/accuracies": 0.5, "rewards/chosen": -191.06094360351562, "rewards/margins": -9.70570182800293, "rewards/rejected": -181.35525512695312, "step": 21430 }, { "epoch": 1.24, "grad_norm": 113.33712005615234, "learning_rate": 0.0005880258523936685, "logits/chosen": -15.840171813964844, "logits/rejected": -16.090290069580078, "logps/chosen": -2192.063232421875, "logps/rejected": -2390.37841796875, "loss": 19.8338, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.56101989746094, "rewards/margins": -4.203064918518066, "rewards/rejected": -170.3579559326172, "step": 21440 }, { "epoch": 1.24, "grad_norm": 0.2815874218940735, "learning_rate": 0.0005878323464530361, "logits/chosen": -13.350088119506836, "logits/rejected": -13.316668510437012, "logps/chosen": -2936.970458984375, "logps/rejected": -2691.860595703125, "loss": 3.9048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -120.00019836425781, "rewards/margins": -1.0281431674957275, "rewards/rejected": -118.97203063964844, "step": 21450 }, { "epoch": 1.24, "grad_norm": 21.379247665405273, "learning_rate": 0.0005876388405124038, "logits/chosen": -14.35639762878418, "logits/rejected": -14.75977897644043, "logps/chosen": -2658.272216796875, "logps/rejected": -2406.09814453125, "loss": 21.996, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -184.12570190429688, "rewards/margins": -14.085060119628906, "rewards/rejected": -170.04061889648438, "step": 21460 }, { "epoch": 1.24, "grad_norm": 1.4993739569035824e-05, "learning_rate": 0.0005874453345717714, "logits/chosen": -11.313138008117676, "logits/rejected": -11.41308879852295, "logps/chosen": -3051.1630859375, "logps/rejected": -2736.196533203125, "loss": 10.0603, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.4328842163086, "rewards/margins": -6.331839561462402, "rewards/rejected": -118.10102844238281, "step": 21470 }, { "epoch": 1.24, "grad_norm": 1.5320254564285278, "learning_rate": 0.000587251828631139, "logits/chosen": -14.196293830871582, "logits/rejected": -15.202542304992676, "logps/chosen": -2571.67822265625, "logps/rejected": -2435.284423828125, "loss": 6.807, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -218.07559204101562, "rewards/margins": -3.8651795387268066, "rewards/rejected": -214.21041870117188, "step": 21480 }, { "epoch": 1.24, "grad_norm": 78.51171875, "learning_rate": 0.0005870583226905066, "logits/chosen": -11.423012733459473, "logits/rejected": -11.286439895629883, "logps/chosen": -2890.945556640625, "logps/rejected": -2714.18603515625, "loss": 5.8101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -127.8983154296875, "rewards/margins": 8.996606826782227, "rewards/rejected": -136.89491271972656, "step": 21490 }, { "epoch": 1.24, "grad_norm": 53.71338653564453, "learning_rate": 0.0005868648167498742, "logits/chosen": -13.568575859069824, "logits/rejected": -13.596380233764648, "logps/chosen": -2897.672607421875, "logps/rejected": -2526.119873046875, "loss": 5.2389, "rewards/accuracies": 0.5, "rewards/chosen": -138.60845947265625, "rewards/margins": 6.9583611488342285, "rewards/rejected": -145.5668182373047, "step": 21500 }, { "epoch": 1.25, "grad_norm": 0.0, "learning_rate": 0.0005866713108092418, "logits/chosen": -16.974346160888672, "logits/rejected": -17.19882583618164, "logps/chosen": -2614.16943359375, "logps/rejected": -2312.796142578125, "loss": 23.9024, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.94920349121094, "rewards/margins": -9.102261543273926, "rewards/rejected": -182.84695434570312, "step": 21510 }, { "epoch": 1.25, "grad_norm": 63.883628845214844, "learning_rate": 0.0005864778048686094, "logits/chosen": -17.07447052001953, "logits/rejected": -17.04349708557129, "logps/chosen": -2810.616943359375, "logps/rejected": -2518.22021484375, "loss": 9.7801, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -183.79957580566406, "rewards/margins": -7.2259202003479, "rewards/rejected": -176.57366943359375, "step": 21520 }, { "epoch": 1.25, "grad_norm": 0.01763099804520607, "learning_rate": 0.0005862842989279771, "logits/chosen": -14.991312980651855, "logits/rejected": -15.000508308410645, "logps/chosen": -2369.765625, "logps/rejected": -2405.99267578125, "loss": 10.0701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.98782348632812, "rewards/margins": 9.657453536987305, "rewards/rejected": -174.645263671875, "step": 21530 }, { "epoch": 1.25, "grad_norm": 3.6302735679782927e-06, "learning_rate": 0.0005860907929873447, "logits/chosen": -12.049211502075195, "logits/rejected": -11.96794605255127, "logps/chosen": -2794.70849609375, "logps/rejected": -2517.88037109375, "loss": 6.9358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -105.612060546875, "rewards/margins": 5.867012977600098, "rewards/rejected": -111.47908020019531, "step": 21540 }, { "epoch": 1.25, "grad_norm": 65.2249755859375, "learning_rate": 0.0005858972870467123, "logits/chosen": -14.37951946258545, "logits/rejected": -14.453631401062012, "logps/chosen": -2489.087890625, "logps/rejected": -2509.667724609375, "loss": 3.5075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.86276245117188, "rewards/margins": 2.621330976486206, "rewards/rejected": -188.48410034179688, "step": 21550 }, { "epoch": 1.25, "grad_norm": 51.7250862121582, "learning_rate": 0.0005857037811060799, "logits/chosen": -12.643773078918457, "logits/rejected": -12.663961410522461, "logps/chosen": -2483.063720703125, "logps/rejected": -2720.2822265625, "loss": 7.1137, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -158.37081909179688, "rewards/margins": -1.54290771484375, "rewards/rejected": -156.82791137695312, "step": 21560 }, { "epoch": 1.25, "grad_norm": 6.908279281958585e-09, "learning_rate": 0.0005855102751654476, "logits/chosen": -15.615572929382324, "logits/rejected": -15.582392692565918, "logps/chosen": -2412.539794921875, "logps/rejected": -2447.75244140625, "loss": 2.1024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -209.8338623046875, "rewards/margins": 7.521148681640625, "rewards/rejected": -217.35501098632812, "step": 21570 }, { "epoch": 1.25, "grad_norm": 46.02159118652344, "learning_rate": 0.0005853167692248153, "logits/chosen": -13.356701850891113, "logits/rejected": -13.504429817199707, "logps/chosen": -2856.5107421875, "logps/rejected": -2815.341064453125, "loss": 0.6943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -120.9808120727539, "rewards/margins": 13.206463813781738, "rewards/rejected": -134.18728637695312, "step": 21580 }, { "epoch": 1.25, "grad_norm": 0.0007884618826210499, "learning_rate": 0.0005851232632841829, "logits/chosen": -13.612741470336914, "logits/rejected": -13.435873031616211, "logps/chosen": -2731.572998046875, "logps/rejected": -2677.534912109375, "loss": 17.0574, "rewards/accuracies": 0.5, "rewards/chosen": -111.0789794921875, "rewards/margins": -9.251287460327148, "rewards/rejected": -101.82768249511719, "step": 21590 }, { "epoch": 1.25, "grad_norm": 47.330997467041016, "learning_rate": 0.0005849297573435505, "logits/chosen": -13.171284675598145, "logits/rejected": -13.069633483886719, "logps/chosen": -2727.374267578125, "logps/rejected": -2480.89306640625, "loss": 9.6152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.51321411132812, "rewards/margins": -4.192601203918457, "rewards/rejected": -156.32058715820312, "step": 21600 }, { "epoch": 1.25, "grad_norm": 0.0026138327084481716, "learning_rate": 0.0005847362514029181, "logits/chosen": -13.33680534362793, "logits/rejected": -13.358454704284668, "logps/chosen": -2439.501220703125, "logps/rejected": -2108.489501953125, "loss": 2.6103, "rewards/accuracies": 0.5, "rewards/chosen": -171.9592742919922, "rewards/margins": 6.609208583831787, "rewards/rejected": -178.5684814453125, "step": 21610 }, { "epoch": 1.25, "grad_norm": 179.0256805419922, "learning_rate": 0.0005845427454622857, "logits/chosen": -14.010107040405273, "logits/rejected": -14.106292724609375, "logps/chosen": -2556.660888671875, "logps/rejected": -2720.742431640625, "loss": 8.5776, "rewards/accuracies": 0.5, "rewards/chosen": -136.53713989257812, "rewards/margins": 3.9625961780548096, "rewards/rejected": -140.499755859375, "step": 21620 }, { "epoch": 1.25, "grad_norm": 170.27468872070312, "learning_rate": 0.0005843492395216533, "logits/chosen": -11.252095222473145, "logits/rejected": -11.233105659484863, "logps/chosen": -2601.47705078125, "logps/rejected": -2639.85986328125, "loss": 8.3774, "rewards/accuracies": 0.5, "rewards/chosen": -56.07733154296875, "rewards/margins": -1.4997707605361938, "rewards/rejected": -54.57756423950195, "step": 21630 }, { "epoch": 1.25, "grad_norm": 6.867197726023733e-07, "learning_rate": 0.000584155733581021, "logits/chosen": -16.73027801513672, "logits/rejected": -16.620407104492188, "logps/chosen": -2747.471435546875, "logps/rejected": -2767.690673828125, "loss": 0.5831, "rewards/accuracies": 0.5, "rewards/chosen": -168.6500244140625, "rewards/margins": 6.230801105499268, "rewards/rejected": -174.88082885742188, "step": 21640 }, { "epoch": 1.25, "grad_norm": 41.78843688964844, "learning_rate": 0.0005839622276403886, "logits/chosen": -13.969294548034668, "logits/rejected": -13.847991943359375, "logps/chosen": -2393.999755859375, "logps/rejected": -2495.162353515625, "loss": 3.2374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.64759826660156, "rewards/margins": 19.797916412353516, "rewards/rejected": -192.4455108642578, "step": 21650 }, { "epoch": 1.25, "grad_norm": 11.069995880126953, "learning_rate": 0.0005837687216997562, "logits/chosen": -13.693188667297363, "logits/rejected": -13.829423904418945, "logps/chosen": -2692.855224609375, "logps/rejected": -2618.43603515625, "loss": 12.5626, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -200.7522430419922, "rewards/margins": -7.460015773773193, "rewards/rejected": -193.29220581054688, "step": 21660 }, { "epoch": 1.25, "grad_norm": 1.6535143004148267e-05, "learning_rate": 0.0005835752157591239, "logits/chosen": -13.717369079589844, "logits/rejected": -13.663793563842773, "logps/chosen": -2839.117919921875, "logps/rejected": -2654.865966796875, "loss": 18.8605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.73092651367188, "rewards/margins": -9.659399032592773, "rewards/rejected": -154.071533203125, "step": 21670 }, { "epoch": 1.25, "grad_norm": 0.0001075279651558958, "learning_rate": 0.0005833817098184915, "logits/chosen": -15.672894477844238, "logits/rejected": -15.37617301940918, "logps/chosen": -2808.4345703125, "logps/rejected": -2500.34033203125, "loss": 0.4304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -177.78805541992188, "rewards/margins": 15.749029159545898, "rewards/rejected": -193.53707885742188, "step": 21680 }, { "epoch": 1.26, "grad_norm": 101.93844604492188, "learning_rate": 0.000583188203877859, "logits/chosen": -14.282829284667969, "logits/rejected": -14.423151016235352, "logps/chosen": -2793.90625, "logps/rejected": -2736.815185546875, "loss": 10.031, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -212.76806640625, "rewards/margins": -8.380167961120605, "rewards/rejected": -204.3878936767578, "step": 21690 }, { "epoch": 1.26, "grad_norm": 1.5235068798065186, "learning_rate": 0.0005829946979372267, "logits/chosen": -15.653116226196289, "logits/rejected": -15.769792556762695, "logps/chosen": -2593.71484375, "logps/rejected": -2264.99365234375, "loss": 19.9767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.1792984008789, "rewards/margins": -14.21654224395752, "rewards/rejected": -110.9627685546875, "step": 21700 }, { "epoch": 1.26, "grad_norm": 3.0837298936603474e-08, "learning_rate": 0.0005828011919965943, "logits/chosen": -13.731115341186523, "logits/rejected": -13.820462226867676, "logps/chosen": -2706.74072265625, "logps/rejected": -2673.00146484375, "loss": 7.599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -231.40109252929688, "rewards/margins": 3.8980422019958496, "rewards/rejected": -235.2991485595703, "step": 21710 }, { "epoch": 1.26, "grad_norm": 26.84093475341797, "learning_rate": 0.0005826076860559619, "logits/chosen": -14.416116714477539, "logits/rejected": -14.285284042358398, "logps/chosen": -2675.44287109375, "logps/rejected": -2264.4521484375, "loss": 13.2922, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -196.85452270507812, "rewards/margins": -12.326345443725586, "rewards/rejected": -184.52818298339844, "step": 21720 }, { "epoch": 1.26, "grad_norm": 57.52218246459961, "learning_rate": 0.0005824141801153295, "logits/chosen": -14.874773979187012, "logits/rejected": -15.417966842651367, "logps/chosen": -2430.5595703125, "logps/rejected": -2389.04248046875, "loss": 18.2806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.5166778564453, "rewards/margins": 12.481666564941406, "rewards/rejected": -190.9983367919922, "step": 21730 }, { "epoch": 1.26, "grad_norm": 0.031062310561537743, "learning_rate": 0.0005822206741746971, "logits/chosen": -12.744089126586914, "logits/rejected": -12.726490020751953, "logps/chosen": -2611.79931640625, "logps/rejected": -2324.19482421875, "loss": 5.0521, "rewards/accuracies": 0.5, "rewards/chosen": -129.07638549804688, "rewards/margins": 8.936227798461914, "rewards/rejected": -138.01263427734375, "step": 21740 }, { "epoch": 1.26, "grad_norm": 20.690048217773438, "learning_rate": 0.0005820271682340647, "logits/chosen": -14.46467399597168, "logits/rejected": -14.501994132995605, "logps/chosen": -2291.365234375, "logps/rejected": -2308.779296875, "loss": 4.3745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.76112365722656, "rewards/margins": 8.014989852905273, "rewards/rejected": -127.7761001586914, "step": 21750 }, { "epoch": 1.26, "grad_norm": 7.248094508018039e-20, "learning_rate": 0.0005818336622934324, "logits/chosen": -15.766352653503418, "logits/rejected": -15.54064655303955, "logps/chosen": -2677.235595703125, "logps/rejected": -2470.91259765625, "loss": 13.1803, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -192.84939575195312, "rewards/margins": -3.850163221359253, "rewards/rejected": -188.9992218017578, "step": 21760 }, { "epoch": 1.26, "grad_norm": 0.009143427945673466, "learning_rate": 0.0005816401563528001, "logits/chosen": -14.337717056274414, "logits/rejected": -13.954248428344727, "logps/chosen": -2618.963134765625, "logps/rejected": -2517.48095703125, "loss": 13.4761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.39376831054688, "rewards/margins": -1.1528053283691406, "rewards/rejected": -153.24095153808594, "step": 21770 }, { "epoch": 1.26, "grad_norm": 67.89789581298828, "learning_rate": 0.0005814466504121677, "logits/chosen": -13.417806625366211, "logits/rejected": -13.250085830688477, "logps/chosen": -2774.45263671875, "logps/rejected": -2433.819580078125, "loss": 3.0146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -117.11674499511719, "rewards/margins": 11.354325294494629, "rewards/rejected": -128.4710693359375, "step": 21780 }, { "epoch": 1.26, "grad_norm": 0.005781939718872309, "learning_rate": 0.0005812531444715353, "logits/chosen": -15.208150863647461, "logits/rejected": -15.129611015319824, "logps/chosen": -2710.585205078125, "logps/rejected": -1987.107177734375, "loss": 22.9539, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.56549072265625, "rewards/margins": -15.570530891418457, "rewards/rejected": -143.99497985839844, "step": 21790 }, { "epoch": 1.26, "grad_norm": 8.597272872924805, "learning_rate": 0.0005810596385309029, "logits/chosen": -14.902453422546387, "logits/rejected": -14.660738945007324, "logps/chosen": -2562.092529296875, "logps/rejected": -2778.69677734375, "loss": 8.7939, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.363037109375, "rewards/margins": -1.0217307806015015, "rewards/rejected": -124.34129333496094, "step": 21800 }, { "epoch": 1.26, "grad_norm": 21.15916633605957, "learning_rate": 0.0005808661325902706, "logits/chosen": -18.17771339416504, "logits/rejected": -18.31117057800293, "logps/chosen": -2568.42626953125, "logps/rejected": -2282.0078125, "loss": 15.2861, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -183.4401092529297, "rewards/margins": -13.593963623046875, "rewards/rejected": -169.84616088867188, "step": 21810 }, { "epoch": 1.26, "grad_norm": 19.4161319732666, "learning_rate": 0.0005806726266496382, "logits/chosen": -15.876378059387207, "logits/rejected": -16.202877044677734, "logps/chosen": -3096.653564453125, "logps/rejected": -2550.049072265625, "loss": 8.7445, "rewards/accuracies": 0.5, "rewards/chosen": -165.85922241210938, "rewards/margins": -2.1267943382263184, "rewards/rejected": -163.732421875, "step": 21820 }, { "epoch": 1.26, "grad_norm": 2.4456489086151123, "learning_rate": 0.0005804791207090058, "logits/chosen": -14.737701416015625, "logits/rejected": -14.826189041137695, "logps/chosen": -2862.658203125, "logps/rejected": -2590.353515625, "loss": 8.5464, "rewards/accuracies": 0.5, "rewards/chosen": -114.9498519897461, "rewards/margins": -3.585784435272217, "rewards/rejected": -111.36405944824219, "step": 21830 }, { "epoch": 1.26, "grad_norm": 0.00028002815088257194, "learning_rate": 0.0005802856147683734, "logits/chosen": -14.673517227172852, "logits/rejected": -14.895709037780762, "logps/chosen": -2641.1552734375, "logps/rejected": -2661.423828125, "loss": 13.1384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.64080810546875, "rewards/margins": 6.650892734527588, "rewards/rejected": -152.2916717529297, "step": 21840 }, { "epoch": 1.26, "grad_norm": 6.820439324428662e-08, "learning_rate": 0.000580092108827741, "logits/chosen": -13.831456184387207, "logits/rejected": -14.289093017578125, "logps/chosen": -2931.229248046875, "logps/rejected": -2659.569580078125, "loss": 19.0975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.1045379638672, "rewards/margins": -11.582381248474121, "rewards/rejected": -173.5221710205078, "step": 21850 }, { "epoch": 1.27, "grad_norm": 0.0, "learning_rate": 0.0005798986028871086, "logits/chosen": -12.892402648925781, "logits/rejected": -12.951189994812012, "logps/chosen": -2754.434814453125, "logps/rejected": -2721.57861328125, "loss": 3.4518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.69322204589844, "rewards/margins": 12.769224166870117, "rewards/rejected": -143.4624481201172, "step": 21860 }, { "epoch": 1.27, "grad_norm": 118.5479965209961, "learning_rate": 0.0005797050969464763, "logits/chosen": -14.93757152557373, "logits/rejected": -15.341606140136719, "logps/chosen": -2456.568603515625, "logps/rejected": -2404.55322265625, "loss": 14.2738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.3999481201172, "rewards/margins": 0.3877304196357727, "rewards/rejected": -214.78768920898438, "step": 21870 }, { "epoch": 1.27, "grad_norm": 25.87877082824707, "learning_rate": 0.000579511591005844, "logits/chosen": -17.853191375732422, "logits/rejected": -17.76528549194336, "logps/chosen": -2429.302490234375, "logps/rejected": -2389.224853515625, "loss": 3.1228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -213.8751983642578, "rewards/margins": 1.9455223083496094, "rewards/rejected": -215.82070922851562, "step": 21880 }, { "epoch": 1.27, "grad_norm": 0.00019715476082637906, "learning_rate": 0.0005793180850652116, "logits/chosen": -16.403676986694336, "logits/rejected": -17.19102668762207, "logps/chosen": -2778.482666015625, "logps/rejected": -2759.48486328125, "loss": 6.987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.9827880859375, "rewards/margins": 0.24266108870506287, "rewards/rejected": -167.22543334960938, "step": 21890 }, { "epoch": 1.27, "grad_norm": 1.823021555935611e-08, "learning_rate": 0.0005791245791245792, "logits/chosen": -16.07886505126953, "logits/rejected": -16.40871810913086, "logps/chosen": -3228.8369140625, "logps/rejected": -2992.80615234375, "loss": 4.7011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.6520004272461, "rewards/margins": 8.407914161682129, "rewards/rejected": -117.05992126464844, "step": 21900 }, { "epoch": 1.27, "grad_norm": 111.29937744140625, "learning_rate": 0.0005789310731839467, "logits/chosen": -21.833070755004883, "logits/rejected": -21.833559036254883, "logps/chosen": -2837.05322265625, "logps/rejected": -2722.5556640625, "loss": 14.5204, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -189.1792755126953, "rewards/margins": -11.551965713500977, "rewards/rejected": -177.62730407714844, "step": 21910 }, { "epoch": 1.27, "grad_norm": 126.17225646972656, "learning_rate": 0.0005787375672433143, "logits/chosen": -19.481121063232422, "logits/rejected": -20.697141647338867, "logps/chosen": -2654.59619140625, "logps/rejected": -2366.89306640625, "loss": 22.6982, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.55943298339844, "rewards/margins": -11.838371276855469, "rewards/rejected": -147.72105407714844, "step": 21920 }, { "epoch": 1.27, "grad_norm": 57.48168182373047, "learning_rate": 0.000578544061302682, "logits/chosen": -18.087242126464844, "logits/rejected": -18.53685188293457, "logps/chosen": -2878.03271484375, "logps/rejected": -2932.863037109375, "loss": 4.3312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.4852752685547, "rewards/margins": 5.30648136138916, "rewards/rejected": -170.79176330566406, "step": 21930 }, { "epoch": 1.27, "grad_norm": 38.441749572753906, "learning_rate": 0.0005783505553620496, "logits/chosen": -20.994979858398438, "logits/rejected": -21.910724639892578, "logps/chosen": -2777.2431640625, "logps/rejected": -2700.140625, "loss": 3.6231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -211.7422637939453, "rewards/margins": 2.8854756355285645, "rewards/rejected": -214.6277313232422, "step": 21940 }, { "epoch": 1.27, "grad_norm": 2.4955878257751465, "learning_rate": 0.0005781570494214172, "logits/chosen": -15.782379150390625, "logits/rejected": -17.067615509033203, "logps/chosen": -2716.688232421875, "logps/rejected": -2685.70458984375, "loss": 6.0426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.94570922851562, "rewards/margins": 10.371743202209473, "rewards/rejected": -163.31747436523438, "step": 21950 }, { "epoch": 1.27, "grad_norm": 3.268134832382202, "learning_rate": 0.0005779635434807848, "logits/chosen": -11.65777587890625, "logits/rejected": -11.645565032958984, "logps/chosen": -3420.58740234375, "logps/rejected": -3017.740966796875, "loss": 9.9331, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -110.32437896728516, "rewards/margins": -2.5430495738983154, "rewards/rejected": -107.78133392333984, "step": 21960 }, { "epoch": 1.27, "grad_norm": 0.0, "learning_rate": 0.0005777700375401524, "logits/chosen": -18.785724639892578, "logits/rejected": -18.52107810974121, "logps/chosen": -2657.28271484375, "logps/rejected": -2532.92724609375, "loss": 3.8115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.0203399658203, "rewards/margins": 24.480457305908203, "rewards/rejected": -180.50079345703125, "step": 21970 }, { "epoch": 1.27, "grad_norm": 1.676109528947478e-10, "learning_rate": 0.0005775765315995202, "logits/chosen": -17.06513214111328, "logits/rejected": -17.48383331298828, "logps/chosen": -2933.0234375, "logps/rejected": -2549.3974609375, "loss": 2.2445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -81.03206634521484, "rewards/margins": 10.257983207702637, "rewards/rejected": -91.29004669189453, "step": 21980 }, { "epoch": 1.27, "grad_norm": 32.70398712158203, "learning_rate": 0.0005773830256588878, "logits/chosen": -19.23102378845215, "logits/rejected": -23.313793182373047, "logps/chosen": -2978.81103515625, "logps/rejected": -2847.4912109375, "loss": 7.2815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -126.16850280761719, "rewards/margins": 3.2557578086853027, "rewards/rejected": -129.42425537109375, "step": 21990 }, { "epoch": 1.27, "grad_norm": 4.2077357223502254e-12, "learning_rate": 0.0005771895197182554, "logits/chosen": -20.67713165283203, "logits/rejected": -20.207712173461914, "logps/chosen": -2931.953125, "logps/rejected": -3027.301025390625, "loss": 5.8925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.12649536132812, "rewards/margins": 7.4378485679626465, "rewards/rejected": -182.56434631347656, "step": 22000 }, { "epoch": 1.27, "grad_norm": 347.7121887207031, "learning_rate": 0.000576996013777623, "logits/chosen": -16.283092498779297, "logits/rejected": -16.699115753173828, "logps/chosen": -3149.58447265625, "logps/rejected": -3408.330810546875, "loss": 24.8844, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -97.19148254394531, "rewards/margins": -23.94962501525879, "rewards/rejected": -73.24185180664062, "step": 22010 }, { "epoch": 1.27, "grad_norm": 6.779198313822121e-10, "learning_rate": 0.0005768025078369906, "logits/chosen": -19.046817779541016, "logits/rejected": -18.06254005432129, "logps/chosen": -3073.190673828125, "logps/rejected": -3029.775634765625, "loss": 6.9466, "rewards/accuracies": 0.5, "rewards/chosen": -157.04486083984375, "rewards/margins": -2.473602294921875, "rewards/rejected": -154.57125854492188, "step": 22020 }, { "epoch": 1.28, "grad_norm": 82.51359558105469, "learning_rate": 0.0005766090018963582, "logits/chosen": -16.791217803955078, "logits/rejected": -16.36092758178711, "logps/chosen": -2918.103271484375, "logps/rejected": -3028.4638671875, "loss": 2.1157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.48544311523438, "rewards/margins": 7.900038242340088, "rewards/rejected": -193.38546752929688, "step": 22030 }, { "epoch": 1.28, "grad_norm": 60.805335998535156, "learning_rate": 0.0005764154959557259, "logits/chosen": -13.270547866821289, "logits/rejected": -14.409727096557617, "logps/chosen": -2999.75537109375, "logps/rejected": -2849.78662109375, "loss": 6.9802, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.27603149414062, "rewards/margins": 4.526206016540527, "rewards/rejected": -161.80224609375, "step": 22040 }, { "epoch": 1.28, "grad_norm": 0.05436113849282265, "learning_rate": 0.0005762219900150935, "logits/chosen": -16.320476531982422, "logits/rejected": -19.46590805053711, "logps/chosen": -2925.69970703125, "logps/rejected": -2535.704345703125, "loss": 4.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -199.5355987548828, "rewards/margins": 9.072507858276367, "rewards/rejected": -208.6080780029297, "step": 22050 }, { "epoch": 1.28, "grad_norm": 27.346805572509766, "learning_rate": 0.0005760284840744611, "logits/chosen": -18.45613670349121, "logits/rejected": -18.49319076538086, "logps/chosen": -2489.60986328125, "logps/rejected": -2267.63671875, "loss": 3.3741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.7756805419922, "rewards/margins": 2.257875919342041, "rewards/rejected": -197.03353881835938, "step": 22060 }, { "epoch": 1.28, "grad_norm": 6.1375584614609124e-09, "learning_rate": 0.0005758349781338287, "logits/chosen": -23.297700881958008, "logits/rejected": -26.8338680267334, "logps/chosen": -2875.0126953125, "logps/rejected": -2820.19873046875, "loss": 1.8381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -253.10342407226562, "rewards/margins": 7.561360836029053, "rewards/rejected": -260.664794921875, "step": 22070 }, { "epoch": 1.28, "grad_norm": 92.09268951416016, "learning_rate": 0.0005756414721931963, "logits/chosen": -19.46034812927246, "logits/rejected": -22.68609619140625, "logps/chosen": -3099.0546875, "logps/rejected": -2693.2568359375, "loss": 5.3431, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.31784057617188, "rewards/margins": 9.875543594360352, "rewards/rejected": -201.19338989257812, "step": 22080 }, { "epoch": 1.28, "grad_norm": 94.51213836669922, "learning_rate": 0.000575447966252564, "logits/chosen": -21.610626220703125, "logits/rejected": -21.56658935546875, "logps/chosen": -2269.093505859375, "logps/rejected": -2381.2626953125, "loss": 4.692, "rewards/accuracies": 0.5, "rewards/chosen": -161.89295959472656, "rewards/margins": -0.5941411256790161, "rewards/rejected": -161.29881286621094, "step": 22090 }, { "epoch": 1.28, "grad_norm": 3.513727051540627e-06, "learning_rate": 0.0005752544603119317, "logits/chosen": -21.553863525390625, "logits/rejected": -32.36595916748047, "logps/chosen": -2807.19287109375, "logps/rejected": -2432.36474609375, "loss": 43.1976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -229.8187713623047, "rewards/margins": -33.2379035949707, "rewards/rejected": -196.5808868408203, "step": 22100 }, { "epoch": 1.28, "grad_norm": 936.966796875, "learning_rate": 0.0005750609543712993, "logits/chosen": -17.349544525146484, "logits/rejected": -20.17560386657715, "logps/chosen": -2797.35546875, "logps/rejected": -2746.468994140625, "loss": 10.9443, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -209.3260498046875, "rewards/margins": -1.7871220111846924, "rewards/rejected": -207.5389404296875, "step": 22110 }, { "epoch": 1.28, "grad_norm": 1.3677060595185253e-09, "learning_rate": 0.0005748674484306669, "logits/chosen": -14.944745063781738, "logits/rejected": -13.05461597442627, "logps/chosen": -3317.468017578125, "logps/rejected": -3357.36669921875, "loss": 6.9307, "rewards/accuracies": 0.5, "rewards/chosen": -118.0259017944336, "rewards/margins": -1.5223668813705444, "rewards/rejected": -116.50352478027344, "step": 22120 }, { "epoch": 1.28, "grad_norm": 0.03503350540995598, "learning_rate": 0.0005746739424900344, "logits/chosen": -11.302342414855957, "logits/rejected": -12.035676002502441, "logps/chosen": -3149.32080078125, "logps/rejected": -2768.833984375, "loss": 2.605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -66.68467712402344, "rewards/margins": 6.910592555999756, "rewards/rejected": -73.59526824951172, "step": 22130 }, { "epoch": 1.28, "grad_norm": 328.1550598144531, "learning_rate": 0.000574480436549402, "logits/chosen": -19.322050094604492, "logits/rejected": -25.651294708251953, "logps/chosen": -2654.22607421875, "logps/rejected": -2780.5556640625, "loss": 5.0158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.03219604492188, "rewards/margins": 17.92531967163086, "rewards/rejected": -215.95751953125, "step": 22140 }, { "epoch": 1.28, "grad_norm": 3.09153014299568e-09, "learning_rate": 0.0005742869306087696, "logits/chosen": -16.319028854370117, "logits/rejected": -16.27507781982422, "logps/chosen": -2691.00927734375, "logps/rejected": -2736.93115234375, "loss": 4.2767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -112.6456069946289, "rewards/margins": 10.507234573364258, "rewards/rejected": -123.15284729003906, "step": 22150 }, { "epoch": 1.28, "grad_norm": 0.0007136868662200868, "learning_rate": 0.0005740934246681373, "logits/chosen": -15.761543273925781, "logits/rejected": -15.228825569152832, "logps/chosen": -2617.12451171875, "logps/rejected": -2771.60009765625, "loss": 4.5586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.71566772460938, "rewards/margins": 17.27621841430664, "rewards/rejected": -169.99191284179688, "step": 22160 }, { "epoch": 1.28, "grad_norm": 3.0030846698281266e-09, "learning_rate": 0.0005738999187275049, "logits/chosen": -17.399370193481445, "logits/rejected": -16.44477081298828, "logps/chosen": -3039.04736328125, "logps/rejected": -2892.89599609375, "loss": 4.8983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.77720642089844, "rewards/margins": 7.444985866546631, "rewards/rejected": -168.22219848632812, "step": 22170 }, { "epoch": 1.28, "grad_norm": 3.3140859159175307e-05, "learning_rate": 0.0005737064127868725, "logits/chosen": -15.827688217163086, "logits/rejected": -19.159870147705078, "logps/chosen": -2908.341796875, "logps/rejected": -2829.87841796875, "loss": 14.4595, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -189.87802124023438, "rewards/margins": -9.387140274047852, "rewards/rejected": -180.49090576171875, "step": 22180 }, { "epoch": 1.28, "grad_norm": 122.19932556152344, "learning_rate": 0.0005735129068462402, "logits/chosen": -13.841464042663574, "logits/rejected": -16.909528732299805, "logps/chosen": -3236.4541015625, "logps/rejected": -2988.57080078125, "loss": 12.7938, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -184.37020874023438, "rewards/margins": -7.856955051422119, "rewards/rejected": -176.51324462890625, "step": 22190 }, { "epoch": 1.29, "grad_norm": 0.017319440841674805, "learning_rate": 0.0005733194009056078, "logits/chosen": -15.265376091003418, "logits/rejected": -14.927253723144531, "logps/chosen": -3033.611083984375, "logps/rejected": -2991.123046875, "loss": 1.8603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.8060073852539, "rewards/margins": 13.872113227844238, "rewards/rejected": -135.67811584472656, "step": 22200 }, { "epoch": 1.29, "grad_norm": 1.7260148525238037, "learning_rate": 0.0005731258949649755, "logits/chosen": -17.063251495361328, "logits/rejected": -17.814931869506836, "logps/chosen": -2707.4599609375, "logps/rejected": -3063.748046875, "loss": 3.9705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.7205810546875, "rewards/margins": 22.245595932006836, "rewards/rejected": -169.96617126464844, "step": 22210 }, { "epoch": 1.29, "grad_norm": 80.66400909423828, "learning_rate": 0.0005729323890243431, "logits/chosen": -12.92255687713623, "logits/rejected": -13.039060592651367, "logps/chosen": -2672.850341796875, "logps/rejected": -2693.026123046875, "loss": 9.4473, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.40301513671875, "rewards/margins": -5.894504547119141, "rewards/rejected": -111.5085220336914, "step": 22220 }, { "epoch": 1.29, "grad_norm": 74.72605895996094, "learning_rate": 0.0005727388830837107, "logits/chosen": -17.119617462158203, "logits/rejected": -18.940534591674805, "logps/chosen": -2712.705078125, "logps/rejected": -2764.78662109375, "loss": 4.6863, "rewards/accuracies": 0.5, "rewards/chosen": -200.12026977539062, "rewards/margins": 4.818001747131348, "rewards/rejected": -204.9382781982422, "step": 22230 }, { "epoch": 1.29, "grad_norm": 75.51009368896484, "learning_rate": 0.0005725453771430783, "logits/chosen": -18.205371856689453, "logits/rejected": -18.8975887298584, "logps/chosen": -2554.30322265625, "logps/rejected": -2744.78466796875, "loss": 7.0265, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -205.27587890625, "rewards/margins": 1.2749454975128174, "rewards/rejected": -206.55081176757812, "step": 22240 }, { "epoch": 1.29, "grad_norm": 162.2552947998047, "learning_rate": 0.0005723518712024459, "logits/chosen": -14.813143730163574, "logits/rejected": -16.109477996826172, "logps/chosen": -2789.04638671875, "logps/rejected": -2759.482421875, "loss": 1.8844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -205.482421875, "rewards/margins": 7.939662933349609, "rewards/rejected": -213.42208862304688, "step": 22250 }, { "epoch": 1.29, "grad_norm": 1.1910425424575806, "learning_rate": 0.0005721583652618135, "logits/chosen": -14.88568115234375, "logits/rejected": -15.964639663696289, "logps/chosen": -2777.35888671875, "logps/rejected": -2974.38037109375, "loss": 4.8543, "rewards/accuracies": 0.5, "rewards/chosen": -156.83065795898438, "rewards/margins": 3.880573272705078, "rewards/rejected": -160.71121215820312, "step": 22260 }, { "epoch": 1.29, "grad_norm": 2.0189108340673556e-07, "learning_rate": 0.0005719648593211812, "logits/chosen": -21.604991912841797, "logits/rejected": -22.927417755126953, "logps/chosen": -2801.774658203125, "logps/rejected": -2825.793701171875, "loss": 2.7731, "rewards/accuracies": 0.5, "rewards/chosen": -221.25814819335938, "rewards/margins": 1.4526560306549072, "rewards/rejected": -222.7108154296875, "step": 22270 }, { "epoch": 1.29, "grad_norm": 74.22747039794922, "learning_rate": 0.0005717713533805488, "logits/chosen": -15.521835327148438, "logits/rejected": -16.5334529876709, "logps/chosen": -2931.21728515625, "logps/rejected": -2793.646240234375, "loss": 3.8785, "rewards/accuracies": 0.5, "rewards/chosen": -116.68310546875, "rewards/margins": 1.422371506690979, "rewards/rejected": -118.10546875, "step": 22280 }, { "epoch": 1.29, "grad_norm": 217.7855682373047, "learning_rate": 0.0005715778474399164, "logits/chosen": -15.377008438110352, "logits/rejected": -16.91212272644043, "logps/chosen": -2707.649658203125, "logps/rejected": -2640.779296875, "loss": 17.6039, "rewards/accuracies": 0.5, "rewards/chosen": -173.5136260986328, "rewards/margins": -9.477978706359863, "rewards/rejected": -164.03565979003906, "step": 22290 }, { "epoch": 1.29, "grad_norm": 0.0008370417053811252, "learning_rate": 0.0005713843414992841, "logits/chosen": -15.179667472839355, "logits/rejected": -16.20810317993164, "logps/chosen": -2697.495849609375, "logps/rejected": -2338.848388671875, "loss": 15.4545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -147.83868408203125, "rewards/margins": -4.91287899017334, "rewards/rejected": -142.92581176757812, "step": 22300 }, { "epoch": 1.29, "grad_norm": 70.59600830078125, "learning_rate": 0.0005711908355586517, "logits/chosen": -18.99294662475586, "logits/rejected": -18.437667846679688, "logps/chosen": -2489.351806640625, "logps/rejected": -2440.2158203125, "loss": 7.452, "rewards/accuracies": 0.5, "rewards/chosen": -191.38009643554688, "rewards/margins": -2.089099168777466, "rewards/rejected": -189.29098510742188, "step": 22310 }, { "epoch": 1.29, "grad_norm": 5.101665738038719e-05, "learning_rate": 0.0005709973296180194, "logits/chosen": -14.83691120147705, "logits/rejected": -16.528003692626953, "logps/chosen": -2994.87744140625, "logps/rejected": -2994.126220703125, "loss": 4.9415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -147.2389373779297, "rewards/margins": 14.766301155090332, "rewards/rejected": -162.00521850585938, "step": 22320 }, { "epoch": 1.29, "grad_norm": 41.96866989135742, "learning_rate": 0.000570803823677387, "logits/chosen": -14.078432083129883, "logits/rejected": -15.14153003692627, "logps/chosen": -2680.180419921875, "logps/rejected": -2544.982666015625, "loss": 2.0741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -113.28177642822266, "rewards/margins": 5.034595012664795, "rewards/rejected": -118.31636810302734, "step": 22330 }, { "epoch": 1.29, "grad_norm": 46.88715362548828, "learning_rate": 0.0005706103177367545, "logits/chosen": -18.390483856201172, "logits/rejected": -18.497211456298828, "logps/chosen": -2579.887451171875, "logps/rejected": -2147.54248046875, "loss": 15.8273, "rewards/accuracies": 0.5, "rewards/chosen": -151.93849182128906, "rewards/margins": -0.9601585268974304, "rewards/rejected": -150.97830200195312, "step": 22340 }, { "epoch": 1.29, "grad_norm": 2.32863450050354, "learning_rate": 0.0005704168117961221, "logits/chosen": -14.661120414733887, "logits/rejected": -15.449243545532227, "logps/chosen": -2564.61962890625, "logps/rejected": -2316.135009765625, "loss": 12.0215, "rewards/accuracies": 0.5, "rewards/chosen": -143.9300537109375, "rewards/margins": -7.1475372314453125, "rewards/rejected": -136.78253173828125, "step": 22350 }, { "epoch": 1.29, "grad_norm": 304.8454895019531, "learning_rate": 0.0005702233058554897, "logits/chosen": -18.043102264404297, "logits/rejected": -17.82225227355957, "logps/chosen": -2397.07861328125, "logps/rejected": -2380.464599609375, "loss": 11.0002, "rewards/accuracies": 0.5, "rewards/chosen": -173.7978515625, "rewards/margins": -5.317588806152344, "rewards/rejected": -168.4802703857422, "step": 22360 }, { "epoch": 1.29, "grad_norm": 69.27391815185547, "learning_rate": 0.0005700297999148573, "logits/chosen": -12.25101089477539, "logits/rejected": -12.546873092651367, "logps/chosen": -2576.331298828125, "logps/rejected": -2551.3427734375, "loss": 6.3979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -54.243934631347656, "rewards/margins": 3.235407590866089, "rewards/rejected": -57.479347229003906, "step": 22370 }, { "epoch": 1.3, "grad_norm": 51.10593795776367, "learning_rate": 0.000569836293974225, "logits/chosen": -16.833993911743164, "logits/rejected": -16.451690673828125, "logps/chosen": -2797.989501953125, "logps/rejected": -2885.93359375, "loss": 4.7853, "rewards/accuracies": 0.5, "rewards/chosen": -119.8479232788086, "rewards/margins": 0.3496705889701843, "rewards/rejected": -120.1976089477539, "step": 22380 }, { "epoch": 1.3, "grad_norm": 0.0027313947211951017, "learning_rate": 0.0005696427880335926, "logits/chosen": -16.244068145751953, "logits/rejected": -16.049915313720703, "logps/chosen": -2861.408935546875, "logps/rejected": -2623.299560546875, "loss": 2.2098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.86195373535156, "rewards/margins": 6.164621829986572, "rewards/rejected": -164.0265655517578, "step": 22390 }, { "epoch": 1.3, "grad_norm": 1.940659899446473e-07, "learning_rate": 0.0005694492820929603, "logits/chosen": -15.266035079956055, "logits/rejected": -16.7321834564209, "logps/chosen": -3255.683837890625, "logps/rejected": -3199.4296875, "loss": 10.5808, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -180.93785095214844, "rewards/margins": -3.507521152496338, "rewards/rejected": -177.43032836914062, "step": 22400 }, { "epoch": 1.3, "grad_norm": 0.0, "learning_rate": 0.0005692557761523279, "logits/chosen": -13.949666976928711, "logits/rejected": -15.734594345092773, "logps/chosen": -2444.572509765625, "logps/rejected": -2836.92724609375, "loss": 7.0317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.94546508789062, "rewards/margins": 6.830059051513672, "rewards/rejected": -177.77549743652344, "step": 22410 }, { "epoch": 1.3, "grad_norm": 2.5682382329250686e-05, "learning_rate": 0.0005690622702116955, "logits/chosen": -12.299727439880371, "logits/rejected": -12.218351364135742, "logps/chosen": -2995.244384765625, "logps/rejected": -3036.30810546875, "loss": 3.6925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.67379760742188, "rewards/margins": 12.107099533081055, "rewards/rejected": -167.78091430664062, "step": 22420 }, { "epoch": 1.3, "grad_norm": 0.0009093201952055097, "learning_rate": 0.0005688687642710631, "logits/chosen": -8.487151145935059, "logits/rejected": -9.267927169799805, "logps/chosen": -3324.31396484375, "logps/rejected": -3009.75439453125, "loss": 11.5988, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -125.60015869140625, "rewards/margins": -9.603982925415039, "rewards/rejected": -115.99617767333984, "step": 22430 }, { "epoch": 1.3, "grad_norm": 310.4373779296875, "learning_rate": 0.0005686752583304308, "logits/chosen": -14.281707763671875, "logits/rejected": -13.750821113586426, "logps/chosen": -3090.96435546875, "logps/rejected": -2891.07568359375, "loss": 13.1625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -220.4517364501953, "rewards/margins": -2.735914945602417, "rewards/rejected": -217.7158203125, "step": 22440 }, { "epoch": 1.3, "grad_norm": 6.335763828246854e-06, "learning_rate": 0.0005684817523897984, "logits/chosen": -15.194700241088867, "logits/rejected": -15.6052885055542, "logps/chosen": -2653.2294921875, "logps/rejected": -2404.78271484375, "loss": 3.5707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.1050262451172, "rewards/margins": 12.875635147094727, "rewards/rejected": -209.9806671142578, "step": 22450 }, { "epoch": 1.3, "grad_norm": 5.957299709320068, "learning_rate": 0.000568288246449166, "logits/chosen": -16.652652740478516, "logits/rejected": -16.56917953491211, "logps/chosen": -2701.987548828125, "logps/rejected": -2626.770751953125, "loss": 4.0541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.22000122070312, "rewards/margins": 2.491110324859619, "rewards/rejected": -174.7111053466797, "step": 22460 }, { "epoch": 1.3, "grad_norm": 67.39998626708984, "learning_rate": 0.0005680947405085336, "logits/chosen": -18.620861053466797, "logits/rejected": -19.135948181152344, "logps/chosen": -2705.341796875, "logps/rejected": -2358.77099609375, "loss": 17.0866, "rewards/accuracies": 0.5, "rewards/chosen": -171.22311401367188, "rewards/margins": -6.619412899017334, "rewards/rejected": -164.60369873046875, "step": 22470 }, { "epoch": 1.3, "grad_norm": 0.09716299176216125, "learning_rate": 0.0005679012345679012, "logits/chosen": -15.726537704467773, "logits/rejected": -16.899232864379883, "logps/chosen": -2724.16650390625, "logps/rejected": -2856.731689453125, "loss": 7.9164, "rewards/accuracies": 0.5, "rewards/chosen": -138.78726196289062, "rewards/margins": -0.7696899175643921, "rewards/rejected": -138.01759338378906, "step": 22480 }, { "epoch": 1.3, "grad_norm": 69.01519012451172, "learning_rate": 0.0005677077286272688, "logits/chosen": -16.71158218383789, "logits/rejected": -17.90565299987793, "logps/chosen": -2802.4853515625, "logps/rejected": -2469.4892578125, "loss": 9.4115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -135.5635528564453, "rewards/margins": 6.554014682769775, "rewards/rejected": -142.11756896972656, "step": 22490 }, { "epoch": 1.3, "grad_norm": 0.04885464534163475, "learning_rate": 0.0005675142226866365, "logits/chosen": -17.374860763549805, "logits/rejected": -19.067440032958984, "logps/chosen": -2775.933837890625, "logps/rejected": -2804.2109375, "loss": 1.8868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.480712890625, "rewards/margins": 5.670957088470459, "rewards/rejected": -213.1516876220703, "step": 22500 }, { "epoch": 1.3, "grad_norm": 0.0, "learning_rate": 0.0005673207167460042, "logits/chosen": -14.345044136047363, "logits/rejected": -13.952113151550293, "logps/chosen": -2898.090087890625, "logps/rejected": -2309.27587890625, "loss": 3.7772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -65.06121826171875, "rewards/margins": 12.33234977722168, "rewards/rejected": -77.39356994628906, "step": 22510 }, { "epoch": 1.3, "grad_norm": 5.328981876373291, "learning_rate": 0.0005671272108053718, "logits/chosen": -16.245418548583984, "logits/rejected": -18.709863662719727, "logps/chosen": -2661.51904296875, "logps/rejected": -2686.927001953125, "loss": 1.5528, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -122.39501953125, "rewards/margins": 9.5424165725708, "rewards/rejected": -131.93743896484375, "step": 22520 }, { "epoch": 1.3, "grad_norm": 0.0, "learning_rate": 0.0005669337048647394, "logits/chosen": -16.521270751953125, "logits/rejected": -20.222890853881836, "logps/chosen": -2478.153076171875, "logps/rejected": -2382.486328125, "loss": 16.1232, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -175.97018432617188, "rewards/margins": -6.970110893249512, "rewards/rejected": -169.0000762939453, "step": 22530 }, { "epoch": 1.3, "grad_norm": 64.78279113769531, "learning_rate": 0.000566740198924107, "logits/chosen": -15.013224601745605, "logits/rejected": -15.182497024536133, "logps/chosen": -2655.0185546875, "logps/rejected": -2487.890625, "loss": 7.7441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -57.02851104736328, "rewards/margins": 6.5181708335876465, "rewards/rejected": -63.546669006347656, "step": 22540 }, { "epoch": 1.31, "grad_norm": 2.741676330408893e-10, "learning_rate": 0.0005665466929834747, "logits/chosen": -17.919586181640625, "logits/rejected": -19.021530151367188, "logps/chosen": -2590.017333984375, "logps/rejected": -2592.959716796875, "loss": 4.0282, "rewards/accuracies": 0.5, "rewards/chosen": -144.57235717773438, "rewards/margins": 7.488012790679932, "rewards/rejected": -152.06039428710938, "step": 22550 }, { "epoch": 1.31, "grad_norm": 0.1575736403465271, "learning_rate": 0.0005663531870428422, "logits/chosen": -15.739675521850586, "logits/rejected": -16.65306282043457, "logps/chosen": -2548.905029296875, "logps/rejected": -2659.452880859375, "loss": 2.8941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.312744140625, "rewards/margins": 7.405812740325928, "rewards/rejected": -119.71855163574219, "step": 22560 }, { "epoch": 1.31, "grad_norm": 63.86362838745117, "learning_rate": 0.0005661596811022098, "logits/chosen": -20.758275985717773, "logits/rejected": -19.748964309692383, "logps/chosen": -2750.95556640625, "logps/rejected": -2872.718505859375, "loss": 17.0371, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -160.07176208496094, "rewards/margins": -12.986490249633789, "rewards/rejected": -147.0852813720703, "step": 22570 }, { "epoch": 1.31, "grad_norm": 2.9304435884114355e-05, "learning_rate": 0.0005659661751615774, "logits/chosen": -23.538211822509766, "logits/rejected": -23.60172462463379, "logps/chosen": -2458.70166015625, "logps/rejected": -2656.50927734375, "loss": 0.0875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -186.38084411621094, "rewards/margins": 13.883323669433594, "rewards/rejected": -200.26419067382812, "step": 22580 }, { "epoch": 1.31, "grad_norm": 2.2043094635009766, "learning_rate": 0.000565772669220945, "logits/chosen": -24.11107063293457, "logits/rejected": -24.5493106842041, "logps/chosen": -2806.366455078125, "logps/rejected": -2789.543212890625, "loss": 1.8971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.3399658203125, "rewards/margins": 18.5892276763916, "rewards/rejected": -191.92919921875, "step": 22590 }, { "epoch": 1.31, "grad_norm": 0.0, "learning_rate": 0.0005655791632803126, "logits/chosen": -20.346086502075195, "logits/rejected": -21.513341903686523, "logps/chosen": -2391.38671875, "logps/rejected": -2955.03857421875, "loss": 7.952, "rewards/accuracies": 0.5, "rewards/chosen": -109.42742919921875, "rewards/margins": 44.01808547973633, "rewards/rejected": -153.44552612304688, "step": 22600 }, { "epoch": 1.31, "grad_norm": 4.745153380468492e-13, "learning_rate": 0.0005653856573396804, "logits/chosen": -15.695605278015137, "logits/rejected": -16.233545303344727, "logps/chosen": -3208.03662109375, "logps/rejected": -2887.59912109375, "loss": 11.0078, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -101.80036926269531, "rewards/margins": -3.2276008129119873, "rewards/rejected": -98.57276916503906, "step": 22610 }, { "epoch": 1.31, "grad_norm": 94.3452377319336, "learning_rate": 0.000565192151399048, "logits/chosen": -17.19017219543457, "logits/rejected": -17.519519805908203, "logps/chosen": -2517.5078125, "logps/rejected": -2370.911376953125, "loss": 4.1285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.97012329101562, "rewards/margins": 3.8106353282928467, "rewards/rejected": -146.78076171875, "step": 22620 }, { "epoch": 1.31, "grad_norm": 9.2139892578125, "learning_rate": 0.0005649986454584156, "logits/chosen": -14.949193954467773, "logits/rejected": -15.577238082885742, "logps/chosen": -2924.436279296875, "logps/rejected": -2976.321533203125, "loss": 7.2585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.6315460205078, "rewards/margins": 3.955735445022583, "rewards/rejected": -191.5872802734375, "step": 22630 }, { "epoch": 1.31, "grad_norm": 0.0010687833419069648, "learning_rate": 0.0005648051395177832, "logits/chosen": -15.600515365600586, "logits/rejected": -15.0673246383667, "logps/chosen": -3004.47998046875, "logps/rejected": -2869.133056640625, "loss": 4.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.6167449951172, "rewards/margins": 10.762849807739258, "rewards/rejected": -190.3795928955078, "step": 22640 }, { "epoch": 1.31, "grad_norm": 2.5192115572281182e-05, "learning_rate": 0.0005646116335771508, "logits/chosen": -17.370800018310547, "logits/rejected": -17.144926071166992, "logps/chosen": -2913.64208984375, "logps/rejected": -2627.24755859375, "loss": 22.7116, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -221.16635131835938, "rewards/margins": -12.457359313964844, "rewards/rejected": -208.7090301513672, "step": 22650 }, { "epoch": 1.31, "grad_norm": 84.78036499023438, "learning_rate": 0.0005644181276365184, "logits/chosen": -15.027122497558594, "logits/rejected": -15.882577896118164, "logps/chosen": -2927.30517578125, "logps/rejected": -2701.03662109375, "loss": 6.8909, "rewards/accuracies": 0.5, "rewards/chosen": -112.1829833984375, "rewards/margins": 3.4994919300079346, "rewards/rejected": -115.6824722290039, "step": 22660 }, { "epoch": 1.31, "grad_norm": 509.551513671875, "learning_rate": 0.0005642246216958861, "logits/chosen": -17.605995178222656, "logits/rejected": -20.271398544311523, "logps/chosen": -2989.6611328125, "logps/rejected": -2470.96044921875, "loss": 24.0804, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.89064025878906, "rewards/margins": -7.64553689956665, "rewards/rejected": -171.24510192871094, "step": 22670 }, { "epoch": 1.31, "grad_norm": 15.926983833312988, "learning_rate": 0.0005640311157552537, "logits/chosen": -22.52626609802246, "logits/rejected": -25.68569564819336, "logps/chosen": -2484.204345703125, "logps/rejected": -2688.94384765625, "loss": 2.6025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -222.6398162841797, "rewards/margins": 22.908489227294922, "rewards/rejected": -245.5482940673828, "step": 22680 }, { "epoch": 1.31, "grad_norm": 2.054554304464773e-10, "learning_rate": 0.0005638376098146213, "logits/chosen": -18.422454833984375, "logits/rejected": -20.762205123901367, "logps/chosen": -2758.770263671875, "logps/rejected": -2518.9990234375, "loss": 2.9445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.26675415039062, "rewards/margins": 14.126106262207031, "rewards/rejected": -182.39288330078125, "step": 22690 }, { "epoch": 1.31, "grad_norm": 0.0003485401102807373, "learning_rate": 0.0005636441038739889, "logits/chosen": -16.78219985961914, "logits/rejected": -17.190284729003906, "logps/chosen": -2834.3232421875, "logps/rejected": -2317.306640625, "loss": 4.3402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.54273223876953, "rewards/margins": 6.308304309844971, "rewards/rejected": -99.85102844238281, "step": 22700 }, { "epoch": 1.31, "grad_norm": 30.673189163208008, "learning_rate": 0.0005634505979333565, "logits/chosen": -20.914493560791016, "logits/rejected": -20.59130096435547, "logps/chosen": -2600.806396484375, "logps/rejected": -2566.528076171875, "loss": 6.8594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -190.8290557861328, "rewards/margins": -1.7542831897735596, "rewards/rejected": -189.07476806640625, "step": 22710 }, { "epoch": 1.32, "grad_norm": 0.001024120138026774, "learning_rate": 0.0005632570919927243, "logits/chosen": -17.067920684814453, "logits/rejected": -19.678001403808594, "logps/chosen": -2425.435546875, "logps/rejected": -2119.690673828125, "loss": 6.7959, "rewards/accuracies": 0.5, "rewards/chosen": -100.8595962524414, "rewards/margins": 3.3633761405944824, "rewards/rejected": -104.22297668457031, "step": 22720 }, { "epoch": 1.32, "grad_norm": 18.307571411132812, "learning_rate": 0.0005630635860520919, "logits/chosen": -16.309202194213867, "logits/rejected": -16.773860931396484, "logps/chosen": -2836.1865234375, "logps/rejected": -2805.48583984375, "loss": 0.4323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -101.0269546508789, "rewards/margins": 9.816988945007324, "rewards/rejected": -110.84394836425781, "step": 22730 }, { "epoch": 1.32, "grad_norm": 1.6573364734649658, "learning_rate": 0.0005628700801114595, "logits/chosen": -14.561325073242188, "logits/rejected": -14.159967422485352, "logps/chosen": -2626.364013671875, "logps/rejected": -2177.338623046875, "loss": 6.3297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -83.95726013183594, "rewards/margins": 14.609086990356445, "rewards/rejected": -98.56633758544922, "step": 22740 }, { "epoch": 1.32, "grad_norm": 1.4010265878347397e-15, "learning_rate": 0.0005626765741708271, "logits/chosen": -15.984135627746582, "logits/rejected": -18.05866050720215, "logps/chosen": -2670.37646484375, "logps/rejected": -2496.89306640625, "loss": 8.0479, "rewards/accuracies": 0.5, "rewards/chosen": -165.94192504882812, "rewards/margins": 10.529637336730957, "rewards/rejected": -176.47154235839844, "step": 22750 }, { "epoch": 1.32, "grad_norm": 1.5184102239440556e-11, "learning_rate": 0.0005624830682301947, "logits/chosen": -13.091405868530273, "logits/rejected": -14.551533699035645, "logps/chosen": -2647.640869140625, "logps/rejected": -2487.84228515625, "loss": 11.7509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -113.93495178222656, "rewards/margins": 2.9110755920410156, "rewards/rejected": -116.84603118896484, "step": 22760 }, { "epoch": 1.32, "grad_norm": 0.0010870955884456635, "learning_rate": 0.0005622895622895623, "logits/chosen": -13.049142837524414, "logits/rejected": -13.034372329711914, "logps/chosen": -2549.824951171875, "logps/rejected": -2372.342529296875, "loss": 2.3133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -67.45606994628906, "rewards/margins": 7.994266510009766, "rewards/rejected": -75.45032501220703, "step": 22770 }, { "epoch": 1.32, "grad_norm": 0.006631581578403711, "learning_rate": 0.0005620960563489299, "logits/chosen": -15.36540699005127, "logits/rejected": -16.769681930541992, "logps/chosen": -2840.36279296875, "logps/rejected": -2988.51708984375, "loss": 2.763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.11154174804688, "rewards/margins": 17.301931381225586, "rewards/rejected": -192.41348266601562, "step": 22780 }, { "epoch": 1.32, "grad_norm": 67.52378845214844, "learning_rate": 0.0005619025504082975, "logits/chosen": -13.253396987915039, "logits/rejected": -13.828041076660156, "logps/chosen": -2885.454833984375, "logps/rejected": -2863.8798828125, "loss": 3.1123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -101.9154052734375, "rewards/margins": 19.092166900634766, "rewards/rejected": -121.00758361816406, "step": 22790 }, { "epoch": 1.32, "grad_norm": 34.01899337768555, "learning_rate": 0.0005617090444676651, "logits/chosen": -14.880796432495117, "logits/rejected": -15.637219429016113, "logps/chosen": -2202.052490234375, "logps/rejected": -2190.755859375, "loss": 7.4166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.059326171875, "rewards/margins": 5.752934455871582, "rewards/rejected": -117.812255859375, "step": 22800 }, { "epoch": 1.32, "grad_norm": 68.44629669189453, "learning_rate": 0.0005615155385270327, "logits/chosen": -14.328104019165039, "logits/rejected": -14.344642639160156, "logps/chosen": -3028.4091796875, "logps/rejected": -2758.360107421875, "loss": 6.0737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.15277099609375, "rewards/margins": 8.223165512084961, "rewards/rejected": -138.37594604492188, "step": 22810 }, { "epoch": 1.32, "grad_norm": 47.249603271484375, "learning_rate": 0.0005613220325864004, "logits/chosen": -15.561958312988281, "logits/rejected": -15.497062683105469, "logps/chosen": -2147.333984375, "logps/rejected": -2082.573974609375, "loss": 17.4368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -146.83633422851562, "rewards/margins": 4.417839527130127, "rewards/rejected": -151.254150390625, "step": 22820 }, { "epoch": 1.32, "grad_norm": 92.41802215576172, "learning_rate": 0.000561128526645768, "logits/chosen": -18.19961929321289, "logits/rejected": -19.04079818725586, "logps/chosen": -2173.23876953125, "logps/rejected": -2204.014404296875, "loss": 5.9944, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -194.58665466308594, "rewards/margins": 0.033599089831113815, "rewards/rejected": -194.62025451660156, "step": 22830 }, { "epoch": 1.32, "grad_norm": 68.42790222167969, "learning_rate": 0.0005609350207051357, "logits/chosen": -15.150728225708008, "logits/rejected": -16.894826889038086, "logps/chosen": -2696.43798828125, "logps/rejected": -2539.95361328125, "loss": 1.868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.8881072998047, "rewards/margins": 14.735928535461426, "rewards/rejected": -159.62403869628906, "step": 22840 }, { "epoch": 1.32, "grad_norm": 80.83292388916016, "learning_rate": 0.0005607415147645033, "logits/chosen": -14.325876235961914, "logits/rejected": -14.780616760253906, "logps/chosen": -2380.540283203125, "logps/rejected": -2447.93310546875, "loss": 6.0862, "rewards/accuracies": 0.5, "rewards/chosen": -112.61177062988281, "rewards/margins": 15.992523193359375, "rewards/rejected": -128.60427856445312, "step": 22850 }, { "epoch": 1.32, "grad_norm": 38.494693756103516, "learning_rate": 0.0005605480088238709, "logits/chosen": -15.602869033813477, "logits/rejected": -16.393753051757812, "logps/chosen": -2363.285400390625, "logps/rejected": -2141.91845703125, "loss": 8.4049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -136.3437957763672, "rewards/margins": -1.7714965343475342, "rewards/rejected": -134.57229614257812, "step": 22860 }, { "epoch": 1.32, "grad_norm": 11.728919982910156, "learning_rate": 0.0005603545028832385, "logits/chosen": -14.819513320922852, "logits/rejected": -15.424448013305664, "logps/chosen": -2516.579345703125, "logps/rejected": -2608.616943359375, "loss": 1.3704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.76161193847656, "rewards/margins": 21.610599517822266, "rewards/rejected": -176.37220764160156, "step": 22870 }, { "epoch": 1.32, "grad_norm": 81.68346405029297, "learning_rate": 0.0005601609969426061, "logits/chosen": -16.97984504699707, "logits/rejected": -18.031291961669922, "logps/chosen": -2432.88818359375, "logps/rejected": -2509.393798828125, "loss": 19.9699, "rewards/accuracies": 0.5, "rewards/chosen": -204.55209350585938, "rewards/margins": -12.386578559875488, "rewards/rejected": -192.16549682617188, "step": 22880 }, { "epoch": 1.32, "grad_norm": 8.197445950123769e-15, "learning_rate": 0.0005599674910019738, "logits/chosen": -12.377670288085938, "logits/rejected": -14.375846862792969, "logps/chosen": -3127.168212890625, "logps/rejected": -3175.915771484375, "loss": 3.4077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -70.552978515625, "rewards/margins": 11.722644805908203, "rewards/rejected": -82.27561950683594, "step": 22890 }, { "epoch": 1.33, "grad_norm": 0.6685932278633118, "learning_rate": 0.0005597739850613414, "logits/chosen": -19.44005584716797, "logits/rejected": -19.26914405822754, "logps/chosen": -3026.54638671875, "logps/rejected": -3011.41455078125, "loss": 2.0193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -127.27525329589844, "rewards/margins": 6.455021858215332, "rewards/rejected": -133.73028564453125, "step": 22900 }, { "epoch": 1.33, "grad_norm": 0.0022892244160175323, "learning_rate": 0.000559580479120709, "logits/chosen": -14.550396919250488, "logits/rejected": -16.416013717651367, "logps/chosen": -2836.148193359375, "logps/rejected": -2409.96044921875, "loss": 3.6405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.83790588378906, "rewards/margins": 2.987802505493164, "rewards/rejected": -175.82568359375, "step": 22910 }, { "epoch": 1.33, "grad_norm": 88.94375610351562, "learning_rate": 0.0005593869731800766, "logits/chosen": -11.417874336242676, "logits/rejected": -12.880537033081055, "logps/chosen": -3424.815185546875, "logps/rejected": -3378.175048828125, "loss": 8.76, "rewards/accuracies": 0.5, "rewards/chosen": -135.50320434570312, "rewards/margins": -2.1314971446990967, "rewards/rejected": -133.37171936035156, "step": 22920 }, { "epoch": 1.33, "grad_norm": 0.0015017210971564054, "learning_rate": 0.0005591934672394443, "logits/chosen": -13.336979866027832, "logits/rejected": -14.355720520019531, "logps/chosen": -3290.199951171875, "logps/rejected": -2967.396728515625, "loss": 30.9488, "rewards/accuracies": 0.5, "rewards/chosen": -189.2379150390625, "rewards/margins": -26.670337677001953, "rewards/rejected": -162.5675811767578, "step": 22930 }, { "epoch": 1.33, "grad_norm": 383.8229675292969, "learning_rate": 0.000558999961298812, "logits/chosen": -11.690953254699707, "logits/rejected": -11.27747631072998, "logps/chosen": -2747.910400390625, "logps/rejected": -2258.324462890625, "loss": 10.9642, "rewards/accuracies": 0.5, "rewards/chosen": -146.28977966308594, "rewards/margins": -0.2696495056152344, "rewards/rejected": -146.0201416015625, "step": 22940 }, { "epoch": 1.33, "grad_norm": 0.285653680562973, "learning_rate": 0.0005588064553581796, "logits/chosen": -11.284184455871582, "logits/rejected": -11.382588386535645, "logps/chosen": -2626.48779296875, "logps/rejected": -2245.9365234375, "loss": 6.6055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -101.05297088623047, "rewards/margins": 10.08935260772705, "rewards/rejected": -111.14231872558594, "step": 22950 }, { "epoch": 1.33, "grad_norm": 4.061201082095067e-07, "learning_rate": 0.0005586129494175472, "logits/chosen": -12.523721694946289, "logits/rejected": -13.249710083007812, "logps/chosen": -2548.94921875, "logps/rejected": -2433.20361328125, "loss": 1.5099, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -84.87028503417969, "rewards/margins": 12.973739624023438, "rewards/rejected": -97.84402465820312, "step": 22960 }, { "epoch": 1.33, "grad_norm": 0.0001651170605327934, "learning_rate": 0.0005584194434769148, "logits/chosen": -15.262056350708008, "logits/rejected": -15.564114570617676, "logps/chosen": -2247.83251953125, "logps/rejected": -2110.220458984375, "loss": 15.9829, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -151.82720947265625, "rewards/margins": 4.897521018981934, "rewards/rejected": -156.7247314453125, "step": 22970 }, { "epoch": 1.33, "grad_norm": 4.161956787109375, "learning_rate": 0.0005582259375362824, "logits/chosen": -11.850296020507812, "logits/rejected": -12.344078063964844, "logps/chosen": -2885.76806640625, "logps/rejected": -2751.052490234375, "loss": 4.0955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -42.0959587097168, "rewards/margins": 20.674274444580078, "rewards/rejected": -62.770240783691406, "step": 22980 }, { "epoch": 1.33, "grad_norm": 153.6794891357422, "learning_rate": 0.00055803243159565, "logits/chosen": -12.75160026550293, "logits/rejected": -13.907989501953125, "logps/chosen": -2700.04833984375, "logps/rejected": -2711.642822265625, "loss": 12.294, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -135.4564208984375, "rewards/margins": -3.942265272140503, "rewards/rejected": -131.51416015625, "step": 22990 }, { "epoch": 1.33, "grad_norm": 181.57687377929688, "learning_rate": 0.0005578389256550175, "logits/chosen": -10.894556999206543, "logits/rejected": -10.899362564086914, "logps/chosen": -2514.84765625, "logps/rejected": -2499.91015625, "loss": 11.4774, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -92.06146240234375, "rewards/margins": -4.610286712646484, "rewards/rejected": -87.45117950439453, "step": 23000 }, { "epoch": 1.33, "grad_norm": 0.0020060634706169367, "learning_rate": 0.0005576454197143852, "logits/chosen": -15.7905912399292, "logits/rejected": -17.328960418701172, "logps/chosen": -2484.98876953125, "logps/rejected": -2083.385498046875, "loss": 41.5592, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -218.39859008789062, "rewards/margins": -39.10744094848633, "rewards/rejected": -179.2911376953125, "step": 23010 }, { "epoch": 1.33, "grad_norm": 0.007995354942977428, "learning_rate": 0.0005574519137737528, "logits/chosen": -13.02495002746582, "logits/rejected": -14.233210563659668, "logps/chosen": -2643.29248046875, "logps/rejected": -2634.416748046875, "loss": 6.0425, "rewards/accuracies": 0.5, "rewards/chosen": -144.59768676757812, "rewards/margins": 7.2685956954956055, "rewards/rejected": -151.8662872314453, "step": 23020 }, { "epoch": 1.33, "grad_norm": 118.6270523071289, "learning_rate": 0.0005572584078331205, "logits/chosen": -13.75114631652832, "logits/rejected": -13.722285270690918, "logps/chosen": -2664.2734375, "logps/rejected": -2814.31640625, "loss": 6.7453, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -126.37493896484375, "rewards/margins": 3.920259475708008, "rewards/rejected": -130.29519653320312, "step": 23030 }, { "epoch": 1.33, "grad_norm": 136.5928955078125, "learning_rate": 0.0005570649018924881, "logits/chosen": -12.90381145477295, "logits/rejected": -12.358647346496582, "logps/chosen": -2627.85546875, "logps/rejected": -2176.035888671875, "loss": 15.2519, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -172.4556427001953, "rewards/margins": -10.035066604614258, "rewards/rejected": -162.4205780029297, "step": 23040 }, { "epoch": 1.33, "grad_norm": 131.86766052246094, "learning_rate": 0.0005568713959518557, "logits/chosen": -11.574569702148438, "logits/rejected": -12.092161178588867, "logps/chosen": -2809.00732421875, "logps/rejected": -2563.91943359375, "loss": 24.7145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -134.055419921875, "rewards/margins": -21.426122665405273, "rewards/rejected": -112.62931060791016, "step": 23050 }, { "epoch": 1.33, "grad_norm": 91.91033172607422, "learning_rate": 0.0005566778900112234, "logits/chosen": -13.7204008102417, "logits/rejected": -14.95097541809082, "logps/chosen": -2604.386962890625, "logps/rejected": -2543.11279296875, "loss": 7.5716, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.77146911621094, "rewards/margins": -1.351580023765564, "rewards/rejected": -156.41989135742188, "step": 23060 }, { "epoch": 1.34, "grad_norm": 99.66211700439453, "learning_rate": 0.000556484384070591, "logits/chosen": -13.1013822555542, "logits/rejected": -14.020007133483887, "logps/chosen": -2519.67333984375, "logps/rejected": -2184.38427734375, "loss": 31.8701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.66387939453125, "rewards/margins": -23.0918025970459, "rewards/rejected": -151.57208251953125, "step": 23070 }, { "epoch": 1.34, "grad_norm": 78.2958984375, "learning_rate": 0.0005562908781299586, "logits/chosen": -14.288166999816895, "logits/rejected": -14.415580749511719, "logps/chosen": -2764.542724609375, "logps/rejected": -2643.61962890625, "loss": 14.3284, "rewards/accuracies": 0.5, "rewards/chosen": -131.3740692138672, "rewards/margins": -7.6131391525268555, "rewards/rejected": -123.76094818115234, "step": 23080 }, { "epoch": 1.34, "grad_norm": 288.8688659667969, "learning_rate": 0.0005560973721893262, "logits/chosen": -17.737194061279297, "logits/rejected": -17.851505279541016, "logps/chosen": -2472.00244140625, "logps/rejected": -2013.5728759765625, "loss": 14.8086, "rewards/accuracies": 0.5, "rewards/chosen": -162.03555297851562, "rewards/margins": -8.421455383300781, "rewards/rejected": -153.6140899658203, "step": 23090 }, { "epoch": 1.34, "grad_norm": 64.23622131347656, "learning_rate": 0.0005559038662486938, "logits/chosen": -14.27916431427002, "logits/rejected": -14.931047439575195, "logps/chosen": -2846.478271484375, "logps/rejected": -2447.76416015625, "loss": 10.5399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -144.59078979492188, "rewards/margins": -4.9480133056640625, "rewards/rejected": -139.64279174804688, "step": 23100 }, { "epoch": 1.34, "grad_norm": 0.004429974593222141, "learning_rate": 0.0005557103603080614, "logits/chosen": -15.70275592803955, "logits/rejected": -16.247037887573242, "logps/chosen": -2595.37255859375, "logps/rejected": -2822.86962890625, "loss": 9.2767, "rewards/accuracies": 0.5, "rewards/chosen": -210.8274383544922, "rewards/margins": -2.120605945587158, "rewards/rejected": -208.7068328857422, "step": 23110 }, { "epoch": 1.34, "grad_norm": 15.6221284866333, "learning_rate": 0.0005555168543674291, "logits/chosen": -16.959728240966797, "logits/rejected": -17.741058349609375, "logps/chosen": -2325.85986328125, "logps/rejected": -2256.325439453125, "loss": 6.7068, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -173.96145629882812, "rewards/margins": -3.9235987663269043, "rewards/rejected": -170.03787231445312, "step": 23120 }, { "epoch": 1.34, "grad_norm": 3.77378938765105e-07, "learning_rate": 0.0005553233484267967, "logits/chosen": -15.699501037597656, "logits/rejected": -15.022488594055176, "logps/chosen": -2840.200927734375, "logps/rejected": -2394.54052734375, "loss": 2.3166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.30471801757812, "rewards/margins": 5.0883588790893555, "rewards/rejected": -160.39309692382812, "step": 23130 }, { "epoch": 1.34, "grad_norm": 211.95632934570312, "learning_rate": 0.0005551298424861644, "logits/chosen": -13.426844596862793, "logits/rejected": -14.412202835083008, "logps/chosen": -2937.48681640625, "logps/rejected": -2768.160400390625, "loss": 4.3945, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -152.543212890625, "rewards/margins": -2.3008437156677246, "rewards/rejected": -150.24237060546875, "step": 23140 }, { "epoch": 1.34, "grad_norm": 52.685482025146484, "learning_rate": 0.000554936336545532, "logits/chosen": -14.693380355834961, "logits/rejected": -15.838827133178711, "logps/chosen": -2912.103759765625, "logps/rejected": -2641.01123046875, "loss": 3.4618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.81175231933594, "rewards/margins": 11.158854484558105, "rewards/rejected": -123.9706039428711, "step": 23150 }, { "epoch": 1.34, "grad_norm": 73.73099517822266, "learning_rate": 0.0005547428306048996, "logits/chosen": -14.680730819702148, "logits/rejected": -14.582052230834961, "logps/chosen": -2618.69384765625, "logps/rejected": -2652.614990234375, "loss": 9.1673, "rewards/accuracies": 0.5, "rewards/chosen": -130.5796356201172, "rewards/margins": -2.985985517501831, "rewards/rejected": -127.59366607666016, "step": 23160 }, { "epoch": 1.34, "grad_norm": 34.830562591552734, "learning_rate": 0.0005545493246642673, "logits/chosen": -15.621946334838867, "logits/rejected": -16.006420135498047, "logps/chosen": -2879.331787109375, "logps/rejected": -2418.227783203125, "loss": 1.3312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.6980743408203, "rewards/margins": 30.913869857788086, "rewards/rejected": -160.61195373535156, "step": 23170 }, { "epoch": 1.34, "grad_norm": 91.64801025390625, "learning_rate": 0.0005543558187236349, "logits/chosen": -13.824258804321289, "logits/rejected": -13.888348579406738, "logps/chosen": -2526.80615234375, "logps/rejected": -2519.25830078125, "loss": 4.7723, "rewards/accuracies": 0.5, "rewards/chosen": -122.648681640625, "rewards/margins": 1.2871675491333008, "rewards/rejected": -123.93583679199219, "step": 23180 }, { "epoch": 1.34, "grad_norm": 167.30311584472656, "learning_rate": 0.0005541623127830025, "logits/chosen": -12.586461067199707, "logits/rejected": -12.251716613769531, "logps/chosen": -2532.1875, "logps/rejected": -2877.491943359375, "loss": 10.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.03225708007812, "rewards/margins": -4.4868645668029785, "rewards/rejected": -169.54539489746094, "step": 23190 }, { "epoch": 1.34, "grad_norm": 81.08430480957031, "learning_rate": 0.0005539688068423701, "logits/chosen": -14.313039779663086, "logits/rejected": -17.038860321044922, "logps/chosen": -2495.287109375, "logps/rejected": -2270.795654296875, "loss": 6.6652, "rewards/accuracies": 0.5, "rewards/chosen": -156.72593688964844, "rewards/margins": -0.021484756842255592, "rewards/rejected": -156.70443725585938, "step": 23200 }, { "epoch": 1.34, "grad_norm": 2.5660910606384277, "learning_rate": 0.0005537753009017377, "logits/chosen": -17.119863510131836, "logits/rejected": -17.13646697998047, "logps/chosen": -2478.736083984375, "logps/rejected": -2597.765380859375, "loss": 5.182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.4816131591797, "rewards/margins": 5.474583625793457, "rewards/rejected": -193.95620727539062, "step": 23210 }, { "epoch": 1.34, "grad_norm": 88.31358337402344, "learning_rate": 0.0005535817949611052, "logits/chosen": -16.097965240478516, "logits/rejected": -15.619928359985352, "logps/chosen": -2819.64306640625, "logps/rejected": -2814.013671875, "loss": 1.7676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.90530395507812, "rewards/margins": 13.791661262512207, "rewards/rejected": -188.69696044921875, "step": 23220 }, { "epoch": 1.34, "grad_norm": 102.95519256591797, "learning_rate": 0.0005533882890204728, "logits/chosen": -13.708511352539062, "logits/rejected": -13.5299711227417, "logps/chosen": -2967.984619140625, "logps/rejected": -2558.198486328125, "loss": 2.9509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -81.21531677246094, "rewards/margins": 25.6571102142334, "rewards/rejected": -106.87242126464844, "step": 23230 }, { "epoch": 1.35, "grad_norm": 93.50779724121094, "learning_rate": 0.0005531947830798406, "logits/chosen": -12.710087776184082, "logits/rejected": -12.593472480773926, "logps/chosen": -3236.73779296875, "logps/rejected": -2579.93798828125, "loss": 5.9047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -89.91273498535156, "rewards/margins": 6.097216606140137, "rewards/rejected": -96.00994873046875, "step": 23240 }, { "epoch": 1.35, "grad_norm": 37.13460159301758, "learning_rate": 0.0005530012771392082, "logits/chosen": -15.956518173217773, "logits/rejected": -16.465072631835938, "logps/chosen": -2702.06982421875, "logps/rejected": -2585.387939453125, "loss": 9.9323, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -189.02320861816406, "rewards/margins": -7.838682651519775, "rewards/rejected": -181.18453979492188, "step": 23250 }, { "epoch": 1.35, "grad_norm": 19.596698760986328, "learning_rate": 0.0005528077711985758, "logits/chosen": -15.221771240234375, "logits/rejected": -15.670324325561523, "logps/chosen": -2548.401611328125, "logps/rejected": -2035.0025634765625, "loss": 4.0716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.7497100830078, "rewards/margins": 2.9911296367645264, "rewards/rejected": -157.74082946777344, "step": 23260 }, { "epoch": 1.35, "grad_norm": 62.60250473022461, "learning_rate": 0.0005526142652579434, "logits/chosen": -13.460871696472168, "logits/rejected": -13.454312324523926, "logps/chosen": -2718.09423828125, "logps/rejected": -2100.85888671875, "loss": 33.4835, "rewards/accuracies": 0.5, "rewards/chosen": -134.52188110351562, "rewards/margins": -26.649755477905273, "rewards/rejected": -107.87211608886719, "step": 23270 }, { "epoch": 1.35, "grad_norm": 5.411992970039137e-05, "learning_rate": 0.000552420759317311, "logits/chosen": -12.500711441040039, "logits/rejected": -12.622926712036133, "logps/chosen": -2911.78515625, "logps/rejected": -2717.06201171875, "loss": 4.6319, "rewards/accuracies": 0.5, "rewards/chosen": -78.36256408691406, "rewards/margins": 5.717834949493408, "rewards/rejected": -84.08039855957031, "step": 23280 }, { "epoch": 1.35, "grad_norm": 17.3946590423584, "learning_rate": 0.0005522272533766787, "logits/chosen": -14.726524353027344, "logits/rejected": -14.95495319366455, "logps/chosen": -2288.307373046875, "logps/rejected": -2329.16748046875, "loss": 5.329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.664794921875, "rewards/margins": 9.28834056854248, "rewards/rejected": -124.9531478881836, "step": 23290 }, { "epoch": 1.35, "grad_norm": 4.546981334686279, "learning_rate": 0.0005520337474360463, "logits/chosen": -15.089678764343262, "logits/rejected": -15.117042541503906, "logps/chosen": -2776.2666015625, "logps/rejected": -2298.30078125, "loss": 12.6913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -110.23663330078125, "rewards/margins": -1.0832515954971313, "rewards/rejected": -109.15339660644531, "step": 23300 }, { "epoch": 1.35, "grad_norm": 0.0009452112717553973, "learning_rate": 0.0005518402414954139, "logits/chosen": -13.562784194946289, "logits/rejected": -13.823644638061523, "logps/chosen": -2326.4921875, "logps/rejected": -2121.219970703125, "loss": 6.0705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -104.8656997680664, "rewards/margins": 8.893195152282715, "rewards/rejected": -113.75889587402344, "step": 23310 }, { "epoch": 1.35, "grad_norm": 165.1488800048828, "learning_rate": 0.0005516467355547815, "logits/chosen": -13.14856243133545, "logits/rejected": -13.089553833007812, "logps/chosen": -2751.14990234375, "logps/rejected": -2769.51806640625, "loss": 10.6873, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -101.5804443359375, "rewards/margins": -4.090031623840332, "rewards/rejected": -97.49040985107422, "step": 23320 }, { "epoch": 1.35, "grad_norm": 3.6575938793248497e-06, "learning_rate": 0.0005514532296141491, "logits/chosen": -13.775525093078613, "logits/rejected": -13.931413650512695, "logps/chosen": -2530.675048828125, "logps/rejected": -2338.30908203125, "loss": 8.8474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.92555236816406, "rewards/margins": -1.0271918773651123, "rewards/rejected": -128.89837646484375, "step": 23330 }, { "epoch": 1.35, "grad_norm": 0.050875432789325714, "learning_rate": 0.0005512597236735167, "logits/chosen": -14.582693099975586, "logits/rejected": -14.83057689666748, "logps/chosen": -2402.41650390625, "logps/rejected": -2180.47119140625, "loss": 15.8684, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.48397827148438, "rewards/margins": -12.2151460647583, "rewards/rejected": -152.26882934570312, "step": 23340 }, { "epoch": 1.35, "grad_norm": 50.44412612915039, "learning_rate": 0.0005510662177328845, "logits/chosen": -15.517443656921387, "logits/rejected": -16.224933624267578, "logps/chosen": -2747.8330078125, "logps/rejected": -2529.759033203125, "loss": 21.972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.5568389892578, "rewards/margins": -15.582807540893555, "rewards/rejected": -152.97402954101562, "step": 23350 }, { "epoch": 1.35, "grad_norm": 2.0394798411871307e-05, "learning_rate": 0.0005508727117922521, "logits/chosen": -12.120999336242676, "logits/rejected": -12.383130073547363, "logps/chosen": -2605.368896484375, "logps/rejected": -2395.45263671875, "loss": 14.1325, "rewards/accuracies": 0.5, "rewards/chosen": -66.24501037597656, "rewards/margins": -5.423909664154053, "rewards/rejected": -60.82111358642578, "step": 23360 }, { "epoch": 1.35, "grad_norm": 80.65132141113281, "learning_rate": 0.0005506792058516197, "logits/chosen": -16.283510208129883, "logits/rejected": -17.116832733154297, "logps/chosen": -2748.615234375, "logps/rejected": -2649.81201171875, "loss": 1.9621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -186.09024047851562, "rewards/margins": 9.523155212402344, "rewards/rejected": -195.61337280273438, "step": 23370 }, { "epoch": 1.35, "grad_norm": 12.196099281311035, "learning_rate": 0.0005504856999109873, "logits/chosen": -17.31742286682129, "logits/rejected": -17.254039764404297, "logps/chosen": -2391.661865234375, "logps/rejected": -2287.93798828125, "loss": 1.9096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.3845977783203, "rewards/margins": 5.031390190124512, "rewards/rejected": -163.4159698486328, "step": 23380 }, { "epoch": 1.35, "grad_norm": 106.58132934570312, "learning_rate": 0.0005502921939703549, "logits/chosen": -17.88938331604004, "logits/rejected": -18.32402992248535, "logps/chosen": -2609.522705078125, "logps/rejected": -2439.626220703125, "loss": 9.9253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.74244689941406, "rewards/margins": 3.2766005992889404, "rewards/rejected": -158.01902770996094, "step": 23390 }, { "epoch": 1.35, "grad_norm": 0.15030857920646667, "learning_rate": 0.0005500986880297226, "logits/chosen": -18.55906105041504, "logits/rejected": -18.353452682495117, "logps/chosen": -2593.351318359375, "logps/rejected": -2612.35107421875, "loss": 2.6709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.711181640625, "rewards/margins": 7.8142290115356445, "rewards/rejected": -196.52542114257812, "step": 23400 }, { "epoch": 1.36, "grad_norm": 3.0379336823926906e-09, "learning_rate": 0.0005499051820890902, "logits/chosen": -19.28061294555664, "logits/rejected": -19.525325775146484, "logps/chosen": -2694.19580078125, "logps/rejected": -2557.52587890625, "loss": 18.032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.45364379882812, "rewards/margins": -7.75496768951416, "rewards/rejected": -158.6986541748047, "step": 23410 }, { "epoch": 1.36, "grad_norm": 0.05565020814538002, "learning_rate": 0.0005497116761484578, "logits/chosen": -20.45462989807129, "logits/rejected": -20.200904846191406, "logps/chosen": -2786.870361328125, "logps/rejected": -2655.241943359375, "loss": 2.802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -187.33175659179688, "rewards/margins": 9.700399398803711, "rewards/rejected": -197.03216552734375, "step": 23420 }, { "epoch": 1.36, "grad_norm": 198.4499969482422, "learning_rate": 0.0005495181702078253, "logits/chosen": -16.623380661010742, "logits/rejected": -17.480392456054688, "logps/chosen": -3099.54638671875, "logps/rejected": -2179.27978515625, "loss": 39.2213, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.87319946289062, "rewards/margins": -26.556941986083984, "rewards/rejected": -105.3162612915039, "step": 23430 }, { "epoch": 1.36, "grad_norm": 18.331666946411133, "learning_rate": 0.0005493246642671929, "logits/chosen": -18.78205108642578, "logits/rejected": -19.12198829650879, "logps/chosen": -2933.73291015625, "logps/rejected": -2728.97705078125, "loss": 15.0996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.08779907226562, "rewards/margins": -0.6299015283584595, "rewards/rejected": -145.45790100097656, "step": 23440 }, { "epoch": 1.36, "grad_norm": 2.4243707136094436e-15, "learning_rate": 0.0005491311583265606, "logits/chosen": -20.80333709716797, "logits/rejected": -21.8508243560791, "logps/chosen": -2838.643798828125, "logps/rejected": -2766.4755859375, "loss": 3.8025, "rewards/accuracies": 0.5, "rewards/chosen": -248.7236328125, "rewards/margins": 4.58721923828125, "rewards/rejected": -253.31082153320312, "step": 23450 }, { "epoch": 1.36, "grad_norm": 3.237342902431828e-12, "learning_rate": 0.0005489376523859283, "logits/chosen": -17.623685836791992, "logits/rejected": -17.34805679321289, "logps/chosen": -2649.57666015625, "logps/rejected": -2296.1845703125, "loss": 22.1765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.18754577636719, "rewards/margins": -5.2635040283203125, "rewards/rejected": -114.9240493774414, "step": 23460 }, { "epoch": 1.36, "grad_norm": 107.83214569091797, "learning_rate": 0.0005487441464452959, "logits/chosen": -16.326139450073242, "logits/rejected": -16.580080032348633, "logps/chosen": -2696.71044921875, "logps/rejected": -2739.123291015625, "loss": 5.3496, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -132.50619506835938, "rewards/margins": 0.584896445274353, "rewards/rejected": -133.09109497070312, "step": 23470 }, { "epoch": 1.36, "grad_norm": 1.2269423758232145e-12, "learning_rate": 0.0005485506405046635, "logits/chosen": -18.427288055419922, "logits/rejected": -19.401987075805664, "logps/chosen": -2919.177001953125, "logps/rejected": -2807.951171875, "loss": 5.7613, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.80087280273438, "rewards/margins": -0.5329216122627258, "rewards/rejected": -182.26797485351562, "step": 23480 }, { "epoch": 1.36, "grad_norm": 1.9701705544772352e-10, "learning_rate": 0.0005483571345640311, "logits/chosen": -15.854669570922852, "logits/rejected": -15.85582447052002, "logps/chosen": -2470.3271484375, "logps/rejected": -2575.904541015625, "loss": 4.4261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.50743103027344, "rewards/margins": 13.544572830200195, "rewards/rejected": -191.052001953125, "step": 23490 }, { "epoch": 1.36, "grad_norm": 90.4687728881836, "learning_rate": 0.0005481636286233987, "logits/chosen": -11.742128372192383, "logits/rejected": -11.498510360717773, "logps/chosen": -2649.35888671875, "logps/rejected": -2551.69091796875, "loss": 5.7622, "rewards/accuracies": 0.5, "rewards/chosen": -141.88134765625, "rewards/margins": 1.748531699180603, "rewards/rejected": -143.62989807128906, "step": 23500 }, { "epoch": 1.36, "grad_norm": 6.700205267407e-05, "learning_rate": 0.0005479701226827663, "logits/chosen": -11.590456008911133, "logits/rejected": -11.732889175415039, "logps/chosen": -2819.198974609375, "logps/rejected": -2813.80908203125, "loss": 3.6436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -75.77903747558594, "rewards/margins": 7.246789455413818, "rewards/rejected": -83.02583312988281, "step": 23510 }, { "epoch": 1.36, "grad_norm": 67.22844696044922, "learning_rate": 0.000547776616742134, "logits/chosen": -13.567484855651855, "logits/rejected": -13.798004150390625, "logps/chosen": -2285.379638671875, "logps/rejected": -2364.491455078125, "loss": 6.6451, "rewards/accuracies": 0.5, "rewards/chosen": -140.82496643066406, "rewards/margins": 3.2480545043945312, "rewards/rejected": -144.0730438232422, "step": 23520 }, { "epoch": 1.36, "grad_norm": 46.868003845214844, "learning_rate": 0.0005475831108015016, "logits/chosen": -16.48805809020996, "logits/rejected": -19.15460205078125, "logps/chosen": -2599.04541015625, "logps/rejected": -2524.0703125, "loss": 5.7201, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -217.5113983154297, "rewards/margins": 4.544984340667725, "rewards/rejected": -222.05636596679688, "step": 23530 }, { "epoch": 1.36, "grad_norm": 36.045440673828125, "learning_rate": 0.0005473896048608692, "logits/chosen": -14.87397575378418, "logits/rejected": -14.738962173461914, "logps/chosen": -2552.333251953125, "logps/rejected": -2351.594482421875, "loss": 4.6225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -131.6136016845703, "rewards/margins": 1.3666412830352783, "rewards/rejected": -132.98023986816406, "step": 23540 }, { "epoch": 1.36, "grad_norm": 3.575399205146823e-06, "learning_rate": 0.0005471960989202369, "logits/chosen": -20.034423828125, "logits/rejected": -20.17704200744629, "logps/chosen": -2348.0771484375, "logps/rejected": -2362.12451171875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -169.79409790039062, "rewards/margins": 14.909866333007812, "rewards/rejected": -184.70396423339844, "step": 23550 }, { "epoch": 1.36, "grad_norm": 0.00014470808673650026, "learning_rate": 0.0005470025929796045, "logits/chosen": -15.343358993530273, "logits/rejected": -15.619046211242676, "logps/chosen": -2693.94970703125, "logps/rejected": -2534.862548828125, "loss": 1.6258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -82.28480529785156, "rewards/margins": 23.36764144897461, "rewards/rejected": -105.6524429321289, "step": 23560 }, { "epoch": 1.36, "grad_norm": 1.7127894163131714, "learning_rate": 0.0005468090870389722, "logits/chosen": -19.990188598632812, "logits/rejected": -20.1585636138916, "logps/chosen": -2359.34033203125, "logps/rejected": -2397.615966796875, "loss": 4.0589, "rewards/accuracies": 0.5, "rewards/chosen": -211.03219604492188, "rewards/margins": 2.438474178314209, "rewards/rejected": -213.47067260742188, "step": 23570 }, { "epoch": 1.36, "grad_norm": 2.8103288793324133e-11, "learning_rate": 0.0005466155810983398, "logits/chosen": -12.014089584350586, "logits/rejected": -12.318575859069824, "logps/chosen": -2740.737548828125, "logps/rejected": -2743.811279296875, "loss": 14.5508, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.64327239990234, "rewards/margins": -6.4120192527771, "rewards/rejected": -82.23124694824219, "step": 23580 }, { "epoch": 1.37, "grad_norm": 30.490886688232422, "learning_rate": 0.0005464220751577074, "logits/chosen": -13.829252243041992, "logits/rejected": -14.271402359008789, "logps/chosen": -2573.52099609375, "logps/rejected": -2627.060546875, "loss": 6.1429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.1852569580078, "rewards/margins": 4.651213645935059, "rewards/rejected": -162.83648681640625, "step": 23590 }, { "epoch": 1.37, "grad_norm": 0.0, "learning_rate": 0.000546228569217075, "logits/chosen": -15.298457145690918, "logits/rejected": -15.656898498535156, "logps/chosen": -2720.95166015625, "logps/rejected": -2511.108154296875, "loss": 9.4537, "rewards/accuracies": 0.5, "rewards/chosen": -126.34178161621094, "rewards/margins": 10.171919822692871, "rewards/rejected": -136.5137176513672, "step": 23600 }, { "epoch": 1.37, "grad_norm": 0.00044682223233394325, "learning_rate": 0.0005460350632764426, "logits/chosen": -13.302688598632812, "logits/rejected": -13.499903678894043, "logps/chosen": -2774.477294921875, "logps/rejected": -2564.9404296875, "loss": 0.1407, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -138.33535766601562, "rewards/margins": 16.595731735229492, "rewards/rejected": -154.93109130859375, "step": 23610 }, { "epoch": 1.37, "grad_norm": 4.03281519290033e-10, "learning_rate": 0.0005458415573358102, "logits/chosen": -14.076042175292969, "logits/rejected": -13.890909194946289, "logps/chosen": -2463.703125, "logps/rejected": -2300.0400390625, "loss": 16.1384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.62008666992188, "rewards/margins": -6.077277183532715, "rewards/rejected": -144.54278564453125, "step": 23620 }, { "epoch": 1.37, "grad_norm": 96.08427429199219, "learning_rate": 0.0005456480513951779, "logits/chosen": -14.720937728881836, "logits/rejected": -15.780891418457031, "logps/chosen": -3092.744384765625, "logps/rejected": -2463.238037109375, "loss": 30.8523, "rewards/accuracies": 0.5, "rewards/chosen": -160.7108612060547, "rewards/margins": -27.920345306396484, "rewards/rejected": -132.79052734375, "step": 23630 }, { "epoch": 1.37, "grad_norm": 109.38690948486328, "learning_rate": 0.0005454545454545455, "logits/chosen": -16.19162368774414, "logits/rejected": -16.38900375366211, "logps/chosen": -2734.9755859375, "logps/rejected": -2465.77294921875, "loss": 16.5156, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.31593322753906, "rewards/margins": -11.509918212890625, "rewards/rejected": -144.80599975585938, "step": 23640 }, { "epoch": 1.37, "grad_norm": 34.85596466064453, "learning_rate": 0.000545261039513913, "logits/chosen": -21.269994735717773, "logits/rejected": -22.119482040405273, "logps/chosen": -2631.098388671875, "logps/rejected": -2491.048095703125, "loss": 19.8246, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -197.90306091308594, "rewards/margins": -5.973258972167969, "rewards/rejected": -191.9298095703125, "step": 23650 }, { "epoch": 1.37, "grad_norm": 5.268802165985107, "learning_rate": 0.0005450675335732807, "logits/chosen": -18.684494018554688, "logits/rejected": -18.502368927001953, "logps/chosen": -2649.30712890625, "logps/rejected": -2450.754150390625, "loss": 10.8944, "rewards/accuracies": 0.5, "rewards/chosen": -164.99667358398438, "rewards/margins": 1.9662796258926392, "rewards/rejected": -166.96295166015625, "step": 23660 }, { "epoch": 1.37, "grad_norm": 0.05512610822916031, "learning_rate": 0.0005448740276326483, "logits/chosen": -19.019041061401367, "logits/rejected": -19.12312126159668, "logps/chosen": -2442.29052734375, "logps/rejected": -2591.669921875, "loss": 5.2776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.86875915527344, "rewards/margins": 10.901298522949219, "rewards/rejected": -190.77005004882812, "step": 23670 }, { "epoch": 1.37, "grad_norm": 52.805904388427734, "learning_rate": 0.0005446805216920159, "logits/chosen": -17.014545440673828, "logits/rejected": -17.025129318237305, "logps/chosen": -2465.946533203125, "logps/rejected": -2681.022216796875, "loss": 6.7528, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -167.8300323486328, "rewards/margins": -3.2645397186279297, "rewards/rejected": -164.5654754638672, "step": 23680 }, { "epoch": 1.37, "grad_norm": 1.2318528778504542e-19, "learning_rate": 0.0005444870157513836, "logits/chosen": -16.440135955810547, "logits/rejected": -16.79428482055664, "logps/chosen": -2524.87255859375, "logps/rejected": -2749.886474609375, "loss": 10.5199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.6483383178711, "rewards/margins": 9.317354202270508, "rewards/rejected": -128.9656982421875, "step": 23690 }, { "epoch": 1.37, "grad_norm": 251.61434936523438, "learning_rate": 0.0005442935098107512, "logits/chosen": -16.943395614624023, "logits/rejected": -16.838411331176758, "logps/chosen": -2396.99658203125, "logps/rejected": -2625.661376953125, "loss": 9.3053, "rewards/accuracies": 0.5, "rewards/chosen": -124.5176773071289, "rewards/margins": 6.17914342880249, "rewards/rejected": -130.69680786132812, "step": 23700 }, { "epoch": 1.37, "grad_norm": 170.31141662597656, "learning_rate": 0.0005441000038701188, "logits/chosen": -12.089136123657227, "logits/rejected": -12.117259979248047, "logps/chosen": -3056.814208984375, "logps/rejected": -2943.942138671875, "loss": 12.9423, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -101.7036361694336, "rewards/margins": -5.266707897186279, "rewards/rejected": -96.43693542480469, "step": 23710 }, { "epoch": 1.37, "grad_norm": 0.00014408861170522869, "learning_rate": 0.0005439064979294864, "logits/chosen": -16.080257415771484, "logits/rejected": -14.978456497192383, "logps/chosen": -3031.199462890625, "logps/rejected": -2844.843994140625, "loss": 15.1353, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.3066864013672, "rewards/margins": -4.0670270919799805, "rewards/rejected": -187.2396697998047, "step": 23720 }, { "epoch": 1.37, "grad_norm": 0.00917602889239788, "learning_rate": 0.000543712991988854, "logits/chosen": -15.515767097473145, "logits/rejected": -15.627153396606445, "logps/chosen": -2778.23876953125, "logps/rejected": -2469.60986328125, "loss": 19.6327, "rewards/accuracies": 0.5, "rewards/chosen": -154.01568603515625, "rewards/margins": -15.291757583618164, "rewards/rejected": -138.72393798828125, "step": 23730 }, { "epoch": 1.37, "grad_norm": 14.0518217086792, "learning_rate": 0.0005435194860482216, "logits/chosen": -13.827980041503906, "logits/rejected": -13.703386306762695, "logps/chosen": -2835.773681640625, "logps/rejected": -2515.039794921875, "loss": 3.1739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.36004638671875, "rewards/margins": 12.06913948059082, "rewards/rejected": -158.42916870117188, "step": 23740 }, { "epoch": 1.37, "grad_norm": 71.23351287841797, "learning_rate": 0.0005433259801075893, "logits/chosen": -14.680575370788574, "logits/rejected": -14.82396125793457, "logps/chosen": -2689.37255859375, "logps/rejected": -2604.465576171875, "loss": 19.1549, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -136.89971923828125, "rewards/margins": -17.784862518310547, "rewards/rejected": -119.11485290527344, "step": 23750 }, { "epoch": 1.38, "grad_norm": 0.0002531068166717887, "learning_rate": 0.000543132474166957, "logits/chosen": -13.948896408081055, "logits/rejected": -13.924420356750488, "logps/chosen": -2887.14208984375, "logps/rejected": -2801.469482421875, "loss": 5.595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -114.14056396484375, "rewards/margins": 10.698995590209961, "rewards/rejected": -124.83956146240234, "step": 23760 }, { "epoch": 1.38, "grad_norm": 45.27812957763672, "learning_rate": 0.0005429389682263246, "logits/chosen": -15.067914962768555, "logits/rejected": -15.247653007507324, "logps/chosen": -3011.04833984375, "logps/rejected": -2626.72216796875, "loss": 3.8071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -192.424560546875, "rewards/margins": 0.9466297030448914, "rewards/rejected": -193.37118530273438, "step": 23770 }, { "epoch": 1.38, "grad_norm": 0.0006897278944961727, "learning_rate": 0.0005427454622856922, "logits/chosen": -17.881343841552734, "logits/rejected": -18.027978897094727, "logps/chosen": -2559.60205078125, "logps/rejected": -2595.1259765625, "loss": 10.0972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.02230834960938, "rewards/margins": 2.266529083251953, "rewards/rejected": -191.28884887695312, "step": 23780 }, { "epoch": 1.38, "grad_norm": 1.6143262726586727e-08, "learning_rate": 0.0005425519563450598, "logits/chosen": -13.74773120880127, "logits/rejected": -13.44264030456543, "logps/chosen": -3209.232666015625, "logps/rejected": -3263.494873046875, "loss": 3.5082, "rewards/accuracies": 0.5, "rewards/chosen": -188.32662963867188, "rewards/margins": 4.679683208465576, "rewards/rejected": -193.0063018798828, "step": 23790 }, { "epoch": 1.38, "grad_norm": 1.7277103081525524e-09, "learning_rate": 0.0005423584504044275, "logits/chosen": -12.140738487243652, "logits/rejected": -12.459060668945312, "logps/chosen": -3234.912109375, "logps/rejected": -3157.407958984375, "loss": 6.2811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -173.24444580078125, "rewards/margins": 4.5677971839904785, "rewards/rejected": -177.81222534179688, "step": 23800 }, { "epoch": 1.38, "grad_norm": 91.41607666015625, "learning_rate": 0.0005421649444637951, "logits/chosen": -13.383848190307617, "logits/rejected": -12.841384887695312, "logps/chosen": -3297.71142578125, "logps/rejected": -3151.825439453125, "loss": 14.1551, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -197.24864196777344, "rewards/margins": -5.463340759277344, "rewards/rejected": -191.78529357910156, "step": 23810 }, { "epoch": 1.38, "grad_norm": 23.84079933166504, "learning_rate": 0.0005419714385231627, "logits/chosen": -14.627660751342773, "logits/rejected": -14.720914840698242, "logps/chosen": -3120.596435546875, "logps/rejected": -2952.412841796875, "loss": 6.8887, "rewards/accuracies": 0.5, "rewards/chosen": -210.10757446289062, "rewards/margins": 0.2507511079311371, "rewards/rejected": -210.3583221435547, "step": 23820 }, { "epoch": 1.38, "grad_norm": 6.8757297412958e-05, "learning_rate": 0.0005417779325825303, "logits/chosen": -17.19331932067871, "logits/rejected": -16.778575897216797, "logps/chosen": -2869.88330078125, "logps/rejected": -3084.355224609375, "loss": 2.7692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.42274475097656, "rewards/margins": 7.3187994956970215, "rewards/rejected": -177.74154663085938, "step": 23830 }, { "epoch": 1.38, "grad_norm": 1.1979382179561071e-05, "learning_rate": 0.0005415844266418979, "logits/chosen": -15.418744087219238, "logits/rejected": -16.358585357666016, "logps/chosen": -3082.12890625, "logps/rejected": -3042.910888671875, "loss": 8.7955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.0681610107422, "rewards/margins": 5.164134979248047, "rewards/rejected": -204.2322998046875, "step": 23840 }, { "epoch": 1.38, "grad_norm": 0.03904884308576584, "learning_rate": 0.0005413909207012655, "logits/chosen": -16.587528228759766, "logits/rejected": -16.09902000427246, "logps/chosen": -2893.989990234375, "logps/rejected": -2995.67431640625, "loss": 4.5411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.18301391601562, "rewards/margins": 2.0986855030059814, "rewards/rejected": -211.2816619873047, "step": 23850 }, { "epoch": 1.38, "grad_norm": 3.6332348827272654e-05, "learning_rate": 0.0005411974147606332, "logits/chosen": -13.864156723022461, "logits/rejected": -13.672174453735352, "logps/chosen": -3075.08544921875, "logps/rejected": -2946.459228515625, "loss": 5.8187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.05772399902344, "rewards/margins": 4.088860988616943, "rewards/rejected": -171.14659118652344, "step": 23860 }, { "epoch": 1.38, "grad_norm": 61.13933563232422, "learning_rate": 0.0005410039088200009, "logits/chosen": -13.353909492492676, "logits/rejected": -13.031652450561523, "logps/chosen": -2862.607421875, "logps/rejected": -2676.40576171875, "loss": 9.6841, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -212.66256713867188, "rewards/margins": -6.996253967285156, "rewards/rejected": -205.66635131835938, "step": 23870 }, { "epoch": 1.38, "grad_norm": 3.5789067709401934e-08, "learning_rate": 0.0005408104028793684, "logits/chosen": -11.323631286621094, "logits/rejected": -11.46406364440918, "logps/chosen": -3059.223876953125, "logps/rejected": -3208.697509765625, "loss": 2.0776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.3066864013672, "rewards/margins": 9.10142707824707, "rewards/rejected": -172.40811157226562, "step": 23880 }, { "epoch": 1.38, "grad_norm": 111.42375946044922, "learning_rate": 0.000540616896938736, "logits/chosen": -10.327512741088867, "logits/rejected": -10.496676445007324, "logps/chosen": -3118.10693359375, "logps/rejected": -2557.15087890625, "loss": 21.7301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.3213348388672, "rewards/margins": -7.9846391677856445, "rewards/rejected": -146.33670043945312, "step": 23890 }, { "epoch": 1.38, "grad_norm": 55.54536437988281, "learning_rate": 0.0005404233909981036, "logits/chosen": -13.838183403015137, "logits/rejected": -14.191556930541992, "logps/chosen": -2764.30517578125, "logps/rejected": -2498.14306640625, "loss": 18.1214, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -180.38832092285156, "rewards/margins": -12.084335327148438, "rewards/rejected": -168.30397033691406, "step": 23900 }, { "epoch": 1.38, "grad_norm": 57.374000549316406, "learning_rate": 0.0005402298850574712, "logits/chosen": -14.972200393676758, "logits/rejected": -16.35415267944336, "logps/chosen": -2797.634033203125, "logps/rejected": -2642.66650390625, "loss": 6.0459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.32156372070312, "rewards/margins": -2.6081466674804688, "rewards/rejected": -171.71340942382812, "step": 23910 }, { "epoch": 1.38, "grad_norm": 102.98866271972656, "learning_rate": 0.0005400363791168389, "logits/chosen": -18.642810821533203, "logits/rejected": -20.320003509521484, "logps/chosen": -2803.151611328125, "logps/rejected": -2597.206787109375, "loss": 17.4912, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -135.97349548339844, "rewards/margins": -10.236159324645996, "rewards/rejected": -125.73734283447266, "step": 23920 }, { "epoch": 1.39, "grad_norm": 4.065382957458496, "learning_rate": 0.0005398428731762065, "logits/chosen": -17.274587631225586, "logits/rejected": -18.435497283935547, "logps/chosen": -2362.148193359375, "logps/rejected": -2060.07861328125, "loss": 13.9429, "rewards/accuracies": 0.5, "rewards/chosen": -157.66958618164062, "rewards/margins": -8.50037956237793, "rewards/rejected": -149.169189453125, "step": 23930 }, { "epoch": 1.39, "grad_norm": 2.8188662781758467e-07, "learning_rate": 0.0005396493672355741, "logits/chosen": -17.62454605102539, "logits/rejected": -18.212923049926758, "logps/chosen": -2688.682861328125, "logps/rejected": -2656.54736328125, "loss": 4.834, "rewards/accuracies": 0.5, "rewards/chosen": -129.68283081054688, "rewards/margins": 1.2468544244766235, "rewards/rejected": -130.92967224121094, "step": 23940 }, { "epoch": 1.39, "grad_norm": 78.20499420166016, "learning_rate": 0.0005394558612949417, "logits/chosen": -15.356878280639648, "logits/rejected": -16.44415283203125, "logps/chosen": -2674.517578125, "logps/rejected": -2569.127685546875, "loss": 1.6462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.28536987304688, "rewards/margins": 17.5457706451416, "rewards/rejected": -188.83116149902344, "step": 23950 }, { "epoch": 1.39, "grad_norm": 0.0039720358327031136, "learning_rate": 0.0005392623553543093, "logits/chosen": -13.302226066589355, "logits/rejected": -13.333703994750977, "logps/chosen": -2954.33056640625, "logps/rejected": -2952.63623046875, "loss": 7.6864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.43489837646484, "rewards/margins": -2.751415729522705, "rewards/rejected": -105.68348693847656, "step": 23960 }, { "epoch": 1.39, "grad_norm": 50.303192138671875, "learning_rate": 0.0005390688494136771, "logits/chosen": -17.79387664794922, "logits/rejected": -18.906147003173828, "logps/chosen": -2366.052001953125, "logps/rejected": -2412.421142578125, "loss": 5.4725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -129.34881591796875, "rewards/margins": 0.2931075990200043, "rewards/rejected": -129.6419219970703, "step": 23970 }, { "epoch": 1.39, "grad_norm": 37.137168884277344, "learning_rate": 0.0005388753434730447, "logits/chosen": -16.7145938873291, "logits/rejected": -16.282352447509766, "logps/chosen": -2679.23779296875, "logps/rejected": -2756.893798828125, "loss": 6.3428, "rewards/accuracies": 0.5, "rewards/chosen": -125.43540954589844, "rewards/margins": 10.967005729675293, "rewards/rejected": -136.4024200439453, "step": 23980 }, { "epoch": 1.39, "grad_norm": 23.239131927490234, "learning_rate": 0.0005386818375324123, "logits/chosen": -19.257749557495117, "logits/rejected": -20.065860748291016, "logps/chosen": -2652.459716796875, "logps/rejected": -2269.20556640625, "loss": 1.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.5751953125, "rewards/margins": 3.7901711463928223, "rewards/rejected": -174.3653564453125, "step": 23990 }, { "epoch": 1.39, "grad_norm": 0.006309064570814371, "learning_rate": 0.0005384883315917799, "logits/chosen": -19.34821319580078, "logits/rejected": -19.276817321777344, "logps/chosen": -2807.873046875, "logps/rejected": -2644.545166015625, "loss": 14.6092, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -211.0284423828125, "rewards/margins": -9.723645210266113, "rewards/rejected": -201.30479431152344, "step": 24000 }, { "epoch": 1.39, "grad_norm": 0.05914174020290375, "learning_rate": 0.0005382948256511475, "logits/chosen": -16.222400665283203, "logits/rejected": -16.923931121826172, "logps/chosen": -3147.864013671875, "logps/rejected": -2535.43798828125, "loss": 4.7064, "rewards/accuracies": 0.5, "rewards/chosen": -127.14766693115234, "rewards/margins": 7.849204063415527, "rewards/rejected": -134.9968719482422, "step": 24010 }, { "epoch": 1.39, "grad_norm": 2.943743083960726e-06, "learning_rate": 0.0005381013197105151, "logits/chosen": -14.490602493286133, "logits/rejected": -14.359642028808594, "logps/chosen": -3011.753173828125, "logps/rejected": -2702.8212890625, "loss": 3.902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -89.55238342285156, "rewards/margins": 10.064637184143066, "rewards/rejected": -99.61701965332031, "step": 24020 }, { "epoch": 1.39, "grad_norm": 59.45996856689453, "learning_rate": 0.0005379078137698828, "logits/chosen": -16.781660079956055, "logits/rejected": -16.419490814208984, "logps/chosen": -3246.20751953125, "logps/rejected": -3174.977294921875, "loss": 7.6535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -212.2992706298828, "rewards/margins": 0.5579937100410461, "rewards/rejected": -212.8572540283203, "step": 24030 }, { "epoch": 1.39, "grad_norm": 3.649062163546546e-10, "learning_rate": 0.0005377143078292504, "logits/chosen": -18.617115020751953, "logits/rejected": -19.229541778564453, "logps/chosen": -2919.365478515625, "logps/rejected": -2971.458251953125, "loss": 4.7916, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -181.56100463867188, "rewards/margins": 7.102858543395996, "rewards/rejected": -188.66384887695312, "step": 24040 }, { "epoch": 1.39, "grad_norm": 0.000247389281867072, "learning_rate": 0.000537520801888618, "logits/chosen": -25.99820327758789, "logits/rejected": -27.865137100219727, "logps/chosen": -2881.86279296875, "logps/rejected": -2831.77587890625, "loss": 3.4848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -226.59848022460938, "rewards/margins": 17.082435607910156, "rewards/rejected": -243.68093872070312, "step": 24050 }, { "epoch": 1.39, "grad_norm": 0.6071951985359192, "learning_rate": 0.0005373272959479856, "logits/chosen": -17.421184539794922, "logits/rejected": -17.04397201538086, "logps/chosen": -2495.54052734375, "logps/rejected": -2341.50732421875, "loss": 0.8363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.03997802734375, "rewards/margins": 10.746923446655273, "rewards/rejected": -141.78689575195312, "step": 24060 }, { "epoch": 1.39, "grad_norm": 0.0, "learning_rate": 0.0005371337900073532, "logits/chosen": -15.462422370910645, "logits/rejected": -15.7633638381958, "logps/chosen": -2611.830078125, "logps/rejected": -1830.7972412109375, "loss": 3.7062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -87.11066436767578, "rewards/margins": 20.066696166992188, "rewards/rejected": -107.1773681640625, "step": 24070 }, { "epoch": 1.39, "grad_norm": 29.038930892944336, "learning_rate": 0.000536940284066721, "logits/chosen": -15.441680908203125, "logits/rejected": -15.65039348602295, "logps/chosen": -2792.018310546875, "logps/rejected": -2777.542236328125, "loss": 0.8875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.3192138671875, "rewards/margins": 13.402078628540039, "rewards/rejected": -143.72128295898438, "step": 24080 }, { "epoch": 1.39, "grad_norm": 87.60728454589844, "learning_rate": 0.0005367467781260885, "logits/chosen": -15.638517379760742, "logits/rejected": -15.397850036621094, "logps/chosen": -2920.994873046875, "logps/rejected": -2857.861083984375, "loss": 4.5637, "rewards/accuracies": 0.5, "rewards/chosen": -150.93576049804688, "rewards/margins": 8.05879020690918, "rewards/rejected": -158.99456787109375, "step": 24090 }, { "epoch": 1.39, "grad_norm": 42.11144256591797, "learning_rate": 0.0005365532721854561, "logits/chosen": -14.512277603149414, "logits/rejected": -15.089166641235352, "logps/chosen": -2912.692138671875, "logps/rejected": -2812.458984375, "loss": 30.2327, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -141.46243286132812, "rewards/margins": -28.25406265258789, "rewards/rejected": -113.2083740234375, "step": 24100 }, { "epoch": 1.4, "grad_norm": 0.39234283566474915, "learning_rate": 0.0005363597662448237, "logits/chosen": -13.937024116516113, "logits/rejected": -14.141672134399414, "logps/chosen": -2938.48095703125, "logps/rejected": -2849.31396484375, "loss": 12.8333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.73240661621094, "rewards/margins": -1.1522241830825806, "rewards/rejected": -147.58018493652344, "step": 24110 }, { "epoch": 1.4, "grad_norm": 43.371910095214844, "learning_rate": 0.0005361662603041913, "logits/chosen": -15.769978523254395, "logits/rejected": -16.196622848510742, "logps/chosen": -2565.30126953125, "logps/rejected": -2539.34716796875, "loss": 3.7566, "rewards/accuracies": 0.5, "rewards/chosen": -164.8447265625, "rewards/margins": 4.052135944366455, "rewards/rejected": -168.89686584472656, "step": 24120 }, { "epoch": 1.4, "grad_norm": 0.00016344807227142155, "learning_rate": 0.0005359727543635589, "logits/chosen": -12.704607963562012, "logits/rejected": -12.692279815673828, "logps/chosen": -2845.912109375, "logps/rejected": -2400.724365234375, "loss": 5.3313, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -143.44451904296875, "rewards/margins": -1.4167349338531494, "rewards/rejected": -142.02780151367188, "step": 24130 }, { "epoch": 1.4, "grad_norm": 60.098873138427734, "learning_rate": 0.0005357792484229266, "logits/chosen": -14.242886543273926, "logits/rejected": -14.635795593261719, "logps/chosen": -2890.7841796875, "logps/rejected": -2941.3369140625, "loss": 3.8186, "rewards/accuracies": 0.5, "rewards/chosen": -109.9770736694336, "rewards/margins": 5.579412937164307, "rewards/rejected": -115.55648040771484, "step": 24140 }, { "epoch": 1.4, "grad_norm": 1.3384436614582995e-10, "learning_rate": 0.0005355857424822942, "logits/chosen": -12.511390686035156, "logits/rejected": -12.63642406463623, "logps/chosen": -2960.699951171875, "logps/rejected": -2507.98583984375, "loss": 1.781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -122.81208801269531, "rewards/margins": 12.961817741394043, "rewards/rejected": -135.77389526367188, "step": 24150 }, { "epoch": 1.4, "grad_norm": 50.374664306640625, "learning_rate": 0.0005353922365416618, "logits/chosen": -14.286230087280273, "logits/rejected": -14.002934455871582, "logps/chosen": -2826.640869140625, "logps/rejected": -2392.12646484375, "loss": 14.7003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -183.5220947265625, "rewards/margins": -12.971208572387695, "rewards/rejected": -170.5509033203125, "step": 24160 }, { "epoch": 1.4, "grad_norm": 6.525156550196698e-06, "learning_rate": 0.0005351987306010294, "logits/chosen": -14.985681533813477, "logits/rejected": -16.002099990844727, "logps/chosen": -2784.055908203125, "logps/rejected": -2741.445068359375, "loss": 8.3908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -196.39263916015625, "rewards/margins": 0.40194472670555115, "rewards/rejected": -196.79458618164062, "step": 24170 }, { "epoch": 1.4, "grad_norm": 6.1344099044799805, "learning_rate": 0.0005350052246603971, "logits/chosen": -14.132211685180664, "logits/rejected": -14.335721015930176, "logps/chosen": -2655.76513671875, "logps/rejected": -2265.920654296875, "loss": 12.5422, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -129.40882873535156, "rewards/margins": -8.181900024414062, "rewards/rejected": -121.2269287109375, "step": 24180 }, { "epoch": 1.4, "grad_norm": 9.446505546569824, "learning_rate": 0.0005348117187197647, "logits/chosen": -15.416389465332031, "logits/rejected": -15.045595169067383, "logps/chosen": -2751.89306640625, "logps/rejected": -2489.957763671875, "loss": 3.4832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -93.76976013183594, "rewards/margins": 5.352324485778809, "rewards/rejected": -99.12208557128906, "step": 24190 }, { "epoch": 1.4, "grad_norm": 50.59552764892578, "learning_rate": 0.0005346182127791324, "logits/chosen": -11.889016151428223, "logits/rejected": -11.836904525756836, "logps/chosen": -2743.05126953125, "logps/rejected": -2244.5419921875, "loss": 5.1808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.86527633666992, "rewards/margins": 19.06686782836914, "rewards/rejected": -52.93214797973633, "step": 24200 }, { "epoch": 1.4, "grad_norm": 23.039527893066406, "learning_rate": 0.0005344247068385, "logits/chosen": -16.330142974853516, "logits/rejected": -16.234590530395508, "logps/chosen": -2527.06494140625, "logps/rejected": -2298.600341796875, "loss": 13.7977, "rewards/accuracies": 0.5, "rewards/chosen": -178.97970581054688, "rewards/margins": -7.268067359924316, "rewards/rejected": -171.71163940429688, "step": 24210 }, { "epoch": 1.4, "grad_norm": 0.0678672268986702, "learning_rate": 0.0005342312008978676, "logits/chosen": -14.165173530578613, "logits/rejected": -14.254838943481445, "logps/chosen": -2631.156494140625, "logps/rejected": -2667.43212890625, "loss": 6.1673, "rewards/accuracies": 0.5, "rewards/chosen": -153.9722137451172, "rewards/margins": -2.9332573413848877, "rewards/rejected": -151.03895568847656, "step": 24220 }, { "epoch": 1.4, "grad_norm": 2.1260897486996555e-09, "learning_rate": 0.0005340376949572352, "logits/chosen": -15.490262985229492, "logits/rejected": -15.962315559387207, "logps/chosen": -2582.703125, "logps/rejected": -2219.023681640625, "loss": 26.6258, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -197.74441528320312, "rewards/margins": -21.166383743286133, "rewards/rejected": -176.57806396484375, "step": 24230 }, { "epoch": 1.4, "grad_norm": 3.2136986297790097e-18, "learning_rate": 0.0005338441890166028, "logits/chosen": -15.038561820983887, "logits/rejected": -14.52764892578125, "logps/chosen": -2638.24560546875, "logps/rejected": -2298.730224609375, "loss": 7.944, "rewards/accuracies": 0.5, "rewards/chosen": -95.93245697021484, "rewards/margins": 4.809052467346191, "rewards/rejected": -100.74151611328125, "step": 24240 }, { "epoch": 1.4, "grad_norm": 4.276667594909668, "learning_rate": 0.0005336506830759704, "logits/chosen": -11.876466751098633, "logits/rejected": -11.798197746276855, "logps/chosen": -2954.368408203125, "logps/rejected": -2825.73095703125, "loss": 15.0798, "rewards/accuracies": 0.5, "rewards/chosen": -67.4316635131836, "rewards/margins": -11.258119583129883, "rewards/rejected": -56.173545837402344, "step": 24250 }, { "epoch": 1.4, "grad_norm": 0.085366390645504, "learning_rate": 0.0005334571771353381, "logits/chosen": -18.141799926757812, "logits/rejected": -19.25252342224121, "logps/chosen": -2723.75, "logps/rejected": -2573.85498046875, "loss": 19.2612, "rewards/accuracies": 0.5, "rewards/chosen": -207.68408203125, "rewards/margins": -13.67894172668457, "rewards/rejected": -194.00515747070312, "step": 24260 }, { "epoch": 1.4, "grad_norm": 12.160903930664062, "learning_rate": 0.0005332636711947057, "logits/chosen": -16.380327224731445, "logits/rejected": -17.02693748474121, "logps/chosen": -2787.938232421875, "logps/rejected": -2481.915771484375, "loss": 3.4867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.92913818359375, "rewards/margins": 2.688272476196289, "rewards/rejected": -144.61740112304688, "step": 24270 }, { "epoch": 1.41, "grad_norm": 2.2252634305532093e-13, "learning_rate": 0.0005330701652540733, "logits/chosen": -15.139425277709961, "logits/rejected": -16.45573616027832, "logps/chosen": -2912.228515625, "logps/rejected": -2438.973876953125, "loss": 6.6404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.2084732055664, "rewards/margins": 9.624444961547852, "rewards/rejected": -112.83292388916016, "step": 24280 }, { "epoch": 1.41, "grad_norm": 107.56838989257812, "learning_rate": 0.000532876659313441, "logits/chosen": -12.822189331054688, "logits/rejected": -12.495528221130371, "logps/chosen": -2592.386962890625, "logps/rejected": -2904.20458984375, "loss": 17.1074, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -70.83821868896484, "rewards/margins": -13.610989570617676, "rewards/rejected": -57.22722625732422, "step": 24290 }, { "epoch": 1.41, "grad_norm": 8.908580780029297, "learning_rate": 0.0005326831533728086, "logits/chosen": -14.46136474609375, "logits/rejected": -14.33268928527832, "logps/chosen": -2937.901611328125, "logps/rejected": -2836.296630859375, "loss": 8.298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.43465423583984, "rewards/margins": -1.207423448562622, "rewards/rejected": -108.22723388671875, "step": 24300 }, { "epoch": 1.41, "grad_norm": 52.06281280517578, "learning_rate": 0.0005324896474321762, "logits/chosen": -15.890498161315918, "logits/rejected": -16.053136825561523, "logps/chosen": -2580.2158203125, "logps/rejected": -2538.636474609375, "loss": 6.1441, "rewards/accuracies": 0.5, "rewards/chosen": -180.0286865234375, "rewards/margins": -1.7150284051895142, "rewards/rejected": -178.3136444091797, "step": 24310 }, { "epoch": 1.41, "grad_norm": 8.483026192607213e-08, "learning_rate": 0.0005322961414915438, "logits/chosen": -15.433247566223145, "logits/rejected": -16.182621002197266, "logps/chosen": -2878.443115234375, "logps/rejected": -2821.97607421875, "loss": 7.766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.197265625, "rewards/margins": 2.0142340660095215, "rewards/rejected": -147.2115020751953, "step": 24320 }, { "epoch": 1.41, "grad_norm": 0.013067037798464298, "learning_rate": 0.0005321026355509114, "logits/chosen": -13.279994010925293, "logits/rejected": -13.35815143585205, "logps/chosen": -3135.56640625, "logps/rejected": -3070.07861328125, "loss": 4.7588, "rewards/accuracies": 0.5, "rewards/chosen": -154.7169952392578, "rewards/margins": 3.1216156482696533, "rewards/rejected": -157.838623046875, "step": 24330 }, { "epoch": 1.41, "grad_norm": 1.2069848764895141e-07, "learning_rate": 0.000531909129610279, "logits/chosen": -19.696016311645508, "logits/rejected": -21.499616622924805, "logps/chosen": -2917.318359375, "logps/rejected": -2912.138427734375, "loss": 5.6113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.86386108398438, "rewards/margins": 4.894004821777344, "rewards/rejected": -189.7578582763672, "step": 24340 }, { "epoch": 1.41, "grad_norm": 9.498172760009766, "learning_rate": 0.0005317156236696466, "logits/chosen": -19.45499610900879, "logits/rejected": -21.139602661132812, "logps/chosen": -2307.868408203125, "logps/rejected": -2247.767822265625, "loss": 2.2578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -198.69911193847656, "rewards/margins": 9.224908828735352, "rewards/rejected": -207.9240264892578, "step": 24350 }, { "epoch": 1.41, "grad_norm": 133.2607879638672, "learning_rate": 0.0005315221177290142, "logits/chosen": -17.97540283203125, "logits/rejected": -18.633962631225586, "logps/chosen": -2775.94677734375, "logps/rejected": -2405.28564453125, "loss": 10.5348, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -204.64877319335938, "rewards/margins": -1.6140239238739014, "rewards/rejected": -203.03472900390625, "step": 24360 }, { "epoch": 1.41, "grad_norm": 52.15245819091797, "learning_rate": 0.0005313286117883819, "logits/chosen": -13.984889030456543, "logits/rejected": -14.089651107788086, "logps/chosen": -2982.25146484375, "logps/rejected": -2767.52294921875, "loss": 7.6033, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -126.39398193359375, "rewards/margins": -4.963696479797363, "rewards/rejected": -121.43028259277344, "step": 24370 }, { "epoch": 1.41, "grad_norm": 0.0, "learning_rate": 0.0005311351058477495, "logits/chosen": -16.127580642700195, "logits/rejected": -16.4183292388916, "logps/chosen": -2644.8955078125, "logps/rejected": -2764.45263671875, "loss": 1.668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.57608032226562, "rewards/margins": 15.759747505187988, "rewards/rejected": -161.33584594726562, "step": 24380 }, { "epoch": 1.41, "grad_norm": 64.58900451660156, "learning_rate": 0.0005309415999071172, "logits/chosen": -15.162182807922363, "logits/rejected": -15.089727401733398, "logps/chosen": -2574.018310546875, "logps/rejected": -2488.90234375, "loss": 23.0016, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -127.80633544921875, "rewards/margins": -17.740520477294922, "rewards/rejected": -110.0658187866211, "step": 24390 }, { "epoch": 1.41, "grad_norm": 97.04133605957031, "learning_rate": 0.0005307480939664848, "logits/chosen": -15.02100658416748, "logits/rejected": -15.009228706359863, "logps/chosen": -2038.6136474609375, "logps/rejected": -2117.52099609375, "loss": 2.1504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.86634826660156, "rewards/margins": 7.291609764099121, "rewards/rejected": -158.157958984375, "step": 24400 }, { "epoch": 1.41, "grad_norm": 0.011182221584022045, "learning_rate": 0.0005305545880258524, "logits/chosen": -15.942037582397461, "logits/rejected": -15.887351989746094, "logps/chosen": -2630.507080078125, "logps/rejected": -2833.436279296875, "loss": 0.2087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.39840698242188, "rewards/margins": 28.6710205078125, "rewards/rejected": -220.06942749023438, "step": 24410 }, { "epoch": 1.41, "grad_norm": 2.1860010623931885, "learning_rate": 0.00053036108208522, "logits/chosen": -18.295433044433594, "logits/rejected": -19.04853057861328, "logps/chosen": -2587.70849609375, "logps/rejected": -2715.0361328125, "loss": 1.196, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -225.90298461914062, "rewards/margins": 21.373126983642578, "rewards/rejected": -247.27609252929688, "step": 24420 }, { "epoch": 1.41, "grad_norm": 2.311617208761163e-06, "learning_rate": 0.0005301675761445877, "logits/chosen": -13.341937065124512, "logits/rejected": -13.064623832702637, "logps/chosen": -3274.575439453125, "logps/rejected": -3105.63232421875, "loss": 9.4571, "rewards/accuracies": 0.5, "rewards/chosen": -104.72013854980469, "rewards/margins": -0.35195502638816833, "rewards/rejected": -104.36817932128906, "step": 24430 }, { "epoch": 1.41, "grad_norm": 3.5838568224200174e-11, "learning_rate": 0.0005299740702039553, "logits/chosen": -13.36225700378418, "logits/rejected": -13.344332695007324, "logps/chosen": -3001.605224609375, "logps/rejected": -2684.73681640625, "loss": 18.3763, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -196.39373779296875, "rewards/margins": -13.715861320495605, "rewards/rejected": -182.67788696289062, "step": 24440 }, { "epoch": 1.42, "grad_norm": 78.11144256591797, "learning_rate": 0.0005297805642633229, "logits/chosen": -13.87769889831543, "logits/rejected": -13.998739242553711, "logps/chosen": -2760.25927734375, "logps/rejected": -2950.886474609375, "loss": 3.2647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -195.2361297607422, "rewards/margins": 36.893280029296875, "rewards/rejected": -232.12942504882812, "step": 24450 }, { "epoch": 1.42, "grad_norm": 126.0875015258789, "learning_rate": 0.0005295870583226905, "logits/chosen": -10.18484878540039, "logits/rejected": -10.476022720336914, "logps/chosen": -2694.81298828125, "logps/rejected": -2385.572265625, "loss": 7.7888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -95.38710021972656, "rewards/margins": 12.598607063293457, "rewards/rejected": -107.98570251464844, "step": 24460 }, { "epoch": 1.42, "grad_norm": 6.488864898681641, "learning_rate": 0.0005293935523820581, "logits/chosen": -10.881797790527344, "logits/rejected": -11.00593376159668, "logps/chosen": -2640.10205078125, "logps/rejected": -2774.58447265625, "loss": 13.7096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.76177978515625, "rewards/margins": -5.239875793457031, "rewards/rejected": -126.52191162109375, "step": 24470 }, { "epoch": 1.42, "grad_norm": 0.0, "learning_rate": 0.0005292000464414258, "logits/chosen": -9.007013320922852, "logits/rejected": -9.766324043273926, "logps/chosen": -3199.86474609375, "logps/rejected": -3221.55615234375, "loss": 25.111, "rewards/accuracies": 0.5, "rewards/chosen": -180.65164184570312, "rewards/margins": -1.853308081626892, "rewards/rejected": -178.79832458496094, "step": 24480 }, { "epoch": 1.42, "grad_norm": 1.994321428355761e-05, "learning_rate": 0.0005290065405007934, "logits/chosen": -11.585241317749023, "logits/rejected": -11.6668119430542, "logps/chosen": -2928.141357421875, "logps/rejected": -2815.69970703125, "loss": 5.2374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -226.74111938476562, "rewards/margins": 6.062033176422119, "rewards/rejected": -232.8031463623047, "step": 24490 }, { "epoch": 1.42, "grad_norm": 219.256591796875, "learning_rate": 0.0005288130345601611, "logits/chosen": -12.553034782409668, "logits/rejected": -12.721640586853027, "logps/chosen": -2616.76025390625, "logps/rejected": -2627.76416015625, "loss": 19.8488, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -192.3821563720703, "rewards/margins": -12.625801086425781, "rewards/rejected": -179.75637817382812, "step": 24500 }, { "epoch": 1.42, "grad_norm": 6.467423372669145e-05, "learning_rate": 0.0005286195286195287, "logits/chosen": -14.613200187683105, "logits/rejected": -14.24921703338623, "logps/chosen": -2833.93505859375, "logps/rejected": -2765.19384765625, "loss": 8.9055, "rewards/accuracies": 0.5, "rewards/chosen": -216.9782257080078, "rewards/margins": -4.79235315322876, "rewards/rejected": -212.1858673095703, "step": 24510 }, { "epoch": 1.42, "grad_norm": 1.4626818867213842e-08, "learning_rate": 0.0005284260226788963, "logits/chosen": -14.663583755493164, "logits/rejected": -14.968015670776367, "logps/chosen": -2897.38916015625, "logps/rejected": -2788.0888671875, "loss": 3.7329, "rewards/accuracies": 0.5, "rewards/chosen": -196.9436492919922, "rewards/margins": 4.312732219696045, "rewards/rejected": -201.2563934326172, "step": 24520 }, { "epoch": 1.42, "grad_norm": 82.05342864990234, "learning_rate": 0.0005282325167382638, "logits/chosen": -14.48304271697998, "logits/rejected": -14.375950813293457, "logps/chosen": -2918.79443359375, "logps/rejected": -2804.217529296875, "loss": 7.0089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.19793701171875, "rewards/margins": 4.851439476013184, "rewards/rejected": -166.04937744140625, "step": 24530 }, { "epoch": 1.42, "grad_norm": 3.237874625483528e-05, "learning_rate": 0.0005280390107976315, "logits/chosen": -14.484346389770508, "logits/rejected": -14.657999038696289, "logps/chosen": -2749.339111328125, "logps/rejected": -2702.97119140625, "loss": 2.3771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.42625427246094, "rewards/margins": 4.6835737228393555, "rewards/rejected": -166.10983276367188, "step": 24540 }, { "epoch": 1.42, "grad_norm": 190.0686492919922, "learning_rate": 0.0005278455048569991, "logits/chosen": -15.17283821105957, "logits/rejected": -15.041089057922363, "logps/chosen": -2902.724365234375, "logps/rejected": -2733.272705078125, "loss": 18.3517, "rewards/accuracies": 0.5, "rewards/chosen": -177.70758056640625, "rewards/margins": -13.00169849395752, "rewards/rejected": -164.70587158203125, "step": 24550 }, { "epoch": 1.42, "grad_norm": 0.000789110956247896, "learning_rate": 0.0005276519989163667, "logits/chosen": -16.608522415161133, "logits/rejected": -16.553415298461914, "logps/chosen": -2598.80517578125, "logps/rejected": -2496.99169921875, "loss": 2.6252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -177.41920471191406, "rewards/margins": 26.384990692138672, "rewards/rejected": -203.80416870117188, "step": 24560 }, { "epoch": 1.42, "grad_norm": 0.0362095944583416, "learning_rate": 0.0005274584929757343, "logits/chosen": -14.80647087097168, "logits/rejected": -14.848971366882324, "logps/chosen": -2534.448974609375, "logps/rejected": -2579.556396484375, "loss": 2.5518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -105.51850891113281, "rewards/margins": 4.631704807281494, "rewards/rejected": -110.15020751953125, "step": 24570 }, { "epoch": 1.42, "grad_norm": 2.627528429031372, "learning_rate": 0.0005272649870351019, "logits/chosen": -16.156984329223633, "logits/rejected": -16.324390411376953, "logps/chosen": -3086.34130859375, "logps/rejected": -3013.91552734375, "loss": 15.0767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.7733917236328, "rewards/margins": -6.2106733322143555, "rewards/rejected": -183.5626983642578, "step": 24580 }, { "epoch": 1.42, "grad_norm": 0.0156551506370306, "learning_rate": 0.0005270714810944695, "logits/chosen": -17.089191436767578, "logits/rejected": -17.6933536529541, "logps/chosen": -2609.49462890625, "logps/rejected": -2154.48876953125, "loss": 27.1802, "rewards/accuracies": 0.5, "rewards/chosen": -119.92701721191406, "rewards/margins": -11.259891510009766, "rewards/rejected": -108.6671142578125, "step": 24590 }, { "epoch": 1.42, "grad_norm": 87.857177734375, "learning_rate": 0.0005268779751538373, "logits/chosen": -17.684810638427734, "logits/rejected": -18.38467025756836, "logps/chosen": -2944.030029296875, "logps/rejected": -2742.74853515625, "loss": 14.7905, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -211.8691864013672, "rewards/margins": -13.098226547241211, "rewards/rejected": -198.7709503173828, "step": 24600 }, { "epoch": 1.42, "grad_norm": 0.0016663159476593137, "learning_rate": 0.0005266844692132049, "logits/chosen": -18.839832305908203, "logits/rejected": -19.559335708618164, "logps/chosen": -2964.43359375, "logps/rejected": -2853.4892578125, "loss": 1.7388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -179.74122619628906, "rewards/margins": 16.63974952697754, "rewards/rejected": -196.3809814453125, "step": 24610 }, { "epoch": 1.43, "grad_norm": 4.82095584810808e-18, "learning_rate": 0.0005264909632725725, "logits/chosen": -16.283546447753906, "logits/rejected": -17.76873016357422, "logps/chosen": -2829.76416015625, "logps/rejected": -2723.188720703125, "loss": 12.1213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -99.49771881103516, "rewards/margins": 0.14922523498535156, "rewards/rejected": -99.64693450927734, "step": 24620 }, { "epoch": 1.43, "grad_norm": 0.9102115035057068, "learning_rate": 0.0005262974573319401, "logits/chosen": -18.60873031616211, "logits/rejected": -19.327831268310547, "logps/chosen": -2918.2392578125, "logps/rejected": -2914.711181640625, "loss": 6.874, "rewards/accuracies": 0.5, "rewards/chosen": -202.53512573242188, "rewards/margins": 0.9194687008857727, "rewards/rejected": -203.45462036132812, "step": 24630 }, { "epoch": 1.43, "grad_norm": 83.10449981689453, "learning_rate": 0.0005261039513913077, "logits/chosen": -19.982128143310547, "logits/rejected": -20.689651489257812, "logps/chosen": -2936.500244140625, "logps/rejected": -2789.489501953125, "loss": 6.7742, "rewards/accuracies": 0.5, "rewards/chosen": -164.50436401367188, "rewards/margins": -1.5692230463027954, "rewards/rejected": -162.93515014648438, "step": 24640 }, { "epoch": 1.43, "grad_norm": 7.486784133018432e-23, "learning_rate": 0.0005259104454506754, "logits/chosen": -15.493156433105469, "logits/rejected": -16.58525276184082, "logps/chosen": -2579.36474609375, "logps/rejected": -3219.40283203125, "loss": 21.5816, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.96746826171875, "rewards/margins": -15.158548355102539, "rewards/rejected": -141.80892944335938, "step": 24650 }, { "epoch": 1.43, "grad_norm": 662.0346069335938, "learning_rate": 0.000525716939510043, "logits/chosen": -13.342008590698242, "logits/rejected": -15.030858993530273, "logps/chosen": -2920.939697265625, "logps/rejected": -3002.505615234375, "loss": 10.8071, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -169.6688690185547, "rewards/margins": -3.307962417602539, "rewards/rejected": -166.36093139648438, "step": 24660 }, { "epoch": 1.43, "grad_norm": 60.394195556640625, "learning_rate": 0.0005255234335694106, "logits/chosen": -15.836874008178711, "logits/rejected": -15.357894897460938, "logps/chosen": -2862.952392578125, "logps/rejected": -2765.938232421875, "loss": 7.7195, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.5429229736328, "rewards/margins": 12.944955825805664, "rewards/rejected": -170.48788452148438, "step": 24670 }, { "epoch": 1.43, "grad_norm": 0.028736215084791183, "learning_rate": 0.0005253299276287782, "logits/chosen": -16.783292770385742, "logits/rejected": -16.69588851928711, "logps/chosen": -2409.443603515625, "logps/rejected": -2477.69775390625, "loss": 2.1737, "rewards/accuracies": 0.5, "rewards/chosen": -213.3378448486328, "rewards/margins": 2.768177032470703, "rewards/rejected": -216.1060028076172, "step": 24680 }, { "epoch": 1.43, "grad_norm": 57.84859848022461, "learning_rate": 0.0005251364216881458, "logits/chosen": -13.39826774597168, "logits/rejected": -13.370434761047363, "logps/chosen": -2590.68408203125, "logps/rejected": -2973.30908203125, "loss": 6.1551, "rewards/accuracies": 0.5, "rewards/chosen": -186.16323852539062, "rewards/margins": -1.736050009727478, "rewards/rejected": -184.4271697998047, "step": 24690 }, { "epoch": 1.43, "grad_norm": 2.346457428359372e-08, "learning_rate": 0.0005249429157475134, "logits/chosen": -12.919778823852539, "logits/rejected": -13.08355712890625, "logps/chosen": -2895.09716796875, "logps/rejected": -2806.13623046875, "loss": 14.5743, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -222.2392578125, "rewards/margins": -10.713756561279297, "rewards/rejected": -211.52548217773438, "step": 24700 }, { "epoch": 1.43, "grad_norm": 2.503157615661621, "learning_rate": 0.0005247494098068812, "logits/chosen": -11.233613014221191, "logits/rejected": -10.920829772949219, "logps/chosen": -2988.68603515625, "logps/rejected": -2565.224609375, "loss": 15.3461, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -191.95846557617188, "rewards/margins": -13.277127265930176, "rewards/rejected": -178.6813201904297, "step": 24710 }, { "epoch": 1.43, "grad_norm": 0.23197324573993683, "learning_rate": 0.0005245559038662488, "logits/chosen": -12.372451782226562, "logits/rejected": -12.760978698730469, "logps/chosen": -2549.244384765625, "logps/rejected": -2286.686767578125, "loss": 5.5466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.19960021972656, "rewards/margins": 2.0209038257598877, "rewards/rejected": -158.22052001953125, "step": 24720 }, { "epoch": 1.43, "grad_norm": 32.218772888183594, "learning_rate": 0.0005243623979256164, "logits/chosen": -12.575360298156738, "logits/rejected": -12.626911163330078, "logps/chosen": -2944.9501953125, "logps/rejected": -2435.50146484375, "loss": 9.0058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -121.96110534667969, "rewards/margins": -1.3905606269836426, "rewards/rejected": -120.570556640625, "step": 24730 }, { "epoch": 1.43, "grad_norm": 0.07233069837093353, "learning_rate": 0.000524168891984984, "logits/chosen": -11.562278747558594, "logits/rejected": -11.59621810913086, "logps/chosen": -2862.986328125, "logps/rejected": -2860.01513671875, "loss": 3.0905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.2115020751953, "rewards/margins": 3.071859836578369, "rewards/rejected": -137.28335571289062, "step": 24740 }, { "epoch": 1.43, "grad_norm": 111.98631286621094, "learning_rate": 0.0005239753860443515, "logits/chosen": -12.114093780517578, "logits/rejected": -12.327585220336914, "logps/chosen": -2659.37255859375, "logps/rejected": -2582.29931640625, "loss": 5.5675, "rewards/accuracies": 0.5, "rewards/chosen": -119.79130554199219, "rewards/margins": 11.542825698852539, "rewards/rejected": -131.33413696289062, "step": 24750 }, { "epoch": 1.43, "grad_norm": 0.02904621511697769, "learning_rate": 0.0005237818801037191, "logits/chosen": -12.562320709228516, "logits/rejected": -12.347081184387207, "logps/chosen": -2662.572265625, "logps/rejected": -2617.746337890625, "loss": 2.384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.99954223632812, "rewards/margins": 7.371853828430176, "rewards/rejected": -195.37139892578125, "step": 24760 }, { "epoch": 1.43, "grad_norm": 1.802374936232809e-05, "learning_rate": 0.0005235883741630868, "logits/chosen": -11.15786075592041, "logits/rejected": -11.638517379760742, "logps/chosen": -2753.08837890625, "logps/rejected": -2620.656005859375, "loss": 2.8357, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -148.7285614013672, "rewards/margins": 10.166500091552734, "rewards/rejected": -158.89505004882812, "step": 24770 }, { "epoch": 1.43, "grad_norm": 15.542980194091797, "learning_rate": 0.0005233948682224544, "logits/chosen": -14.335996627807617, "logits/rejected": -15.046793937683105, "logps/chosen": -2681.38134765625, "logps/rejected": -2765.48876953125, "loss": 1.0461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.46591186523438, "rewards/margins": 6.158263683319092, "rewards/rejected": -213.62417602539062, "step": 24780 }, { "epoch": 1.43, "grad_norm": 2.180621595471166e-05, "learning_rate": 0.000523201362281822, "logits/chosen": -12.722524642944336, "logits/rejected": -13.212820053100586, "logps/chosen": -2836.98388671875, "logps/rejected": -2794.623046875, "loss": 4.7827, "rewards/accuracies": 0.5, "rewards/chosen": -151.1420135498047, "rewards/margins": -1.4159924983978271, "rewards/rejected": -149.7260284423828, "step": 24790 }, { "epoch": 1.44, "grad_norm": 63.01487350463867, "learning_rate": 0.0005230078563411896, "logits/chosen": -12.984336853027344, "logits/rejected": -12.673467636108398, "logps/chosen": -3165.14892578125, "logps/rejected": -2776.979248046875, "loss": 28.6405, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -171.67442321777344, "rewards/margins": -21.12271499633789, "rewards/rejected": -150.55169677734375, "step": 24800 }, { "epoch": 1.44, "grad_norm": 0.004875713028013706, "learning_rate": 0.0005228143504005573, "logits/chosen": -14.832651138305664, "logits/rejected": -15.06372356414795, "logps/chosen": -2385.45556640625, "logps/rejected": -2294.880615234375, "loss": 13.4607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.12142944335938, "rewards/margins": -4.288449287414551, "rewards/rejected": -199.83297729492188, "step": 24810 }, { "epoch": 1.44, "grad_norm": 113.62139129638672, "learning_rate": 0.000522620844459925, "logits/chosen": -12.874394416809082, "logits/rejected": -13.33806324005127, "logps/chosen": -2656.96826171875, "logps/rejected": -2300.567138671875, "loss": 22.8402, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -139.5615692138672, "rewards/margins": -17.748817443847656, "rewards/rejected": -121.812744140625, "step": 24820 }, { "epoch": 1.44, "grad_norm": 8.969845421979983e-10, "learning_rate": 0.0005224273385192926, "logits/chosen": -12.979901313781738, "logits/rejected": -13.97840690612793, "logps/chosen": -2470.79736328125, "logps/rejected": -2157.46923828125, "loss": 35.4718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.22323608398438, "rewards/margins": -24.92574691772461, "rewards/rejected": -150.29747009277344, "step": 24830 }, { "epoch": 1.44, "grad_norm": 1.6888606832465847e-11, "learning_rate": 0.0005222338325786602, "logits/chosen": -11.998001098632812, "logits/rejected": -12.025887489318848, "logps/chosen": -2999.498046875, "logps/rejected": -2496.44189453125, "loss": 4.7243, "rewards/accuracies": 0.5, "rewards/chosen": -118.63725280761719, "rewards/margins": 6.913261413574219, "rewards/rejected": -125.550537109375, "step": 24840 }, { "epoch": 1.44, "grad_norm": 0.10396262258291245, "learning_rate": 0.0005220403266380278, "logits/chosen": -13.044784545898438, "logits/rejected": -13.034692764282227, "logps/chosen": -2169.053466796875, "logps/rejected": -2127.361328125, "loss": 7.7905, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.98091125488281, "rewards/margins": -4.268954753875732, "rewards/rejected": -121.71195983886719, "step": 24850 }, { "epoch": 1.44, "grad_norm": 0.0003008426574524492, "learning_rate": 0.0005218468206973954, "logits/chosen": -14.530024528503418, "logits/rejected": -15.258657455444336, "logps/chosen": -2672.259033203125, "logps/rejected": -2301.33203125, "loss": 23.0441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.1429443359375, "rewards/margins": -7.12780237197876, "rewards/rejected": -159.01516723632812, "step": 24860 }, { "epoch": 1.44, "grad_norm": 0.0, "learning_rate": 0.000521653314756763, "logits/chosen": -13.525344848632812, "logits/rejected": -14.218320846557617, "logps/chosen": -2530.69140625, "logps/rejected": -2644.966064453125, "loss": 8.8501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -114.9437484741211, "rewards/margins": 21.496740341186523, "rewards/rejected": -136.4404754638672, "step": 24870 }, { "epoch": 1.44, "grad_norm": 0.00023101974511519074, "learning_rate": 0.0005214598088161307, "logits/chosen": -13.107789993286133, "logits/rejected": -12.895973205566406, "logps/chosen": -2721.977783203125, "logps/rejected": -2165.20166015625, "loss": 1.5042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.21781158447266, "rewards/margins": 16.21462631225586, "rewards/rejected": -137.4324493408203, "step": 24880 }, { "epoch": 1.44, "grad_norm": 6.596057415008545, "learning_rate": 0.0005212663028754983, "logits/chosen": -14.48144817352295, "logits/rejected": -14.832847595214844, "logps/chosen": -2563.716796875, "logps/rejected": -2498.574951171875, "loss": 7.3155, "rewards/accuracies": 0.5, "rewards/chosen": -155.478759765625, "rewards/margins": -1.4869308471679688, "rewards/rejected": -153.99183654785156, "step": 24890 }, { "epoch": 1.44, "grad_norm": 53.33954620361328, "learning_rate": 0.0005210727969348659, "logits/chosen": -13.392074584960938, "logits/rejected": -13.309026718139648, "logps/chosen": -2411.39892578125, "logps/rejected": -2010.179931640625, "loss": 10.582, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -177.33787536621094, "rewards/margins": -7.653508186340332, "rewards/rejected": -169.68435668945312, "step": 24900 }, { "epoch": 1.44, "grad_norm": 1.3570733869983087e-07, "learning_rate": 0.0005208792909942335, "logits/chosen": -13.213749885559082, "logits/rejected": -13.132841110229492, "logps/chosen": -2778.55908203125, "logps/rejected": -2483.813232421875, "loss": 4.7565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.66207885742188, "rewards/margins": 0.8677719831466675, "rewards/rejected": -177.52984619140625, "step": 24910 }, { "epoch": 1.44, "grad_norm": 0.0060067796148359776, "learning_rate": 0.0005206857850536012, "logits/chosen": -11.105103492736816, "logits/rejected": -10.955390930175781, "logps/chosen": -2692.72119140625, "logps/rejected": -2555.269775390625, "loss": 2.0622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.18386840820312, "rewards/margins": 3.2576305866241455, "rewards/rejected": -164.44149780273438, "step": 24920 }, { "epoch": 1.44, "grad_norm": 45.919776916503906, "learning_rate": 0.0005204922791129689, "logits/chosen": -9.814379692077637, "logits/rejected": -9.928912162780762, "logps/chosen": -2615.830810546875, "logps/rejected": -2557.30322265625, "loss": 7.3908, "rewards/accuracies": 0.5, "rewards/chosen": -161.7264404296875, "rewards/margins": -1.4430453777313232, "rewards/rejected": -160.2834014892578, "step": 24930 }, { "epoch": 1.44, "grad_norm": 5.732040882110596, "learning_rate": 0.0005202987731723365, "logits/chosen": -11.042360305786133, "logits/rejected": -11.392278671264648, "logps/chosen": -2905.92333984375, "logps/rejected": -2809.38818359375, "loss": 11.8028, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -92.6852035522461, "rewards/margins": -7.38418436050415, "rewards/rejected": -85.30103302001953, "step": 24940 }, { "epoch": 1.44, "grad_norm": 0.054964903742074966, "learning_rate": 0.0005201052672317041, "logits/chosen": -10.65577220916748, "logits/rejected": -10.634347915649414, "logps/chosen": -2713.16748046875, "logps/rejected": -2414.29345703125, "loss": 7.8189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -66.69086456298828, "rewards/margins": 0.1752816140651703, "rewards/rejected": -66.86614227294922, "step": 24950 }, { "epoch": 1.44, "grad_norm": 64.3124008178711, "learning_rate": 0.0005199117612910717, "logits/chosen": -13.471273422241211, "logits/rejected": -13.464197158813477, "logps/chosen": -2136.451171875, "logps/rejected": -1848.6875, "loss": 24.416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.0420684814453, "rewards/margins": -20.603609085083008, "rewards/rejected": -135.43844604492188, "step": 24960 }, { "epoch": 1.45, "grad_norm": 1.8387048869620331e-19, "learning_rate": 0.0005197182553504392, "logits/chosen": -14.01356029510498, "logits/rejected": -13.82417106628418, "logps/chosen": -2771.2431640625, "logps/rejected": -2564.01123046875, "loss": 6.2019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -162.96322631835938, "rewards/margins": 1.2135238647460938, "rewards/rejected": -164.17674255371094, "step": 24970 }, { "epoch": 1.45, "grad_norm": 3.5905356248642443e-22, "learning_rate": 0.0005195247494098068, "logits/chosen": -13.396435737609863, "logits/rejected": -14.131428718566895, "logps/chosen": -2637.935302734375, "logps/rejected": -2646.19091796875, "loss": 2.8904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -105.72047424316406, "rewards/margins": 9.619401931762695, "rewards/rejected": -115.3398666381836, "step": 24980 }, { "epoch": 1.45, "grad_norm": 0.36661240458488464, "learning_rate": 0.0005193312434691744, "logits/chosen": -13.214445114135742, "logits/rejected": -13.048219680786133, "logps/chosen": -2444.175537109375, "logps/rejected": -2094.462646484375, "loss": 12.9215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -82.71328735351562, "rewards/margins": 5.687665939331055, "rewards/rejected": -88.40095520019531, "step": 24990 }, { "epoch": 1.45, "grad_norm": 0.05103134363889694, "learning_rate": 0.0005191377375285421, "logits/chosen": -14.582295417785645, "logits/rejected": -15.127279281616211, "logps/chosen": -2743.47705078125, "logps/rejected": -2741.614501953125, "loss": 5.7446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -198.44839477539062, "rewards/margins": 11.187280654907227, "rewards/rejected": -209.6356964111328, "step": 25000 }, { "epoch": 1.45, "grad_norm": 0.00041941372910514474, "learning_rate": 0.0005189442315879097, "logits/chosen": -12.797408103942871, "logits/rejected": -12.79761028289795, "logps/chosen": -2780.48681640625, "logps/rejected": -2723.76904296875, "loss": 1.1843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -123.8518295288086, "rewards/margins": 10.071246147155762, "rewards/rejected": -133.92306518554688, "step": 25010 }, { "epoch": 1.45, "grad_norm": 6.59729528427124, "learning_rate": 0.0005187507256472774, "logits/chosen": -12.284525871276855, "logits/rejected": -12.344091415405273, "logps/chosen": -2507.968994140625, "logps/rejected": -2409.314208984375, "loss": 4.033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -111.05592346191406, "rewards/margins": 3.6048405170440674, "rewards/rejected": -114.6607666015625, "step": 25020 }, { "epoch": 1.45, "grad_norm": 48.03073501586914, "learning_rate": 0.000518557219706645, "logits/chosen": -12.054925918579102, "logits/rejected": -12.174970626831055, "logps/chosen": -2749.52685546875, "logps/rejected": -2732.8388671875, "loss": 1.3671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.5792236328125, "rewards/margins": 8.140096664428711, "rewards/rejected": -112.71932220458984, "step": 25030 }, { "epoch": 1.45, "grad_norm": 0.008090578019618988, "learning_rate": 0.0005183637137660126, "logits/chosen": -12.097787857055664, "logits/rejected": -12.908040046691895, "logps/chosen": -2816.818603515625, "logps/rejected": -2821.588134765625, "loss": 0.5637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -135.95816040039062, "rewards/margins": 13.817492485046387, "rewards/rejected": -149.77565002441406, "step": 25040 }, { "epoch": 1.45, "grad_norm": 94.38026428222656, "learning_rate": 0.0005181702078253803, "logits/chosen": -13.723396301269531, "logits/rejected": -13.790153503417969, "logps/chosen": -2126.610107421875, "logps/rejected": -2313.1435546875, "loss": 19.5629, "rewards/accuracies": 0.5, "rewards/chosen": -131.63241577148438, "rewards/margins": 3.9271137714385986, "rewards/rejected": -135.55953979492188, "step": 25050 }, { "epoch": 1.45, "grad_norm": 102.97103118896484, "learning_rate": 0.0005179767018847479, "logits/chosen": -13.481069564819336, "logits/rejected": -13.513397216796875, "logps/chosen": -2780.96630859375, "logps/rejected": -2252.85693359375, "loss": 27.9613, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.2931365966797, "rewards/margins": -18.52384376525879, "rewards/rejected": -109.76930236816406, "step": 25060 }, { "epoch": 1.45, "grad_norm": 46.58636474609375, "learning_rate": 0.0005177831959441155, "logits/chosen": -13.550390243530273, "logits/rejected": -13.826667785644531, "logps/chosen": -2248.439208984375, "logps/rejected": -2255.9423828125, "loss": 2.6417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.26902770996094, "rewards/margins": 6.355033874511719, "rewards/rejected": -160.62405395507812, "step": 25070 }, { "epoch": 1.45, "grad_norm": 6.332049196089429e-08, "learning_rate": 0.0005175896900034831, "logits/chosen": -12.805688858032227, "logits/rejected": -13.319096565246582, "logps/chosen": -2923.298828125, "logps/rejected": -2701.225830078125, "loss": 23.5055, "rewards/accuracies": 0.5, "rewards/chosen": -134.5532684326172, "rewards/margins": -16.687477111816406, "rewards/rejected": -117.86579895019531, "step": 25080 }, { "epoch": 1.45, "grad_norm": 123.45120239257812, "learning_rate": 0.0005173961840628507, "logits/chosen": -18.246061325073242, "logits/rejected": -18.871234893798828, "logps/chosen": -2437.35107421875, "logps/rejected": -2248.24169921875, "loss": 15.4139, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -208.94235229492188, "rewards/margins": -11.126996040344238, "rewards/rejected": -197.81536865234375, "step": 25090 }, { "epoch": 1.45, "grad_norm": 8.360272829577298e-08, "learning_rate": 0.0005172026781222183, "logits/chosen": -14.094810485839844, "logits/rejected": -14.128886222839355, "logps/chosen": -2756.263427734375, "logps/rejected": -2800.420166015625, "loss": 1.4544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -77.02328491210938, "rewards/margins": 12.491029739379883, "rewards/rejected": -89.51432037353516, "step": 25100 }, { "epoch": 1.45, "grad_norm": 2.4709797799005173e-05, "learning_rate": 0.000517009172181586, "logits/chosen": -15.584689140319824, "logits/rejected": -16.49276351928711, "logps/chosen": -2669.8779296875, "logps/rejected": -3094.224609375, "loss": 13.6146, "rewards/accuracies": 0.5, "rewards/chosen": -103.1332778930664, "rewards/margins": -3.293099880218506, "rewards/rejected": -99.84016418457031, "step": 25110 }, { "epoch": 1.45, "grad_norm": 156.58578491210938, "learning_rate": 0.0005168156662409536, "logits/chosen": -18.91615104675293, "logits/rejected": -18.52254867553711, "logps/chosen": -3121.475341796875, "logps/rejected": -3041.10986328125, "loss": 15.4245, "rewards/accuracies": 0.5, "rewards/chosen": -206.1322021484375, "rewards/margins": -9.853071212768555, "rewards/rejected": -196.2791290283203, "step": 25120 }, { "epoch": 1.45, "grad_norm": 2.930685520172119, "learning_rate": 0.0005166221603003213, "logits/chosen": -16.40956687927246, "logits/rejected": -16.906951904296875, "logps/chosen": -3561.40576171875, "logps/rejected": -3135.43701171875, "loss": 10.5292, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.20504760742188, "rewards/margins": -3.8642361164093018, "rewards/rejected": -196.34078979492188, "step": 25130 }, { "epoch": 1.46, "grad_norm": 0.0005604025791399181, "learning_rate": 0.0005164286543596889, "logits/chosen": -17.773143768310547, "logits/rejected": -17.571353912353516, "logps/chosen": -3015.3271484375, "logps/rejected": -2808.721923828125, "loss": 13.4975, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -205.0640411376953, "rewards/margins": -10.720893859863281, "rewards/rejected": -194.34317016601562, "step": 25140 }, { "epoch": 1.46, "grad_norm": 55.76126480102539, "learning_rate": 0.0005162351484190565, "logits/chosen": -15.775744438171387, "logits/rejected": -15.744783401489258, "logps/chosen": -3126.83740234375, "logps/rejected": -3075.271728515625, "loss": 4.0909, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.20318603515625, "rewards/margins": 0.6395538449287415, "rewards/rejected": -166.8427276611328, "step": 25150 }, { "epoch": 1.46, "grad_norm": 88.936767578125, "learning_rate": 0.0005160416424784242, "logits/chosen": -19.802148818969727, "logits/rejected": -20.853790283203125, "logps/chosen": -2955.75, "logps/rejected": -2972.996826171875, "loss": 5.0289, "rewards/accuracies": 0.5, "rewards/chosen": -204.22842407226562, "rewards/margins": 4.139172077178955, "rewards/rejected": -208.3675994873047, "step": 25160 }, { "epoch": 1.46, "grad_norm": 0.01386198028922081, "learning_rate": 0.0005158481365377918, "logits/chosen": -14.068906784057617, "logits/rejected": -14.557233810424805, "logps/chosen": -3133.810302734375, "logps/rejected": -2720.15087890625, "loss": 5.7567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.13368225097656, "rewards/margins": 3.6291465759277344, "rewards/rejected": -137.7628173828125, "step": 25170 }, { "epoch": 1.46, "grad_norm": 5.493239640941283e-08, "learning_rate": 0.0005156546305971593, "logits/chosen": -14.498136520385742, "logits/rejected": -14.806056022644043, "logps/chosen": -3029.235595703125, "logps/rejected": -2730.98486328125, "loss": 4.5098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -146.8375244140625, "rewards/margins": 19.553752899169922, "rewards/rejected": -166.3912811279297, "step": 25180 }, { "epoch": 1.46, "grad_norm": 0.0, "learning_rate": 0.0005154611246565269, "logits/chosen": -16.52730941772461, "logits/rejected": -16.288084030151367, "logps/chosen": -2204.145751953125, "logps/rejected": -2353.258544921875, "loss": 3.2732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.65438842773438, "rewards/margins": 18.002161026000977, "rewards/rejected": -207.6565704345703, "step": 25190 }, { "epoch": 1.46, "grad_norm": 99.44931030273438, "learning_rate": 0.0005152676187158945, "logits/chosen": -14.867704391479492, "logits/rejected": -14.878091812133789, "logps/chosen": -2370.134521484375, "logps/rejected": -2389.53662109375, "loss": 5.203, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.47586059570312, "rewards/margins": 4.225289344787598, "rewards/rejected": -177.70114135742188, "step": 25200 }, { "epoch": 1.46, "grad_norm": 7.726814510533586e-05, "learning_rate": 0.0005150741127752621, "logits/chosen": -10.783918380737305, "logits/rejected": -10.729841232299805, "logps/chosen": -2498.042236328125, "logps/rejected": -2251.385009765625, "loss": 1.705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -80.09835815429688, "rewards/margins": 15.111242294311523, "rewards/rejected": -95.20960235595703, "step": 25210 }, { "epoch": 1.46, "grad_norm": 0.012210710905492306, "learning_rate": 0.0005148806068346298, "logits/chosen": -12.119331359863281, "logits/rejected": -12.329645156860352, "logps/chosen": -3085.402587890625, "logps/rejected": -2832.51806640625, "loss": 6.6691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.16627502441406, "rewards/margins": 5.04226016998291, "rewards/rejected": -162.20852661132812, "step": 25220 }, { "epoch": 1.46, "grad_norm": 4.653700358403512e-08, "learning_rate": 0.0005146871008939975, "logits/chosen": -13.770421981811523, "logits/rejected": -13.769343376159668, "logps/chosen": -2396.780029296875, "logps/rejected": -2480.7998046875, "loss": 2.2878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.59768676757812, "rewards/margins": 16.13669776916504, "rewards/rejected": -178.73439025878906, "step": 25230 }, { "epoch": 1.46, "grad_norm": 1.9047716856002808, "learning_rate": 0.0005144935949533651, "logits/chosen": -14.794177055358887, "logits/rejected": -15.040277481079102, "logps/chosen": -2729.93896484375, "logps/rejected": -2756.971923828125, "loss": 4.1644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -237.7235870361328, "rewards/margins": 6.0424346923828125, "rewards/rejected": -243.7660369873047, "step": 25240 }, { "epoch": 1.46, "grad_norm": 0.012673834338784218, "learning_rate": 0.0005143000890127327, "logits/chosen": -12.990850448608398, "logits/rejected": -13.144274711608887, "logps/chosen": -2575.9716796875, "logps/rejected": -2816.74365234375, "loss": 4.2795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -159.55015563964844, "rewards/margins": 11.12253475189209, "rewards/rejected": -170.6726837158203, "step": 25250 }, { "epoch": 1.46, "grad_norm": 38.016273498535156, "learning_rate": 0.0005141065830721003, "logits/chosen": -15.021400451660156, "logits/rejected": -15.71851921081543, "logps/chosen": -2603.7578125, "logps/rejected": -2578.660400390625, "loss": 2.9981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -225.7439422607422, "rewards/margins": 4.728871822357178, "rewards/rejected": -230.4728240966797, "step": 25260 }, { "epoch": 1.46, "grad_norm": 6.304919719696045, "learning_rate": 0.0005139130771314679, "logits/chosen": -15.303814888000488, "logits/rejected": -15.77624225616455, "logps/chosen": -2502.26416015625, "logps/rejected": -2778.181640625, "loss": 13.8052, "rewards/accuracies": 0.5, "rewards/chosen": -210.29183959960938, "rewards/margins": 2.1219093799591064, "rewards/rejected": -212.4137725830078, "step": 25270 }, { "epoch": 1.46, "grad_norm": 0.02474498748779297, "learning_rate": 0.0005137195711908356, "logits/chosen": -10.43603515625, "logits/rejected": -10.51298713684082, "logps/chosen": -2979.489013671875, "logps/rejected": -2903.215087890625, "loss": 8.1348, "rewards/accuracies": 0.5, "rewards/chosen": -86.70854187011719, "rewards/margins": -1.1007217168807983, "rewards/rejected": -85.6078109741211, "step": 25280 }, { "epoch": 1.46, "grad_norm": 67.6539077758789, "learning_rate": 0.0005135260652502032, "logits/chosen": -14.601037979125977, "logits/rejected": -14.489416122436523, "logps/chosen": -2791.549560546875, "logps/rejected": -2505.584716796875, "loss": 16.3081, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.63368225097656, "rewards/margins": -13.058874130249023, "rewards/rejected": -171.57479858398438, "step": 25290 }, { "epoch": 1.46, "grad_norm": 6.955245494842529, "learning_rate": 0.0005133325593095708, "logits/chosen": -16.335386276245117, "logits/rejected": -15.821817398071289, "logps/chosen": -2733.002685546875, "logps/rejected": -2598.170654296875, "loss": 3.831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.32125854492188, "rewards/margins": 5.333748817443848, "rewards/rejected": -170.655029296875, "step": 25300 }, { "epoch": 1.47, "grad_norm": 65.56172943115234, "learning_rate": 0.0005131390533689384, "logits/chosen": -14.903846740722656, "logits/rejected": -14.941621780395508, "logps/chosen": -2727.76708984375, "logps/rejected": -2502.876953125, "loss": 5.6629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.04104614257812, "rewards/margins": 8.14671802520752, "rewards/rejected": -169.18777465820312, "step": 25310 }, { "epoch": 1.47, "grad_norm": 83.15167236328125, "learning_rate": 0.000512945547428306, "logits/chosen": -13.023877143859863, "logits/rejected": -12.749277114868164, "logps/chosen": -2920.3388671875, "logps/rejected": -2722.106201171875, "loss": 3.2402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -125.64739990234375, "rewards/margins": 6.5456109046936035, "rewards/rejected": -132.19302368164062, "step": 25320 }, { "epoch": 1.47, "grad_norm": 4.996974780624441e-07, "learning_rate": 0.0005127520414876738, "logits/chosen": -14.2874116897583, "logits/rejected": -14.431096076965332, "logps/chosen": -2666.772216796875, "logps/rejected": -2245.636962890625, "loss": 4.1846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.68408203125, "rewards/margins": 6.3971052169799805, "rewards/rejected": -175.08119201660156, "step": 25330 }, { "epoch": 1.47, "grad_norm": 0.008317865431308746, "learning_rate": 0.0005125585355470414, "logits/chosen": -15.00049877166748, "logits/rejected": -15.006097793579102, "logps/chosen": -2499.64794921875, "logps/rejected": -2660.05419921875, "loss": 11.2832, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -204.9650421142578, "rewards/margins": -4.721395015716553, "rewards/rejected": -200.24363708496094, "step": 25340 }, { "epoch": 1.47, "grad_norm": 14.680657386779785, "learning_rate": 0.000512365029606409, "logits/chosen": -14.778753280639648, "logits/rejected": -15.246912956237793, "logps/chosen": -2672.177001953125, "logps/rejected": -2592.805419921875, "loss": 4.5838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -235.52792358398438, "rewards/margins": 1.8550094366073608, "rewards/rejected": -237.3829345703125, "step": 25350 }, { "epoch": 1.47, "grad_norm": 48.243045806884766, "learning_rate": 0.0005121715236657766, "logits/chosen": -13.1087064743042, "logits/rejected": -13.20594596862793, "logps/chosen": -3034.0078125, "logps/rejected": -2849.91259765625, "loss": 15.4119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -198.61183166503906, "rewards/margins": -9.787416458129883, "rewards/rejected": -188.8243865966797, "step": 25360 }, { "epoch": 1.47, "grad_norm": 23.57746696472168, "learning_rate": 0.0005119780177251442, "logits/chosen": -13.771995544433594, "logits/rejected": -13.461915969848633, "logps/chosen": -2702.01416015625, "logps/rejected": -2396.26708984375, "loss": 2.9997, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -165.50308227539062, "rewards/margins": 2.852912187576294, "rewards/rejected": -168.35598754882812, "step": 25370 }, { "epoch": 1.47, "grad_norm": 1.4815384149551392, "learning_rate": 0.0005117845117845118, "logits/chosen": -14.0335111618042, "logits/rejected": -14.083477973937988, "logps/chosen": -2553.11572265625, "logps/rejected": -2483.889404296875, "loss": 2.9389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -122.9701156616211, "rewards/margins": 7.0390801429748535, "rewards/rejected": -130.00918579101562, "step": 25380 }, { "epoch": 1.47, "grad_norm": 1.172456979751587, "learning_rate": 0.0005115910058438795, "logits/chosen": -15.554710388183594, "logits/rejected": -16.369325637817383, "logps/chosen": -2894.54052734375, "logps/rejected": -2709.54541015625, "loss": 14.1213, "rewards/accuracies": 0.5, "rewards/chosen": -179.7307586669922, "rewards/margins": -10.653130531311035, "rewards/rejected": -169.07765197753906, "step": 25390 }, { "epoch": 1.47, "grad_norm": 0.0034697894006967545, "learning_rate": 0.000511397499903247, "logits/chosen": -16.332460403442383, "logits/rejected": -16.07363510131836, "logps/chosen": -2485.18017578125, "logps/rejected": -2103.62548828125, "loss": 33.6965, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -210.4722900390625, "rewards/margins": -29.31182289123535, "rewards/rejected": -181.1604461669922, "step": 25400 }, { "epoch": 1.47, "grad_norm": 0.0, "learning_rate": 0.0005112039939626146, "logits/chosen": -15.644421577453613, "logits/rejected": -15.674074172973633, "logps/chosen": -2217.365234375, "logps/rejected": -2328.39892578125, "loss": 3.1529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.23635864257812, "rewards/margins": 13.577451705932617, "rewards/rejected": -202.81381225585938, "step": 25410 }, { "epoch": 1.47, "grad_norm": 8.291661262512207, "learning_rate": 0.0005110104880219822, "logits/chosen": -12.624677658081055, "logits/rejected": -12.695817947387695, "logps/chosen": -2883.23974609375, "logps/rejected": -2510.441162109375, "loss": 13.2157, "rewards/accuracies": 0.5, "rewards/chosen": -72.6899642944336, "rewards/margins": -7.805311679840088, "rewards/rejected": -64.88465118408203, "step": 25420 }, { "epoch": 1.47, "grad_norm": 1.7617378489376279e-06, "learning_rate": 0.0005108169820813498, "logits/chosen": -16.757238388061523, "logits/rejected": -16.809497833251953, "logps/chosen": -2003.3375244140625, "logps/rejected": -2229.810791015625, "loss": 2.1724, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.98399353027344, "rewards/margins": 21.2160701751709, "rewards/rejected": -193.2000732421875, "step": 25430 }, { "epoch": 1.47, "grad_norm": 0.15410833060741425, "learning_rate": 0.0005106234761407175, "logits/chosen": -15.888505935668945, "logits/rejected": -15.791841506958008, "logps/chosen": -2654.886474609375, "logps/rejected": -2509.266845703125, "loss": 4.2654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -149.76242065429688, "rewards/margins": 9.143136978149414, "rewards/rejected": -158.90554809570312, "step": 25440 }, { "epoch": 1.47, "grad_norm": 8.07660944701638e-06, "learning_rate": 0.0005104299702000852, "logits/chosen": -15.147331237792969, "logits/rejected": -15.450884819030762, "logps/chosen": -2503.84423828125, "logps/rejected": -2475.170654296875, "loss": 4.1306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.41659545898438, "rewards/margins": 14.508206367492676, "rewards/rejected": -159.9248046875, "step": 25450 }, { "epoch": 1.47, "grad_norm": 0.0, "learning_rate": 0.0005102364642594528, "logits/chosen": -12.328811645507812, "logits/rejected": -12.325850486755371, "logps/chosen": -2595.62841796875, "logps/rejected": -2430.6630859375, "loss": 8.0223, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -78.24974060058594, "rewards/margins": 1.1427078247070312, "rewards/rejected": -79.39244079589844, "step": 25460 }, { "epoch": 1.47, "grad_norm": 40.849403381347656, "learning_rate": 0.0005100429583188204, "logits/chosen": -12.412518501281738, "logits/rejected": -12.146799087524414, "logps/chosen": -2538.11376953125, "logps/rejected": -2420.20751953125, "loss": 19.4403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -93.89869689941406, "rewards/margins": -5.932398796081543, "rewards/rejected": -87.96630096435547, "step": 25470 }, { "epoch": 1.47, "grad_norm": 56.55830383300781, "learning_rate": 0.000509849452378188, "logits/chosen": -13.599111557006836, "logits/rejected": -13.472188949584961, "logps/chosen": -2915.817138671875, "logps/rejected": -2414.282470703125, "loss": 6.0843, "rewards/accuracies": 0.5, "rewards/chosen": -145.85153198242188, "rewards/margins": 2.168621778488159, "rewards/rejected": -148.02015686035156, "step": 25480 }, { "epoch": 1.48, "grad_norm": 66.43904113769531, "learning_rate": 0.0005096559464375556, "logits/chosen": -18.073068618774414, "logits/rejected": -17.225082397460938, "logps/chosen": -2682.32080078125, "logps/rejected": -2116.76611328125, "loss": 38.0799, "rewards/accuracies": 0.5, "rewards/chosen": -162.58111572265625, "rewards/margins": -22.79201889038086, "rewards/rejected": -139.78909301757812, "step": 25490 }, { "epoch": 1.48, "grad_norm": 104.5379409790039, "learning_rate": 0.0005094624404969232, "logits/chosen": -14.49590015411377, "logits/rejected": -14.380514144897461, "logps/chosen": -2669.373779296875, "logps/rejected": -2246.32763671875, "loss": 22.9707, "rewards/accuracies": 0.5, "rewards/chosen": -114.44169616699219, "rewards/margins": -17.414203643798828, "rewards/rejected": -97.02749633789062, "step": 25500 }, { "epoch": 1.48, "grad_norm": 2.08865476452047e-05, "learning_rate": 0.0005092689345562909, "logits/chosen": -15.261670112609863, "logits/rejected": -15.294267654418945, "logps/chosen": -2778.820068359375, "logps/rejected": -2545.478759765625, "loss": 13.2034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.72525024414062, "rewards/margins": -4.88641881942749, "rewards/rejected": -165.83883666992188, "step": 25510 }, { "epoch": 1.48, "grad_norm": 44.20393753051758, "learning_rate": 0.0005090754286156585, "logits/chosen": -15.638879776000977, "logits/rejected": -15.684186935424805, "logps/chosen": -2594.62109375, "logps/rejected": -2631.681396484375, "loss": 4.2532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.3020782470703, "rewards/margins": 4.877816677093506, "rewards/rejected": -149.17990112304688, "step": 25520 }, { "epoch": 1.48, "grad_norm": 7.559034565929323e-05, "learning_rate": 0.0005088819226750261, "logits/chosen": -17.664989471435547, "logits/rejected": -18.70656967163086, "logps/chosen": -2677.65966796875, "logps/rejected": -2478.53955078125, "loss": 25.1153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.39425659179688, "rewards/margins": -14.422947883605957, "rewards/rejected": -148.9713134765625, "step": 25530 }, { "epoch": 1.48, "grad_norm": 0.0010621713008731604, "learning_rate": 0.0005086884167343938, "logits/chosen": -12.987419128417969, "logits/rejected": -13.075350761413574, "logps/chosen": -3120.56982421875, "logps/rejected": -2769.408935546875, "loss": 2.7702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -90.00244140625, "rewards/margins": 16.84012794494629, "rewards/rejected": -106.84256744384766, "step": 25540 }, { "epoch": 1.48, "grad_norm": 4.62277364730835, "learning_rate": 0.0005084949107937614, "logits/chosen": -18.924448013305664, "logits/rejected": -18.575862884521484, "logps/chosen": -2727.06005859375, "logps/rejected": -2757.102783203125, "loss": 5.4266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -169.0304718017578, "rewards/margins": -3.420144557952881, "rewards/rejected": -165.61032104492188, "step": 25550 }, { "epoch": 1.48, "grad_norm": 0.10131532698869705, "learning_rate": 0.0005083014048531291, "logits/chosen": -15.16998291015625, "logits/rejected": -15.10124397277832, "logps/chosen": -2751.64111328125, "logps/rejected": -2446.27490234375, "loss": 11.9039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.27108764648438, "rewards/margins": -5.490694999694824, "rewards/rejected": -152.78038024902344, "step": 25560 }, { "epoch": 1.48, "grad_norm": 2.0846471215918427e-07, "learning_rate": 0.0005081078989124967, "logits/chosen": -15.443063735961914, "logits/rejected": -15.493861198425293, "logps/chosen": -2622.506591796875, "logps/rejected": -2634.021484375, "loss": 0.2402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -181.2206573486328, "rewards/margins": 13.365072250366211, "rewards/rejected": -194.58572387695312, "step": 25570 }, { "epoch": 1.48, "grad_norm": 69.25985717773438, "learning_rate": 0.0005079143929718643, "logits/chosen": -15.821131706237793, "logits/rejected": -15.852360725402832, "logps/chosen": -2954.95751953125, "logps/rejected": -3004.333984375, "loss": 5.7556, "rewards/accuracies": 0.5, "rewards/chosen": -187.3557586669922, "rewards/margins": 5.859733581542969, "rewards/rejected": -193.2154998779297, "step": 25580 }, { "epoch": 1.48, "grad_norm": 0.010467678308486938, "learning_rate": 0.0005077208870312319, "logits/chosen": -20.124893188476562, "logits/rejected": -20.95660400390625, "logps/chosen": -2581.44287109375, "logps/rejected": -2573.033447265625, "loss": 17.3166, "rewards/accuracies": 0.5, "rewards/chosen": -159.5516357421875, "rewards/margins": 3.4624714851379395, "rewards/rejected": -163.01412963867188, "step": 25590 }, { "epoch": 1.48, "grad_norm": 0.35957878828048706, "learning_rate": 0.0005075273810905995, "logits/chosen": -14.734003067016602, "logits/rejected": -14.7918062210083, "logps/chosen": -2576.5361328125, "logps/rejected": -2655.823486328125, "loss": 16.3027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.67091369628906, "rewards/margins": 1.6348152160644531, "rewards/rejected": -110.30574035644531, "step": 25600 }, { "epoch": 1.48, "grad_norm": 68.69920349121094, "learning_rate": 0.0005073338751499671, "logits/chosen": -16.8751277923584, "logits/rejected": -16.757017135620117, "logps/chosen": -2738.340576171875, "logps/rejected": -2696.30712890625, "loss": 15.2185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -202.3041534423828, "rewards/margins": 2.1014914512634277, "rewards/rejected": -204.4056396484375, "step": 25610 }, { "epoch": 1.48, "grad_norm": 3.672579765319824, "learning_rate": 0.0005071403692093347, "logits/chosen": -17.612096786499023, "logits/rejected": -17.617530822753906, "logps/chosen": -2190.799072265625, "logps/rejected": -2172.233154296875, "loss": 3.8913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.4022216796875, "rewards/margins": 1.463202714920044, "rewards/rejected": -155.8654327392578, "step": 25620 }, { "epoch": 1.48, "grad_norm": 169.95376586914062, "learning_rate": 0.0005069468632687023, "logits/chosen": -16.02220916748047, "logits/rejected": -16.107494354248047, "logps/chosen": -2672.358642578125, "logps/rejected": -2591.6669921875, "loss": 13.039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -111.54505920410156, "rewards/margins": 2.8175036907196045, "rewards/rejected": -114.3625717163086, "step": 25630 }, { "epoch": 1.48, "grad_norm": 54.03392791748047, "learning_rate": 0.0005067533573280699, "logits/chosen": -17.554052352905273, "logits/rejected": -17.312267303466797, "logps/chosen": -2681.40869140625, "logps/rejected": -2509.095703125, "loss": 13.0687, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -169.61245727539062, "rewards/margins": -9.303303718566895, "rewards/rejected": -160.3091583251953, "step": 25640 }, { "epoch": 1.48, "grad_norm": 103.18496704101562, "learning_rate": 0.0005065598513874376, "logits/chosen": -18.464393615722656, "logits/rejected": -18.382415771484375, "logps/chosen": -2815.724365234375, "logps/rejected": -2843.07470703125, "loss": 2.5296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.79293823242188, "rewards/margins": 9.675416946411133, "rewards/rejected": -219.46835327148438, "step": 25650 }, { "epoch": 1.49, "grad_norm": 0.0014797068433836102, "learning_rate": 0.0005063663454468052, "logits/chosen": -15.345911979675293, "logits/rejected": -15.583932876586914, "logps/chosen": -2765.660888671875, "logps/rejected": -2740.944580078125, "loss": 6.0829, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -175.36485290527344, "rewards/margins": -3.5820517539978027, "rewards/rejected": -171.7827911376953, "step": 25660 }, { "epoch": 1.49, "grad_norm": 0.40548592805862427, "learning_rate": 0.0005061728395061728, "logits/chosen": -16.83405303955078, "logits/rejected": -17.550952911376953, "logps/chosen": -2913.25390625, "logps/rejected": -2926.989990234375, "loss": 0.2482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -176.5511474609375, "rewards/margins": 15.813430786132812, "rewards/rejected": -192.3645782470703, "step": 25670 }, { "epoch": 1.49, "grad_norm": 85.37881469726562, "learning_rate": 0.0005059793335655405, "logits/chosen": -17.10567855834961, "logits/rejected": -17.256332397460938, "logps/chosen": -2556.3662109375, "logps/rejected": -2439.96728515625, "loss": 7.9853, "rewards/accuracies": 0.5, "rewards/chosen": -181.85134887695312, "rewards/margins": 8.838497161865234, "rewards/rejected": -190.68983459472656, "step": 25680 }, { "epoch": 1.49, "grad_norm": 0.4375452697277069, "learning_rate": 0.0005057858276249081, "logits/chosen": -16.107091903686523, "logits/rejected": -15.678644180297852, "logps/chosen": -3326.940673828125, "logps/rejected": -3113.85498046875, "loss": 4.6812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.76873779296875, "rewards/margins": 7.970231056213379, "rewards/rejected": -156.73898315429688, "step": 25690 }, { "epoch": 1.49, "grad_norm": 0.01781105063855648, "learning_rate": 0.0005055923216842757, "logits/chosen": -16.562536239624023, "logits/rejected": -16.59476089477539, "logps/chosen": -3332.591064453125, "logps/rejected": -3287.80615234375, "loss": 13.0799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -180.59674072265625, "rewards/margins": 1.3031638860702515, "rewards/rejected": -181.89991760253906, "step": 25700 }, { "epoch": 1.49, "grad_norm": 9.546497344970703, "learning_rate": 0.0005053988157436433, "logits/chosen": -19.318452835083008, "logits/rejected": -19.369327545166016, "logps/chosen": -2659.2734375, "logps/rejected": -2810.126220703125, "loss": 0.7181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -201.32015991210938, "rewards/margins": 24.037900924682617, "rewards/rejected": -225.3580322265625, "step": 25710 }, { "epoch": 1.49, "grad_norm": 6.61730432510376, "learning_rate": 0.0005052053098030109, "logits/chosen": -20.02339744567871, "logits/rejected": -21.544055938720703, "logps/chosen": -2819.737060546875, "logps/rejected": -2551.32666015625, "loss": 12.3144, "rewards/accuracies": 0.5, "rewards/chosen": -216.97708129882812, "rewards/margins": -7.66452693939209, "rewards/rejected": -209.3125762939453, "step": 25720 }, { "epoch": 1.49, "grad_norm": 97.3667984008789, "learning_rate": 0.0005050118038623786, "logits/chosen": -13.625448226928711, "logits/rejected": -13.666854858398438, "logps/chosen": -2854.942626953125, "logps/rejected": -2863.513916015625, "loss": 7.1292, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -111.4285659790039, "rewards/margins": -1.2188125848770142, "rewards/rejected": -110.2097396850586, "step": 25730 }, { "epoch": 1.49, "grad_norm": 14.108513832092285, "learning_rate": 0.0005048182979217462, "logits/chosen": -13.170466423034668, "logits/rejected": -13.755300521850586, "logps/chosen": -2771.199462890625, "logps/rejected": -2553.71484375, "loss": 16.7704, "rewards/accuracies": 0.5, "rewards/chosen": -157.96763610839844, "rewards/margins": -7.9782257080078125, "rewards/rejected": -149.9894256591797, "step": 25740 }, { "epoch": 1.49, "grad_norm": 375.4010009765625, "learning_rate": 0.0005046247919811139, "logits/chosen": -12.543669700622559, "logits/rejected": -12.577600479125977, "logps/chosen": -2642.124755859375, "logps/rejected": -2718.61865234375, "loss": 12.0656, "rewards/accuracies": 0.5, "rewards/chosen": -93.735107421875, "rewards/margins": -4.8019819259643555, "rewards/rejected": -88.93312072753906, "step": 25750 }, { "epoch": 1.49, "grad_norm": 96.85240936279297, "learning_rate": 0.0005044312860404815, "logits/chosen": -11.04870891571045, "logits/rejected": -11.100729942321777, "logps/chosen": -2839.15966796875, "logps/rejected": -2741.4072265625, "loss": 12.259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -85.14590454101562, "rewards/margins": 7.1817755699157715, "rewards/rejected": -92.32768249511719, "step": 25760 }, { "epoch": 1.49, "grad_norm": 1.9580568075180054, "learning_rate": 0.0005042377800998491, "logits/chosen": -13.920476913452148, "logits/rejected": -13.872957229614258, "logps/chosen": -2503.495361328125, "logps/rejected": -2140.365234375, "loss": 10.2027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.2154312133789, "rewards/margins": 17.037826538085938, "rewards/rejected": -138.2532501220703, "step": 25770 }, { "epoch": 1.49, "grad_norm": 143.59896850585938, "learning_rate": 0.0005040442741592167, "logits/chosen": -11.920764923095703, "logits/rejected": -11.764814376831055, "logps/chosen": -3026.2646484375, "logps/rejected": -2970.021240234375, "loss": 4.2195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.95712280273438, "rewards/margins": 2.2278666496276855, "rewards/rejected": -157.18496704101562, "step": 25780 }, { "epoch": 1.49, "grad_norm": 75.90866088867188, "learning_rate": 0.0005038507682185844, "logits/chosen": -14.292132377624512, "logits/rejected": -14.067720413208008, "logps/chosen": -2637.125732421875, "logps/rejected": -2308.066650390625, "loss": 3.0044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.66033935546875, "rewards/margins": 1.4340624809265137, "rewards/rejected": -196.0944061279297, "step": 25790 }, { "epoch": 1.49, "grad_norm": 3.7161174759603455e-07, "learning_rate": 0.000503657262277952, "logits/chosen": -14.115511894226074, "logits/rejected": -14.233917236328125, "logps/chosen": -2710.49072265625, "logps/rejected": -2704.45458984375, "loss": 7.08, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -200.1949005126953, "rewards/margins": 1.5159794092178345, "rewards/rejected": -201.71087646484375, "step": 25800 }, { "epoch": 1.49, "grad_norm": 1.223007321357727, "learning_rate": 0.0005034637563373196, "logits/chosen": -13.34607219696045, "logits/rejected": -13.381062507629395, "logps/chosen": -2599.95556640625, "logps/rejected": -1848.5042724609375, "loss": 32.9546, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -152.15206909179688, "rewards/margins": -27.68625259399414, "rewards/rejected": -124.46580505371094, "step": 25810 }, { "epoch": 1.49, "grad_norm": 8.042665285756811e-06, "learning_rate": 0.0005032702503966872, "logits/chosen": -11.874464988708496, "logits/rejected": -11.862672805786133, "logps/chosen": -2936.294921875, "logps/rejected": -2716.583251953125, "loss": 6.7938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -45.38798141479492, "rewards/margins": 6.359243392944336, "rewards/rejected": -51.747215270996094, "step": 25820 }, { "epoch": 1.5, "grad_norm": 2.2181999683380127, "learning_rate": 0.0005030767444560548, "logits/chosen": -15.159235954284668, "logits/rejected": -15.212742805480957, "logps/chosen": -2801.93359375, "logps/rejected": -2642.017822265625, "loss": 1.3475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -99.2818374633789, "rewards/margins": 18.650033950805664, "rewards/rejected": -117.93186950683594, "step": 25830 }, { "epoch": 1.5, "grad_norm": 16.950483322143555, "learning_rate": 0.0005028832385154223, "logits/chosen": -19.299909591674805, "logits/rejected": -20.348865509033203, "logps/chosen": -2718.40869140625, "logps/rejected": -2350.4541015625, "loss": 20.3211, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -208.608642578125, "rewards/margins": -17.844196319580078, "rewards/rejected": -190.76443481445312, "step": 25840 }, { "epoch": 1.5, "grad_norm": 21.523534774780273, "learning_rate": 0.00050268973257479, "logits/chosen": -16.406307220458984, "logits/rejected": -17.587127685546875, "logps/chosen": -2949.07421875, "logps/rejected": -2509.130859375, "loss": 20.8933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.8629913330078, "rewards/margins": -10.05697250366211, "rewards/rejected": -159.80601501464844, "step": 25850 }, { "epoch": 1.5, "grad_norm": 2.6461582241826485e-18, "learning_rate": 0.0005024962266341577, "logits/chosen": -19.476581573486328, "logits/rejected": -20.061201095581055, "logps/chosen": -2794.5556640625, "logps/rejected": -2748.649169921875, "loss": 2.7973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -201.49331665039062, "rewards/margins": 8.995622634887695, "rewards/rejected": -210.4889373779297, "step": 25860 }, { "epoch": 1.5, "grad_norm": 6.624595642089844, "learning_rate": 0.0005023027206935253, "logits/chosen": -15.749913215637207, "logits/rejected": -15.917015075683594, "logps/chosen": -3004.399658203125, "logps/rejected": -2433.11962890625, "loss": 18.9746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -102.5676498413086, "rewards/margins": 5.193164825439453, "rewards/rejected": -107.76081848144531, "step": 25870 }, { "epoch": 1.5, "grad_norm": 0.00021059115533716977, "learning_rate": 0.0005021092147528929, "logits/chosen": -16.878732681274414, "logits/rejected": -17.019636154174805, "logps/chosen": -2866.502685546875, "logps/rejected": -2599.63525390625, "loss": 5.5237, "rewards/accuracies": 0.5, "rewards/chosen": -191.88436889648438, "rewards/margins": 3.1153435707092285, "rewards/rejected": -194.9997100830078, "step": 25880 }, { "epoch": 1.5, "grad_norm": 0.09046931564807892, "learning_rate": 0.0005019157088122605, "logits/chosen": -19.842571258544922, "logits/rejected": -20.823291778564453, "logps/chosen": -2562.243896484375, "logps/rejected": -2627.48095703125, "loss": 4.6215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -212.17886352539062, "rewards/margins": 11.164423942565918, "rewards/rejected": -223.34326171875, "step": 25890 }, { "epoch": 1.5, "grad_norm": 0.0, "learning_rate": 0.0005017222028716282, "logits/chosen": -13.864936828613281, "logits/rejected": -14.114710807800293, "logps/chosen": -3076.894775390625, "logps/rejected": -2623.80517578125, "loss": 4.8879, "rewards/accuracies": 0.5, "rewards/chosen": -150.82957458496094, "rewards/margins": 15.251104354858398, "rewards/rejected": -166.0806884765625, "step": 25900 }, { "epoch": 1.5, "grad_norm": 9.733536720275879, "learning_rate": 0.0005015286969309958, "logits/chosen": -17.584163665771484, "logits/rejected": -17.858112335205078, "logps/chosen": -3564.841064453125, "logps/rejected": -3076.709716796875, "loss": 20.0066, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -213.7275390625, "rewards/margins": -12.513932228088379, "rewards/rejected": -201.21359252929688, "step": 25910 }, { "epoch": 1.5, "grad_norm": 0.0, "learning_rate": 0.0005013351909903634, "logits/chosen": -13.310882568359375, "logits/rejected": -13.018457412719727, "logps/chosen": -3082.15185546875, "logps/rejected": -2821.460205078125, "loss": 2.6135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -59.379600524902344, "rewards/margins": 18.675819396972656, "rewards/rejected": -78.055419921875, "step": 25920 }, { "epoch": 1.5, "grad_norm": 5.53882086341286e-15, "learning_rate": 0.000501141685049731, "logits/chosen": -13.902244567871094, "logits/rejected": -14.464483261108398, "logps/chosen": -3504.327392578125, "logps/rejected": -3226.67138671875, "loss": 7.1534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.02346801757812, "rewards/margins": 10.265556335449219, "rewards/rejected": -155.2890167236328, "step": 25930 }, { "epoch": 1.5, "grad_norm": 57.77516174316406, "learning_rate": 0.0005009481791090986, "logits/chosen": -15.095849990844727, "logits/rejected": -15.047826766967773, "logps/chosen": -2494.462646484375, "logps/rejected": -2567.107666015625, "loss": 7.9148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.6395721435547, "rewards/margins": -0.5635444521903992, "rewards/rejected": -174.07601928710938, "step": 25940 }, { "epoch": 1.5, "grad_norm": 83.28369903564453, "learning_rate": 0.0005007546731684662, "logits/chosen": -11.515836715698242, "logits/rejected": -11.490588188171387, "logps/chosen": -3130.01025390625, "logps/rejected": -2687.57373046875, "loss": 3.1875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.2613525390625, "rewards/margins": 10.565116882324219, "rewards/rejected": -135.8264923095703, "step": 25950 }, { "epoch": 1.5, "grad_norm": 1.322101354598999, "learning_rate": 0.000500561167227834, "logits/chosen": -14.781333923339844, "logits/rejected": -15.280484199523926, "logps/chosen": -2678.32373046875, "logps/rejected": -2865.1181640625, "loss": 9.2754, "rewards/accuracies": 0.5, "rewards/chosen": -155.66751098632812, "rewards/margins": -2.3109993934631348, "rewards/rejected": -153.35650634765625, "step": 25960 }, { "epoch": 1.5, "grad_norm": 9.719309124976368e-19, "learning_rate": 0.0005003676612872016, "logits/chosen": -15.026884078979492, "logits/rejected": -14.899991035461426, "logps/chosen": -2649.82080078125, "logps/rejected": -2097.955322265625, "loss": 16.365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.04974365234375, "rewards/margins": 1.5329704284667969, "rewards/rejected": -159.58270263671875, "step": 25970 }, { "epoch": 1.5, "grad_norm": 5.830225944519043, "learning_rate": 0.0005001741553465692, "logits/chosen": -12.474132537841797, "logits/rejected": -12.43813419342041, "logps/chosen": -3004.39453125, "logps/rejected": -3100.5166015625, "loss": 0.113, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -147.46798706054688, "rewards/margins": 13.812431335449219, "rewards/rejected": -161.2804412841797, "step": 25980 }, { "epoch": 1.5, "grad_norm": 47.832977294921875, "learning_rate": 0.0004999806494059368, "logits/chosen": -13.131078720092773, "logits/rejected": -13.545877456665039, "logps/chosen": -2574.847900390625, "logps/rejected": -2580.794189453125, "loss": 4.7654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -191.09779357910156, "rewards/margins": 4.472840309143066, "rewards/rejected": -195.5706329345703, "step": 25990 }, { "epoch": 1.5, "grad_norm": 160.40704345703125, "learning_rate": 0.0004997871434653044, "logits/chosen": -12.461874961853027, "logits/rejected": -12.528558731079102, "logps/chosen": -2979.62744140625, "logps/rejected": -2697.968994140625, "loss": 4.8589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -182.90533447265625, "rewards/margins": 9.216509819030762, "rewards/rejected": -192.12184143066406, "step": 26000 }, { "epoch": 1.51, "grad_norm": 1.2664289240575942e-18, "learning_rate": 0.000499593637524672, "logits/chosen": -14.821456909179688, "logits/rejected": -16.06186294555664, "logps/chosen": -2552.19384765625, "logps/rejected": -2262.89306640625, "loss": 25.2298, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -177.49038696289062, "rewards/margins": -18.25138282775879, "rewards/rejected": -159.239013671875, "step": 26010 }, { "epoch": 1.51, "grad_norm": 286.4902648925781, "learning_rate": 0.0004994001315840397, "logits/chosen": -15.52184772491455, "logits/rejected": -15.748163223266602, "logps/chosen": -2502.357666015625, "logps/rejected": -2438.51318359375, "loss": 17.2491, "rewards/accuracies": 0.5, "rewards/chosen": -142.00692749023438, "rewards/margins": -5.330636024475098, "rewards/rejected": -136.67630004882812, "step": 26020 }, { "epoch": 1.51, "grad_norm": 0.10661341995000839, "learning_rate": 0.0004992066256434073, "logits/chosen": -16.2825870513916, "logits/rejected": -16.525081634521484, "logps/chosen": -2498.47412109375, "logps/rejected": -2653.057861328125, "loss": 2.9522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.90902709960938, "rewards/margins": 5.201062202453613, "rewards/rejected": -143.11009216308594, "step": 26030 }, { "epoch": 1.51, "grad_norm": 1.0080116987228394, "learning_rate": 0.0004990131197027749, "logits/chosen": -18.724836349487305, "logits/rejected": -18.988752365112305, "logps/chosen": -2162.401611328125, "logps/rejected": -1988.4700927734375, "loss": 15.2218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.34060668945312, "rewards/margins": -10.776611328125, "rewards/rejected": -144.56399536132812, "step": 26040 }, { "epoch": 1.51, "grad_norm": 35.49711990356445, "learning_rate": 0.0004988196137621425, "logits/chosen": -16.584028244018555, "logits/rejected": -16.848377227783203, "logps/chosen": -2582.11572265625, "logps/rejected": -2530.751220703125, "loss": 7.8844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.68260192871094, "rewards/margins": 0.31854087114334106, "rewards/rejected": -177.0011444091797, "step": 26050 }, { "epoch": 1.51, "grad_norm": 0.4541318416595459, "learning_rate": 0.0004986261078215101, "logits/chosen": -14.179118156433105, "logits/rejected": -14.376371383666992, "logps/chosen": -2952.7451171875, "logps/rejected": -2661.552001953125, "loss": 3.2994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -108.9818344116211, "rewards/margins": 5.6878533363342285, "rewards/rejected": -114.669677734375, "step": 26060 }, { "epoch": 1.51, "grad_norm": 68.919677734375, "learning_rate": 0.0004984326018808778, "logits/chosen": -16.460865020751953, "logits/rejected": -16.562639236450195, "logps/chosen": -2451.525390625, "logps/rejected": -2421.56884765625, "loss": 4.0754, "rewards/accuracies": 0.5, "rewards/chosen": -178.93087768554688, "rewards/margins": 2.3904213905334473, "rewards/rejected": -181.3212890625, "step": 26070 }, { "epoch": 1.51, "grad_norm": 9.916615795191319e-08, "learning_rate": 0.0004982390959402454, "logits/chosen": -16.334001541137695, "logits/rejected": -16.625507354736328, "logps/chosen": -2752.5615234375, "logps/rejected": -2687.7763671875, "loss": 5.928, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -175.85633850097656, "rewards/margins": 0.9537546038627625, "rewards/rejected": -176.81008911132812, "step": 26080 }, { "epoch": 1.51, "grad_norm": 11.2514066696167, "learning_rate": 0.000498045589999613, "logits/chosen": -15.058825492858887, "logits/rejected": -15.039006233215332, "logps/chosen": -3232.232177734375, "logps/rejected": -3106.510009765625, "loss": 10.5786, "rewards/accuracies": 0.5, "rewards/chosen": -145.4476776123047, "rewards/margins": -6.90826416015625, "rewards/rejected": -138.53941345214844, "step": 26090 }, { "epoch": 1.51, "grad_norm": 71.38715362548828, "learning_rate": 0.0004978520840589806, "logits/chosen": -17.125255584716797, "logits/rejected": -17.197895050048828, "logps/chosen": -2960.06787109375, "logps/rejected": -3049.21923828125, "loss": 9.4, "rewards/accuracies": 0.5, "rewards/chosen": -204.58319091796875, "rewards/margins": -6.124908447265625, "rewards/rejected": -198.45826721191406, "step": 26100 }, { "epoch": 1.51, "grad_norm": 0.002230878220871091, "learning_rate": 0.0004976585781183482, "logits/chosen": -14.848095893859863, "logits/rejected": -14.985997200012207, "logps/chosen": -3174.587158203125, "logps/rejected": -3047.484375, "loss": 3.9435, "rewards/accuracies": 0.5, "rewards/chosen": -170.2872314453125, "rewards/margins": 7.418601036071777, "rewards/rejected": -177.70582580566406, "step": 26110 }, { "epoch": 1.51, "grad_norm": 64.57064056396484, "learning_rate": 0.0004974650721777158, "logits/chosen": -13.898958206176758, "logits/rejected": -14.119898796081543, "logps/chosen": -3334.10693359375, "logps/rejected": -2853.23095703125, "loss": 6.3156, "rewards/accuracies": 0.5, "rewards/chosen": -171.93960571289062, "rewards/margins": 0.8580427169799805, "rewards/rejected": -172.7976837158203, "step": 26120 }, { "epoch": 1.51, "grad_norm": 136.10488891601562, "learning_rate": 0.0004972715662370835, "logits/chosen": -17.739652633666992, "logits/rejected": -18.19293785095215, "logps/chosen": -3049.29296875, "logps/rejected": -2597.61865234375, "loss": 30.6087, "rewards/accuracies": 0.5, "rewards/chosen": -194.15109252929688, "rewards/margins": -21.20813751220703, "rewards/rejected": -172.94296264648438, "step": 26130 }, { "epoch": 1.51, "grad_norm": 0.006536534521728754, "learning_rate": 0.0004970780602964511, "logits/chosen": -16.585329055786133, "logits/rejected": -17.40131378173828, "logps/chosen": -2617.4638671875, "logps/rejected": -2606.124267578125, "loss": 0.6522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.4058074951172, "rewards/margins": 13.364328384399414, "rewards/rejected": -168.77012634277344, "step": 26140 }, { "epoch": 1.51, "grad_norm": 4.274870590847968e-08, "learning_rate": 0.0004968845543558188, "logits/chosen": -17.00253677368164, "logits/rejected": -16.487030029296875, "logps/chosen": -2838.77880859375, "logps/rejected": -2769.35400390625, "loss": 4.5333, "rewards/accuracies": 0.5, "rewards/chosen": -177.96798706054688, "rewards/margins": 0.3322891294956207, "rewards/rejected": -178.30026245117188, "step": 26150 }, { "epoch": 1.51, "grad_norm": 7.686191960676325e-11, "learning_rate": 0.0004966910484151864, "logits/chosen": -16.723106384277344, "logits/rejected": -16.82941436767578, "logps/chosen": -3016.828857421875, "logps/rejected": -2745.989013671875, "loss": 32.9184, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -157.43685913085938, "rewards/margins": -22.531614303588867, "rewards/rejected": -134.9052276611328, "step": 26160 }, { "epoch": 1.51, "grad_norm": 3.814319970985025e-09, "learning_rate": 0.0004964975424745539, "logits/chosen": -16.115886688232422, "logits/rejected": -16.388513565063477, "logps/chosen": -2861.816162109375, "logps/rejected": -2909.262939453125, "loss": 10.3413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -190.17483520507812, "rewards/margins": -2.8769469261169434, "rewards/rejected": -187.29788208007812, "step": 26170 }, { "epoch": 1.52, "grad_norm": 0.005154747981578112, "learning_rate": 0.0004963040365339215, "logits/chosen": -15.325032234191895, "logits/rejected": -15.512611389160156, "logps/chosen": -2877.95556640625, "logps/rejected": -2631.225830078125, "loss": 2.4517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.40029907226562, "rewards/margins": 13.592401504516602, "rewards/rejected": -181.99270629882812, "step": 26180 }, { "epoch": 1.52, "grad_norm": 49.7083854675293, "learning_rate": 0.0004961105305932892, "logits/chosen": -17.86717987060547, "logits/rejected": -17.37555503845215, "logps/chosen": -2877.91357421875, "logps/rejected": -2557.54052734375, "loss": 16.2869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.09066772460938, "rewards/margins": -7.207177639007568, "rewards/rejected": -173.8834686279297, "step": 26190 }, { "epoch": 1.52, "grad_norm": 0.00048703752690926194, "learning_rate": 0.0004959170246526569, "logits/chosen": -17.01730728149414, "logits/rejected": -16.855098724365234, "logps/chosen": -2693.844970703125, "logps/rejected": -2680.302734375, "loss": 0.0712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -140.46856689453125, "rewards/margins": 20.997875213623047, "rewards/rejected": -161.46644592285156, "step": 26200 }, { "epoch": 1.52, "grad_norm": 18.862394332885742, "learning_rate": 0.0004957235187120245, "logits/chosen": -17.8519229888916, "logits/rejected": -17.299802780151367, "logps/chosen": -2523.7060546875, "logps/rejected": -2164.3408203125, "loss": 5.2526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.6750946044922, "rewards/margins": 8.8340482711792, "rewards/rejected": -194.5091552734375, "step": 26210 }, { "epoch": 1.52, "grad_norm": 0.007516286801546812, "learning_rate": 0.0004955300127713921, "logits/chosen": -14.440587043762207, "logits/rejected": -14.257365226745605, "logps/chosen": -2941.350341796875, "logps/rejected": -2625.784423828125, "loss": 1.7279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.40381622314453, "rewards/margins": 19.41579818725586, "rewards/rejected": -107.81961822509766, "step": 26220 }, { "epoch": 1.52, "grad_norm": 72.36271667480469, "learning_rate": 0.0004953365068307597, "logits/chosen": -15.615335464477539, "logits/rejected": -15.69427490234375, "logps/chosen": -3086.09912109375, "logps/rejected": -3098.26806640625, "loss": 4.1923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.46969604492188, "rewards/margins": 8.978052139282227, "rewards/rejected": -166.44773864746094, "step": 26230 }, { "epoch": 1.52, "grad_norm": 31.289775848388672, "learning_rate": 0.0004951430008901274, "logits/chosen": -15.773966789245605, "logits/rejected": -15.430364608764648, "logps/chosen": -3283.133544921875, "logps/rejected": -2880.173828125, "loss": 0.6712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -178.288330078125, "rewards/margins": 20.49991226196289, "rewards/rejected": -198.7882537841797, "step": 26240 }, { "epoch": 1.52, "grad_norm": 0.0, "learning_rate": 0.000494949494949495, "logits/chosen": -16.52020835876465, "logits/rejected": -16.983776092529297, "logps/chosen": -2677.09619140625, "logps/rejected": -2734.960693359375, "loss": 1.9958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -152.1239471435547, "rewards/margins": 38.72956848144531, "rewards/rejected": -190.853515625, "step": 26250 }, { "epoch": 1.52, "grad_norm": 0.004377501085400581, "learning_rate": 0.0004947559890088626, "logits/chosen": -14.565988540649414, "logits/rejected": -14.728177070617676, "logps/chosen": -3084.75439453125, "logps/rejected": -3107.80712890625, "loss": 9.0919, "rewards/accuracies": 0.5, "rewards/chosen": -164.1813201904297, "rewards/margins": -2.5014750957489014, "rewards/rejected": -161.6798553466797, "step": 26260 }, { "epoch": 1.52, "grad_norm": 13.755463600158691, "learning_rate": 0.0004945624830682302, "logits/chosen": -15.060297966003418, "logits/rejected": -14.806129455566406, "logps/chosen": -3002.889892578125, "logps/rejected": -3048.30615234375, "loss": 3.0817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -190.1741485595703, "rewards/margins": 6.842806816101074, "rewards/rejected": -197.01693725585938, "step": 26270 }, { "epoch": 1.52, "grad_norm": 88.88219451904297, "learning_rate": 0.0004943689771275978, "logits/chosen": -15.076171875, "logits/rejected": -15.33912181854248, "logps/chosen": -2918.02685546875, "logps/rejected": -2875.62646484375, "loss": 6.0986, "rewards/accuracies": 0.5, "rewards/chosen": -199.8855438232422, "rewards/margins": 5.585556983947754, "rewards/rejected": -205.4711151123047, "step": 26280 }, { "epoch": 1.52, "grad_norm": 11.635324478149414, "learning_rate": 0.0004941754711869654, "logits/chosen": -13.140764236450195, "logits/rejected": -12.983980178833008, "logps/chosen": -2816.012451171875, "logps/rejected": -2590.43603515625, "loss": 2.5406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.13497924804688, "rewards/margins": 15.84485912322998, "rewards/rejected": -158.97984313964844, "step": 26290 }, { "epoch": 1.52, "grad_norm": 0.5517475008964539, "learning_rate": 0.0004939819652463331, "logits/chosen": -13.409971237182617, "logits/rejected": -13.334169387817383, "logps/chosen": -2652.557861328125, "logps/rejected": -2479.8876953125, "loss": 1.2906, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -132.6431884765625, "rewards/margins": 22.780160903930664, "rewards/rejected": -155.42333984375, "step": 26300 }, { "epoch": 1.52, "grad_norm": 99.64978790283203, "learning_rate": 0.0004937884593057007, "logits/chosen": -15.536903381347656, "logits/rejected": -16.38861656188965, "logps/chosen": -2640.78076171875, "logps/rejected": -2693.578369140625, "loss": 3.8435, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.07131958007812, "rewards/margins": 3.522101879119873, "rewards/rejected": -202.59341430664062, "step": 26310 }, { "epoch": 1.52, "grad_norm": 24.762619018554688, "learning_rate": 0.0004935949533650683, "logits/chosen": -13.314526557922363, "logits/rejected": -13.211715698242188, "logps/chosen": -2470.359375, "logps/rejected": -2424.29638671875, "loss": 20.8274, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.19241333007812, "rewards/margins": -16.810148239135742, "rewards/rejected": -157.3822479248047, "step": 26320 }, { "epoch": 1.52, "grad_norm": 0.02749568782746792, "learning_rate": 0.0004934014474244359, "logits/chosen": -13.7531156539917, "logits/rejected": -13.74194049835205, "logps/chosen": -2877.427001953125, "logps/rejected": -2841.16845703125, "loss": 6.8189, "rewards/accuracies": 0.5, "rewards/chosen": -217.9778289794922, "rewards/margins": -4.4543561935424805, "rewards/rejected": -213.5234832763672, "step": 26330 }, { "epoch": 1.52, "grad_norm": 5.383022653404623e-06, "learning_rate": 0.0004932079414838035, "logits/chosen": -14.51348876953125, "logits/rejected": -15.081815719604492, "logps/chosen": -2556.03759765625, "logps/rejected": -2646.709716796875, "loss": 5.7434, "rewards/accuracies": 0.5, "rewards/chosen": -114.42118072509766, "rewards/margins": 4.836550235748291, "rewards/rejected": -119.25772857666016, "step": 26340 }, { "epoch": 1.53, "grad_norm": 325.26483154296875, "learning_rate": 0.0004930144355431711, "logits/chosen": -12.648927688598633, "logits/rejected": -12.449270248413086, "logps/chosen": -2604.012939453125, "logps/rejected": -2835.30029296875, "loss": 4.5972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -183.63446044921875, "rewards/margins": 0.23279953002929688, "rewards/rejected": -183.86727905273438, "step": 26350 }, { "epoch": 1.53, "grad_norm": 161.4103546142578, "learning_rate": 0.0004928209296025389, "logits/chosen": -12.486359596252441, "logits/rejected": -12.143373489379883, "logps/chosen": -2885.581298828125, "logps/rejected": -2728.409423828125, "loss": 29.1303, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -116.0878677368164, "rewards/margins": -10.223607063293457, "rewards/rejected": -105.8642578125, "step": 26360 }, { "epoch": 1.53, "grad_norm": 2.229729034297634e-05, "learning_rate": 0.0004926274236619065, "logits/chosen": -10.658709526062012, "logits/rejected": -10.620878219604492, "logps/chosen": -3056.763671875, "logps/rejected": -2953.93701171875, "loss": 4.5761, "rewards/accuracies": 0.5, "rewards/chosen": -181.5926971435547, "rewards/margins": 4.427336692810059, "rewards/rejected": -186.02000427246094, "step": 26370 }, { "epoch": 1.53, "grad_norm": 96.16648864746094, "learning_rate": 0.0004924339177212741, "logits/chosen": -11.534860610961914, "logits/rejected": -11.219461441040039, "logps/chosen": -2605.003662109375, "logps/rejected": -2516.47900390625, "loss": 5.2843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -118.90316009521484, "rewards/margins": 11.733713150024414, "rewards/rejected": -130.63687133789062, "step": 26380 }, { "epoch": 1.53, "grad_norm": 17.893579483032227, "learning_rate": 0.0004922404117806416, "logits/chosen": -13.807302474975586, "logits/rejected": -14.120382308959961, "logps/chosen": -2806.426025390625, "logps/rejected": -2755.123779296875, "loss": 14.0819, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.7450408935547, "rewards/margins": -5.302180767059326, "rewards/rejected": -167.44284057617188, "step": 26390 }, { "epoch": 1.53, "grad_norm": 4.4198711357523734e-10, "learning_rate": 0.0004920469058400092, "logits/chosen": -13.179951667785645, "logits/rejected": -13.206082344055176, "logps/chosen": -3004.563232421875, "logps/rejected": -2532.567138671875, "loss": 4.205, "rewards/accuracies": 0.5, "rewards/chosen": -111.2044448852539, "rewards/margins": 9.568746566772461, "rewards/rejected": -120.77317810058594, "step": 26400 }, { "epoch": 1.53, "grad_norm": 0.00022465003712568432, "learning_rate": 0.000491853399899377, "logits/chosen": -16.364856719970703, "logits/rejected": -17.98471450805664, "logps/chosen": -1939.885009765625, "logps/rejected": -1946.1773681640625, "loss": 11.5381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.45327758789062, "rewards/margins": 5.569361686706543, "rewards/rejected": -171.0226287841797, "step": 26410 }, { "epoch": 1.53, "grad_norm": 1.507280189549931e-09, "learning_rate": 0.0004916598939587446, "logits/chosen": -16.516162872314453, "logits/rejected": -17.525522232055664, "logps/chosen": -3049.845947265625, "logps/rejected": -2889.46923828125, "loss": 4.1723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.9490509033203, "rewards/margins": 3.288926601409912, "rewards/rejected": -208.2379608154297, "step": 26420 }, { "epoch": 1.53, "grad_norm": 52.12187194824219, "learning_rate": 0.0004914663880181122, "logits/chosen": -17.152027130126953, "logits/rejected": -17.446144104003906, "logps/chosen": -2445.24365234375, "logps/rejected": -2390.29931640625, "loss": 9.2227, "rewards/accuracies": 0.5, "rewards/chosen": -206.8247833251953, "rewards/margins": -3.273487091064453, "rewards/rejected": -203.55126953125, "step": 26430 }, { "epoch": 1.53, "grad_norm": 105.89813995361328, "learning_rate": 0.0004912728820774798, "logits/chosen": -15.61425495147705, "logits/rejected": -15.996231079101562, "logps/chosen": -2732.84814453125, "logps/rejected": -2402.167236328125, "loss": 16.6488, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.02206420898438, "rewards/margins": -13.108637809753418, "rewards/rejected": -125.91341400146484, "step": 26440 }, { "epoch": 1.53, "grad_norm": 19.864776611328125, "learning_rate": 0.0004910793761368474, "logits/chosen": -13.5660982131958, "logits/rejected": -13.905778884887695, "logps/chosen": -2755.56005859375, "logps/rejected": -2430.367919921875, "loss": 11.6991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -112.9598388671875, "rewards/margins": -8.726058959960938, "rewards/rejected": -104.23377990722656, "step": 26450 }, { "epoch": 1.53, "grad_norm": 243.1739044189453, "learning_rate": 0.000490885870196215, "logits/chosen": -14.37291431427002, "logits/rejected": -13.962028503417969, "logps/chosen": -2517.44189453125, "logps/rejected": -2658.45166015625, "loss": 11.1522, "rewards/accuracies": 0.5, "rewards/chosen": -122.4767074584961, "rewards/margins": -2.3774311542510986, "rewards/rejected": -120.09928131103516, "step": 26460 }, { "epoch": 1.53, "grad_norm": 2.855003344848228e-07, "learning_rate": 0.0004906923642555827, "logits/chosen": -16.977222442626953, "logits/rejected": -17.092859268188477, "logps/chosen": -2475.70068359375, "logps/rejected": -2650.037353515625, "loss": 2.1867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.2067413330078, "rewards/margins": 13.72553825378418, "rewards/rejected": -193.9322967529297, "step": 26470 }, { "epoch": 1.53, "grad_norm": 11.884686470031738, "learning_rate": 0.0004904988583149503, "logits/chosen": -17.32373046875, "logits/rejected": -17.161094665527344, "logps/chosen": -2511.61279296875, "logps/rejected": -2484.100341796875, "loss": 5.0663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -218.7263641357422, "rewards/margins": 1.29217529296875, "rewards/rejected": -220.01852416992188, "step": 26480 }, { "epoch": 1.53, "grad_norm": 20.232898712158203, "learning_rate": 0.0004903053523743179, "logits/chosen": -17.606409072875977, "logits/rejected": -17.859479904174805, "logps/chosen": -2760.20556640625, "logps/rejected": -2624.745361328125, "loss": 3.1412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.03347778320312, "rewards/margins": 8.361135482788086, "rewards/rejected": -180.39462280273438, "step": 26490 }, { "epoch": 1.53, "grad_norm": 2.832836389541626, "learning_rate": 0.0004901118464336855, "logits/chosen": -16.200538635253906, "logits/rejected": -16.740192413330078, "logps/chosen": -3031.264404296875, "logps/rejected": -2662.583984375, "loss": 23.4532, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.08639526367188, "rewards/margins": -22.38382911682129, "rewards/rejected": -166.70257568359375, "step": 26500 }, { "epoch": 1.53, "grad_norm": 2.070907056950233e-10, "learning_rate": 0.0004899183404930531, "logits/chosen": -15.921002388000488, "logits/rejected": -15.997268676757812, "logps/chosen": -2649.5908203125, "logps/rejected": -2360.4404296875, "loss": 7.1777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -94.97857666015625, "rewards/margins": 6.121715545654297, "rewards/rejected": -101.10028839111328, "step": 26510 }, { "epoch": 1.54, "grad_norm": 4.7800614161319643e-11, "learning_rate": 0.0004897248345524207, "logits/chosen": -14.435084342956543, "logits/rejected": -14.41222858428955, "logps/chosen": -2582.16015625, "logps/rejected": -2486.61279296875, "loss": 3.941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -76.28289794921875, "rewards/margins": 4.659149169921875, "rewards/rejected": -80.94204711914062, "step": 26520 }, { "epoch": 1.54, "grad_norm": 10.99560260772705, "learning_rate": 0.0004895313286117884, "logits/chosen": -14.340110778808594, "logits/rejected": -14.461418151855469, "logps/chosen": -2353.95068359375, "logps/rejected": -2423.678466796875, "loss": 1.7254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -129.12060546875, "rewards/margins": 12.066267013549805, "rewards/rejected": -141.18685913085938, "step": 26530 }, { "epoch": 1.54, "grad_norm": 0.6835576891899109, "learning_rate": 0.000489337822671156, "logits/chosen": -15.308286666870117, "logits/rejected": -15.133737564086914, "logps/chosen": -2493.148193359375, "logps/rejected": -2547.77294921875, "loss": 7.6084, "rewards/accuracies": 0.5, "rewards/chosen": -160.41354370117188, "rewards/margins": -5.765969753265381, "rewards/rejected": -154.6475830078125, "step": 26540 }, { "epoch": 1.54, "grad_norm": 4.499361038208008, "learning_rate": 0.0004891443167305236, "logits/chosen": -15.728303909301758, "logits/rejected": -15.583186149597168, "logps/chosen": -2517.23828125, "logps/rejected": -2537.45361328125, "loss": 1.7533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.69635009765625, "rewards/margins": 5.334231376647949, "rewards/rejected": -125.03059387207031, "step": 26550 }, { "epoch": 1.54, "grad_norm": 2.1485357137862593e-06, "learning_rate": 0.0004889508107898913, "logits/chosen": -17.328495025634766, "logits/rejected": -17.841156005859375, "logps/chosen": -2761.210693359375, "logps/rejected": -2903.73388671875, "loss": 6.9647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.91897583007812, "rewards/margins": 5.485205173492432, "rewards/rejected": -180.40419006347656, "step": 26560 }, { "epoch": 1.54, "grad_norm": 97.43338775634766, "learning_rate": 0.0004887573048492589, "logits/chosen": -15.648694038391113, "logits/rejected": -15.949514389038086, "logps/chosen": -3051.885009765625, "logps/rejected": -2872.79931640625, "loss": 13.8344, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -198.53211975097656, "rewards/margins": -7.7294206619262695, "rewards/rejected": -190.80270385742188, "step": 26570 }, { "epoch": 1.54, "grad_norm": 8.96150287223868e-14, "learning_rate": 0.0004885637989086266, "logits/chosen": -17.102375030517578, "logits/rejected": -17.321441650390625, "logps/chosen": -2492.180419921875, "logps/rejected": -2460.849365234375, "loss": 2.5079, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -183.30868530273438, "rewards/margins": 3.3169078826904297, "rewards/rejected": -186.62559509277344, "step": 26580 }, { "epoch": 1.54, "grad_norm": 3.2317636013031006, "learning_rate": 0.0004883702929679942, "logits/chosen": -18.47951889038086, "logits/rejected": -18.915283203125, "logps/chosen": -2694.19873046875, "logps/rejected": -2682.22265625, "loss": 1.4127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.9871826171875, "rewards/margins": 7.2132978439331055, "rewards/rejected": -177.2004852294922, "step": 26590 }, { "epoch": 1.54, "grad_norm": 99.4631118774414, "learning_rate": 0.00048817678702736173, "logits/chosen": -16.42232894897461, "logits/rejected": -16.534584045410156, "logps/chosen": -2644.861328125, "logps/rejected": -2339.97998046875, "loss": 16.3721, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -151.87457275390625, "rewards/margins": -11.470939636230469, "rewards/rejected": -140.40362548828125, "step": 26600 }, { "epoch": 1.54, "grad_norm": 91.9088363647461, "learning_rate": 0.00048798328108672935, "logits/chosen": -13.965242385864258, "logits/rejected": -14.137042045593262, "logps/chosen": -3065.891845703125, "logps/rejected": -2806.634521484375, "loss": 1.3964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -86.9244384765625, "rewards/margins": 13.315195083618164, "rewards/rejected": -100.23963165283203, "step": 26610 }, { "epoch": 1.54, "grad_norm": 188.6658172607422, "learning_rate": 0.000487789775146097, "logits/chosen": -14.8855619430542, "logits/rejected": -15.44825267791748, "logps/chosen": -2586.987548828125, "logps/rejected": -2892.91455078125, "loss": 18.1086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.64674377441406, "rewards/margins": 7.132199764251709, "rewards/rejected": -117.77894592285156, "step": 26620 }, { "epoch": 1.54, "grad_norm": 90.71810150146484, "learning_rate": 0.00048759626920546464, "logits/chosen": -12.671835899353027, "logits/rejected": -12.92164421081543, "logps/chosen": -3032.69091796875, "logps/rejected": -2718.41845703125, "loss": 3.2569, "rewards/accuracies": 0.5, "rewards/chosen": -142.51266479492188, "rewards/margins": 1.8150932788848877, "rewards/rejected": -144.32774353027344, "step": 26630 }, { "epoch": 1.54, "grad_norm": 90.5609130859375, "learning_rate": 0.00048740276326483226, "logits/chosen": -12.497696876525879, "logits/rejected": -12.721742630004883, "logps/chosen": -2919.286376953125, "logps/rejected": -2435.36865234375, "loss": 7.3246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -86.30940246582031, "rewards/margins": 13.884488105773926, "rewards/rejected": -100.19389343261719, "step": 26640 }, { "epoch": 1.54, "grad_norm": 3.032035965588875e-05, "learning_rate": 0.0004872092573241999, "logits/chosen": -14.470537185668945, "logits/rejected": -15.826716423034668, "logps/chosen": -2446.80419921875, "logps/rejected": -2598.354248046875, "loss": 6.5753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -97.0911865234375, "rewards/margins": 5.267748832702637, "rewards/rejected": -102.35894775390625, "step": 26650 }, { "epoch": 1.54, "grad_norm": 2.1071068501132828e-11, "learning_rate": 0.00048701575138356744, "logits/chosen": -13.752298355102539, "logits/rejected": -13.750048637390137, "logps/chosen": -2486.38720703125, "logps/rejected": -2125.08642578125, "loss": 5.9462, "rewards/accuracies": 0.5, "rewards/chosen": -125.66459655761719, "rewards/margins": 6.9524359703063965, "rewards/rejected": -132.61703491210938, "step": 26660 }, { "epoch": 1.54, "grad_norm": 0.14369353652000427, "learning_rate": 0.0004868222454429351, "logits/chosen": -13.115852355957031, "logits/rejected": -13.64483642578125, "logps/chosen": -2739.967041015625, "logps/rejected": -2810.99755859375, "loss": 5.5613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.67727661132812, "rewards/margins": 3.369825839996338, "rewards/rejected": -138.047119140625, "step": 26670 }, { "epoch": 1.54, "grad_norm": 2.5577792661657384e-10, "learning_rate": 0.0004866287395023027, "logits/chosen": -12.706475257873535, "logits/rejected": -13.465472221374512, "logps/chosen": -2937.242919921875, "logps/rejected": -2347.24267578125, "loss": 4.8921, "rewards/accuracies": 0.5, "rewards/chosen": -81.94515991210938, "rewards/margins": 3.8099677562713623, "rewards/rejected": -85.75513458251953, "step": 26680 }, { "epoch": 1.54, "grad_norm": 24.624526977539062, "learning_rate": 0.00048643523356167034, "logits/chosen": -15.534433364868164, "logits/rejected": -15.525113105773926, "logps/chosen": -2805.08203125, "logps/rejected": -3068.677978515625, "loss": 10.6704, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.69309997558594, "rewards/margins": -6.823824405670166, "rewards/rejected": -132.86929321289062, "step": 26690 }, { "epoch": 1.55, "grad_norm": 1.8046369731905543e-08, "learning_rate": 0.00048624172762103796, "logits/chosen": -15.431851387023926, "logits/rejected": -15.773536682128906, "logps/chosen": -2777.343017578125, "logps/rejected": -2778.300537109375, "loss": 2.9986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.5393829345703, "rewards/margins": 9.460317611694336, "rewards/rejected": -181.99969482421875, "step": 26700 }, { "epoch": 1.55, "grad_norm": 0.0, "learning_rate": 0.0004860482216804056, "logits/chosen": -14.656585693359375, "logits/rejected": -14.751153945922852, "logps/chosen": -2295.8291015625, "logps/rejected": -2492.66796875, "loss": 25.4127, "rewards/accuracies": 0.5, "rewards/chosen": -184.00892639160156, "rewards/margins": -6.310955047607422, "rewards/rejected": -177.6979522705078, "step": 26710 }, { "epoch": 1.55, "grad_norm": 0.059708938002586365, "learning_rate": 0.00048585471573977325, "logits/chosen": -17.025440216064453, "logits/rejected": -17.273155212402344, "logps/chosen": -2508.876953125, "logps/rejected": -2397.89404296875, "loss": 8.2064, "rewards/accuracies": 0.5, "rewards/chosen": -180.2772674560547, "rewards/margins": -2.439758777618408, "rewards/rejected": -177.83749389648438, "step": 26720 }, { "epoch": 1.55, "grad_norm": 2.0071399211883545, "learning_rate": 0.00048566120979914087, "logits/chosen": -18.830581665039062, "logits/rejected": -18.886091232299805, "logps/chosen": -2817.844970703125, "logps/rejected": -2778.68896484375, "loss": 3.5883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.78240966796875, "rewards/margins": 0.3193904757499695, "rewards/rejected": -188.101806640625, "step": 26730 }, { "epoch": 1.55, "grad_norm": 45.627784729003906, "learning_rate": 0.0004854677038585085, "logits/chosen": -17.424427032470703, "logits/rejected": -19.40098762512207, "logps/chosen": -2776.413818359375, "logps/rejected": -2741.78662109375, "loss": 6.6079, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -206.5513458251953, "rewards/margins": -2.678830623626709, "rewards/rejected": -203.87249755859375, "step": 26740 }, { "epoch": 1.55, "grad_norm": 165.3172607421875, "learning_rate": 0.0004852741979178761, "logits/chosen": -12.943588256835938, "logits/rejected": -13.1364107131958, "logps/chosen": -3136.751953125, "logps/rejected": -2692.304443359375, "loss": 4.5751, "rewards/accuracies": 0.5, "rewards/chosen": -136.30715942382812, "rewards/margins": 4.280483245849609, "rewards/rejected": -140.58766174316406, "step": 26750 }, { "epoch": 1.55, "grad_norm": 54.786903381347656, "learning_rate": 0.0004850806919772437, "logits/chosen": -15.760149002075195, "logits/rejected": -16.961212158203125, "logps/chosen": -2762.41259765625, "logps/rejected": -2655.287841796875, "loss": 6.7417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.13766479492188, "rewards/margins": 0.9018005132675171, "rewards/rejected": -136.03945922851562, "step": 26760 }, { "epoch": 1.55, "grad_norm": 0.0005983569426462054, "learning_rate": 0.00048488718603661134, "logits/chosen": -19.37417984008789, "logits/rejected": -19.50151252746582, "logps/chosen": -2617.381591796875, "logps/rejected": -2672.378662109375, "loss": 4.9326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.27700805664062, "rewards/margins": 8.064567565917969, "rewards/rejected": -203.34158325195312, "step": 26770 }, { "epoch": 1.55, "grad_norm": 2.2061312847654335e-06, "learning_rate": 0.00048469368009597895, "logits/chosen": -18.0382022857666, "logits/rejected": -19.276180267333984, "logps/chosen": -2737.688232421875, "logps/rejected": -2749.63623046875, "loss": 4.0616, "rewards/accuracies": 0.5, "rewards/chosen": -235.50039672851562, "rewards/margins": 8.824061393737793, "rewards/rejected": -244.32443237304688, "step": 26780 }, { "epoch": 1.55, "grad_norm": 2.824820655167315e-10, "learning_rate": 0.00048450017415534657, "logits/chosen": -16.852840423583984, "logits/rejected": -16.737218856811523, "logps/chosen": -2919.76904296875, "logps/rejected": -2969.60595703125, "loss": 7.2079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.2399673461914, "rewards/margins": 5.161046028137207, "rewards/rejected": -120.40101623535156, "step": 26790 }, { "epoch": 1.55, "grad_norm": 7.515947586522742e-11, "learning_rate": 0.0004843066682147142, "logits/chosen": -17.358858108520508, "logits/rejected": -17.75545883178711, "logps/chosen": -2784.281982421875, "logps/rejected": -2499.92529296875, "loss": 6.9998, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.38723754882812, "rewards/margins": 2.4449307918548584, "rewards/rejected": -183.83216857910156, "step": 26800 }, { "epoch": 1.55, "grad_norm": 100.9686508178711, "learning_rate": 0.0004841131622740818, "logits/chosen": -15.382139205932617, "logits/rejected": -14.214719772338867, "logps/chosen": -2894.65576171875, "logps/rejected": -2761.861328125, "loss": 13.2723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -146.62271118164062, "rewards/margins": -10.175407409667969, "rewards/rejected": -136.4473114013672, "step": 26810 }, { "epoch": 1.55, "grad_norm": 1.901137579807255e-06, "learning_rate": 0.0004839196563334494, "logits/chosen": -16.3238468170166, "logits/rejected": -17.65224266052246, "logps/chosen": -2891.8359375, "logps/rejected": -2483.965087890625, "loss": 16.5203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -149.6995391845703, "rewards/margins": 5.413387775421143, "rewards/rejected": -155.1129150390625, "step": 26820 }, { "epoch": 1.55, "grad_norm": 4.2379477477230004e-14, "learning_rate": 0.0004837261503928171, "logits/chosen": -21.167633056640625, "logits/rejected": -22.63889503479004, "logps/chosen": -2698.215576171875, "logps/rejected": -2918.206298828125, "loss": 10.7634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -202.75076293945312, "rewards/margins": 2.132424831390381, "rewards/rejected": -204.88314819335938, "step": 26830 }, { "epoch": 1.55, "grad_norm": 48.15196990966797, "learning_rate": 0.0004835326444521847, "logits/chosen": -15.968708992004395, "logits/rejected": -16.887449264526367, "logps/chosen": -2952.90576171875, "logps/rejected": -2709.17529296875, "loss": 1.3541, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -121.43379211425781, "rewards/margins": 12.575176239013672, "rewards/rejected": -134.0089874267578, "step": 26840 }, { "epoch": 1.55, "grad_norm": 67.02389526367188, "learning_rate": 0.00048333913851155233, "logits/chosen": -20.92049789428711, "logits/rejected": -23.197906494140625, "logps/chosen": -2737.109619140625, "logps/rejected": -2665.02099609375, "loss": 17.6844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -238.0604248046875, "rewards/margins": 3.572509765625, "rewards/rejected": -241.63290405273438, "step": 26850 }, { "epoch": 1.55, "grad_norm": 114.13301086425781, "learning_rate": 0.00048314563257091994, "logits/chosen": -18.623136520385742, "logits/rejected": -19.505603790283203, "logps/chosen": -2827.8603515625, "logps/rejected": -2735.642578125, "loss": 4.9851, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.63653564453125, "rewards/margins": 5.8855133056640625, "rewards/rejected": -175.52206420898438, "step": 26860 }, { "epoch": 1.56, "grad_norm": 1.348012368396212e-09, "learning_rate": 0.00048295212663028756, "logits/chosen": -17.489593505859375, "logits/rejected": -17.064661026000977, "logps/chosen": -2660.17138671875, "logps/rejected": -2219.60693359375, "loss": 20.687, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.07167053222656, "rewards/margins": -12.966723442077637, "rewards/rejected": -186.10494995117188, "step": 26870 }, { "epoch": 1.56, "grad_norm": 47.58934783935547, "learning_rate": 0.0004827586206896552, "logits/chosen": -14.147603988647461, "logits/rejected": -14.591772079467773, "logps/chosen": -3016.635986328125, "logps/rejected": -3065.95751953125, "loss": 6.8642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -98.84262084960938, "rewards/margins": -2.2863359451293945, "rewards/rejected": -96.5562744140625, "step": 26880 }, { "epoch": 1.56, "grad_norm": 38.278133392333984, "learning_rate": 0.0004825651147490228, "logits/chosen": -19.050960540771484, "logits/rejected": -20.178924560546875, "logps/chosen": -2363.826416015625, "logps/rejected": -2290.025390625, "loss": 12.0319, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -155.59767150878906, "rewards/margins": -6.665077209472656, "rewards/rejected": -148.93258666992188, "step": 26890 }, { "epoch": 1.56, "grad_norm": 189.66477966308594, "learning_rate": 0.0004823716088083904, "logits/chosen": -12.95746898651123, "logits/rejected": -14.972452163696289, "logps/chosen": -2792.54052734375, "logps/rejected": -2618.00439453125, "loss": 14.3768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.91400146484375, "rewards/margins": -9.682486534118652, "rewards/rejected": -132.23150634765625, "step": 26900 }, { "epoch": 1.56, "grad_norm": 1.0399813454853302e-08, "learning_rate": 0.00048217810286775803, "logits/chosen": -13.64448356628418, "logits/rejected": -13.823877334594727, "logps/chosen": -2265.856689453125, "logps/rejected": -2179.174072265625, "loss": 4.6306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.4048309326172, "rewards/margins": 9.0347900390625, "rewards/rejected": -161.4396209716797, "step": 26910 }, { "epoch": 1.56, "grad_norm": 145.6284942626953, "learning_rate": 0.00048198459692712565, "logits/chosen": -17.931575775146484, "logits/rejected": -18.915403366088867, "logps/chosen": -2640.202880859375, "logps/rejected": -2235.64501953125, "loss": 6.96, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.87779235839844, "rewards/margins": -3.9706764221191406, "rewards/rejected": -195.90711975097656, "step": 26920 }, { "epoch": 1.56, "grad_norm": 0.6009073257446289, "learning_rate": 0.0004817910909864933, "logits/chosen": -16.89620018005371, "logits/rejected": -18.49799346923828, "logps/chosen": -2292.673583984375, "logps/rejected": -2170.830078125, "loss": 12.7265, "rewards/accuracies": 0.5, "rewards/chosen": -159.66796875, "rewards/margins": -4.14473819732666, "rewards/rejected": -155.52322387695312, "step": 26930 }, { "epoch": 1.56, "grad_norm": 1.373543145888334e-08, "learning_rate": 0.00048159758504586094, "logits/chosen": -11.104990005493164, "logits/rejected": -10.763757705688477, "logps/chosen": -2878.6162109375, "logps/rejected": -3111.11669921875, "loss": 7.4256, "rewards/accuracies": 0.5, "rewards/chosen": -24.293066024780273, "rewards/margins": -0.0466730110347271, "rewards/rejected": -24.246387481689453, "step": 26940 }, { "epoch": 1.56, "grad_norm": 1.1723838611032988e-07, "learning_rate": 0.00048140407910522855, "logits/chosen": -16.519495010375977, "logits/rejected": -17.092416763305664, "logps/chosen": -2612.196533203125, "logps/rejected": -2434.950927734375, "loss": 1.4264, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -157.75595092773438, "rewards/margins": 14.755613327026367, "rewards/rejected": -172.51156616210938, "step": 26950 }, { "epoch": 1.56, "grad_norm": 85.30105590820312, "learning_rate": 0.00048121057316459617, "logits/chosen": -16.087827682495117, "logits/rejected": -16.07285499572754, "logps/chosen": -2531.90966796875, "logps/rejected": -2546.826416015625, "loss": 5.1138, "rewards/accuracies": 0.5, "rewards/chosen": -174.42477416992188, "rewards/margins": 3.1893134117126465, "rewards/rejected": -177.61410522460938, "step": 26960 }, { "epoch": 1.56, "grad_norm": 3.0706703662872314, "learning_rate": 0.0004810170672239638, "logits/chosen": -14.745755195617676, "logits/rejected": -15.325395584106445, "logps/chosen": -2942.903564453125, "logps/rejected": -2782.485107421875, "loss": 5.7086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.56266021728516, "rewards/margins": 4.228712558746338, "rewards/rejected": -131.7913818359375, "step": 26970 }, { "epoch": 1.56, "grad_norm": 3.959415573184799e-10, "learning_rate": 0.00048082356128333146, "logits/chosen": -16.673595428466797, "logits/rejected": -16.835668563842773, "logps/chosen": -2802.646728515625, "logps/rejected": -2810.274658203125, "loss": 3.2425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.84182739257812, "rewards/margins": 12.493509292602539, "rewards/rejected": -177.3353271484375, "step": 26980 }, { "epoch": 1.56, "grad_norm": 116.90380096435547, "learning_rate": 0.000480630055342699, "logits/chosen": -13.551725387573242, "logits/rejected": -14.476621627807617, "logps/chosen": -2918.6259765625, "logps/rejected": -2670.310302734375, "loss": 3.6741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -111.1832504272461, "rewards/margins": 8.765522956848145, "rewards/rejected": -119.94877624511719, "step": 26990 }, { "epoch": 1.56, "grad_norm": 0.10552503168582916, "learning_rate": 0.00048043654940206664, "logits/chosen": -15.755419731140137, "logits/rejected": -15.5274658203125, "logps/chosen": -2544.39892578125, "logps/rejected": -2343.651123046875, "loss": 13.4933, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -125.179931640625, "rewards/margins": -12.457498550415039, "rewards/rejected": -112.7224349975586, "step": 27000 }, { "epoch": 1.56, "grad_norm": 67.06067657470703, "learning_rate": 0.00048024304346143426, "logits/chosen": -15.773065567016602, "logits/rejected": -15.808061599731445, "logps/chosen": -2722.947509765625, "logps/rejected": -2381.339111328125, "loss": 19.873, "rewards/accuracies": 0.5, "rewards/chosen": -128.3125762939453, "rewards/margins": -9.258939743041992, "rewards/rejected": -119.05362701416016, "step": 27010 }, { "epoch": 1.56, "grad_norm": 0.00033295954926870763, "learning_rate": 0.0004800495375208019, "logits/chosen": -13.609151840209961, "logits/rejected": -13.725151062011719, "logps/chosen": -2651.4912109375, "logps/rejected": -2497.320556640625, "loss": 6.8687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.91242980957031, "rewards/margins": 5.265069007873535, "rewards/rejected": -126.177490234375, "step": 27020 }, { "epoch": 1.56, "grad_norm": 263.4233093261719, "learning_rate": 0.0004798560315801695, "logits/chosen": -16.770063400268555, "logits/rejected": -16.862680435180664, "logps/chosen": -2538.25927734375, "logps/rejected": -2540.84765625, "loss": 4.9581, "rewards/accuracies": 0.5, "rewards/chosen": -112.07502746582031, "rewards/margins": 5.643245220184326, "rewards/rejected": -117.71826171875, "step": 27030 }, { "epoch": 1.57, "grad_norm": 5.1281721924478774e-11, "learning_rate": 0.00047966252563953716, "logits/chosen": -16.132160186767578, "logits/rejected": -15.967656135559082, "logps/chosen": -2435.04248046875, "logps/rejected": -2419.40576171875, "loss": 6.7541, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -88.5458755493164, "rewards/margins": 1.3746814727783203, "rewards/rejected": -89.92057037353516, "step": 27040 }, { "epoch": 1.57, "grad_norm": 3.1543908138347233e-18, "learning_rate": 0.0004794690196989048, "logits/chosen": -14.931292533874512, "logits/rejected": -15.21601390838623, "logps/chosen": -3022.94091796875, "logps/rejected": -2841.589111328125, "loss": 3.3365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.1889762878418, "rewards/margins": 8.221049308776855, "rewards/rejected": -62.41002655029297, "step": 27050 }, { "epoch": 1.57, "grad_norm": 5.46185319194592e-09, "learning_rate": 0.0004792755137582724, "logits/chosen": -17.765884399414062, "logits/rejected": -17.619571685791016, "logps/chosen": -2255.693359375, "logps/rejected": -2729.69580078125, "loss": 1.9944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -135.84811401367188, "rewards/margins": 24.980022430419922, "rewards/rejected": -160.82814025878906, "step": 27060 }, { "epoch": 1.57, "grad_norm": 0.0010341316228732467, "learning_rate": 0.00047908200781764, "logits/chosen": -14.657636642456055, "logits/rejected": -14.7135009765625, "logps/chosen": -2755.74951171875, "logps/rejected": -2502.97705078125, "loss": 5.0147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -97.8328628540039, "rewards/margins": 6.122005462646484, "rewards/rejected": -103.9548568725586, "step": 27070 }, { "epoch": 1.57, "grad_norm": 40.9184684753418, "learning_rate": 0.00047888850187700763, "logits/chosen": -15.402995109558105, "logits/rejected": -15.252744674682617, "logps/chosen": -2885.0947265625, "logps/rejected": -2940.88720703125, "loss": 8.3107, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -124.43312072753906, "rewards/margins": -6.624534606933594, "rewards/rejected": -117.80859375, "step": 27080 }, { "epoch": 1.57, "grad_norm": 104.36800384521484, "learning_rate": 0.0004786949959363753, "logits/chosen": -16.32306671142578, "logits/rejected": -17.079715728759766, "logps/chosen": -2697.32763671875, "logps/rejected": -2489.05859375, "loss": 6.23, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.6906280517578, "rewards/margins": 7.847620964050293, "rewards/rejected": -158.53823852539062, "step": 27090 }, { "epoch": 1.57, "grad_norm": 85.19295501708984, "learning_rate": 0.00047850148999574287, "logits/chosen": -14.11902141571045, "logits/rejected": -13.950909614562988, "logps/chosen": -2868.18798828125, "logps/rejected": -3110.10595703125, "loss": 13.2345, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -100.3072738647461, "rewards/margins": -6.369589328765869, "rewards/rejected": -93.93769073486328, "step": 27100 }, { "epoch": 1.57, "grad_norm": 121.55706024169922, "learning_rate": 0.0004783079840551105, "logits/chosen": -15.124127388000488, "logits/rejected": -15.153074264526367, "logps/chosen": -2769.29443359375, "logps/rejected": -2707.025634765625, "loss": 4.4468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -171.46702575683594, "rewards/margins": 0.10770883411169052, "rewards/rejected": -171.57473754882812, "step": 27110 }, { "epoch": 1.57, "grad_norm": 204.2244873046875, "learning_rate": 0.0004781144781144781, "logits/chosen": -17.13341522216797, "logits/rejected": -18.260143280029297, "logps/chosen": -2454.10498046875, "logps/rejected": -2327.51171875, "loss": 17.6133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -215.59036254882812, "rewards/margins": -4.726905822753906, "rewards/rejected": -210.86343383789062, "step": 27120 }, { "epoch": 1.57, "grad_norm": 87.56521606445312, "learning_rate": 0.0004779209721738457, "logits/chosen": -15.516377449035645, "logits/rejected": -16.26357650756836, "logps/chosen": -2936.063232421875, "logps/rejected": -2684.625732421875, "loss": 27.4825, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -219.7249298095703, "rewards/margins": -25.862035751342773, "rewards/rejected": -193.86288452148438, "step": 27130 }, { "epoch": 1.57, "grad_norm": 1.7388288974761963, "learning_rate": 0.0004777274662332134, "logits/chosen": -16.320594787597656, "logits/rejected": -16.53900718688965, "logps/chosen": -2898.76513671875, "logps/rejected": -2787.853759765625, "loss": 4.0919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.6905517578125, "rewards/margins": 6.300023078918457, "rewards/rejected": -220.9905548095703, "step": 27140 }, { "epoch": 1.57, "grad_norm": 0.002746488433331251, "learning_rate": 0.000477533960292581, "logits/chosen": -16.344985961914062, "logits/rejected": -17.789308547973633, "logps/chosen": -2680.6611328125, "logps/rejected": -2279.0869140625, "loss": 3.1184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.20455932617188, "rewards/margins": 6.683444023132324, "rewards/rejected": -154.88800048828125, "step": 27150 }, { "epoch": 1.57, "grad_norm": 7.90232588769868e-05, "learning_rate": 0.0004773404543519486, "logits/chosen": -15.997406005859375, "logits/rejected": -17.480012893676758, "logps/chosen": -2887.973388671875, "logps/rejected": -2629.73095703125, "loss": 20.5276, "rewards/accuracies": 0.5, "rewards/chosen": -146.39852905273438, "rewards/margins": -10.26404857635498, "rewards/rejected": -136.13446044921875, "step": 27160 }, { "epoch": 1.57, "grad_norm": 3.2629841687096516e-21, "learning_rate": 0.00047714694841131624, "logits/chosen": -16.3976993560791, "logits/rejected": -17.96292495727539, "logps/chosen": -2703.02685546875, "logps/rejected": -2441.503173828125, "loss": 4.2902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.79043579101562, "rewards/margins": 17.38001823425293, "rewards/rejected": -180.17044067382812, "step": 27170 }, { "epoch": 1.57, "grad_norm": 0.0005951074999757111, "learning_rate": 0.00047695344247068386, "logits/chosen": -15.48614501953125, "logits/rejected": -16.197147369384766, "logps/chosen": -2319.57666015625, "logps/rejected": -2134.280029296875, "loss": 2.5078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -135.19540405273438, "rewards/margins": 10.357780456542969, "rewards/rejected": -145.55319213867188, "step": 27180 }, { "epoch": 1.57, "grad_norm": 1.5577425074297935e-05, "learning_rate": 0.00047675993653005153, "logits/chosen": -16.550220489501953, "logits/rejected": -18.097431182861328, "logps/chosen": -2720.22216796875, "logps/rejected": -2545.5107421875, "loss": 2.7271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -142.4814910888672, "rewards/margins": 17.962188720703125, "rewards/rejected": -160.4436798095703, "step": 27190 }, { "epoch": 1.57, "grad_norm": 40.14822769165039, "learning_rate": 0.00047656643058941915, "logits/chosen": -16.67306900024414, "logits/rejected": -17.205928802490234, "logps/chosen": -2584.980224609375, "logps/rejected": -2642.343994140625, "loss": 6.376, "rewards/accuracies": 0.5, "rewards/chosen": -159.79428100585938, "rewards/margins": 0.6199878454208374, "rewards/rejected": -160.41427612304688, "step": 27200 }, { "epoch": 1.58, "grad_norm": 1.6250820689123202e-09, "learning_rate": 0.0004763729246487867, "logits/chosen": -15.868014335632324, "logits/rejected": -16.35457992553711, "logps/chosen": -2668.68408203125, "logps/rejected": -2597.14892578125, "loss": 15.3754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.59030151367188, "rewards/margins": -4.675936698913574, "rewards/rejected": -159.9143829345703, "step": 27210 }, { "epoch": 1.58, "grad_norm": 3.120475753348728e-07, "learning_rate": 0.0004761794187081543, "logits/chosen": -16.42367172241211, "logits/rejected": -19.02010726928711, "logps/chosen": -2679.548828125, "logps/rejected": -2785.74658203125, "loss": 2.1527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.68801879882812, "rewards/margins": 10.010334968566895, "rewards/rejected": -157.6983642578125, "step": 27220 }, { "epoch": 1.58, "grad_norm": 46.771018981933594, "learning_rate": 0.00047598591276752194, "logits/chosen": -16.98089027404785, "logits/rejected": -16.772964477539062, "logps/chosen": -2749.83544921875, "logps/rejected": -2782.46826171875, "loss": 5.1862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -164.08810424804688, "rewards/margins": 1.858952283859253, "rewards/rejected": -165.94705200195312, "step": 27230 }, { "epoch": 1.58, "grad_norm": 4.1450857679592445e-05, "learning_rate": 0.00047579240682688956, "logits/chosen": -13.742915153503418, "logits/rejected": -13.750521659851074, "logps/chosen": -3074.667724609375, "logps/rejected": -2760.48486328125, "loss": 3.9602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -96.09457397460938, "rewards/margins": 1.8287216424942017, "rewards/rejected": -97.92329406738281, "step": 27240 }, { "epoch": 1.58, "grad_norm": 4.12522875356384e-15, "learning_rate": 0.00047559890088625723, "logits/chosen": -16.740280151367188, "logits/rejected": -16.934762954711914, "logps/chosen": -2687.759765625, "logps/rejected": -2855.654296875, "loss": 0.9722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -165.7317657470703, "rewards/margins": 15.007804870605469, "rewards/rejected": -180.73956298828125, "step": 27250 }, { "epoch": 1.58, "grad_norm": 0.0010241541313007474, "learning_rate": 0.00047540539494562485, "logits/chosen": -15.48266887664795, "logits/rejected": -15.197113037109375, "logps/chosen": -2989.45849609375, "logps/rejected": -2959.420654296875, "loss": 8.8186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.81974029541016, "rewards/margins": -1.1468899250030518, "rewards/rejected": -123.6728515625, "step": 27260 }, { "epoch": 1.58, "grad_norm": 3.475137948989868, "learning_rate": 0.00047521188900499247, "logits/chosen": -16.185131072998047, "logits/rejected": -16.946216583251953, "logps/chosen": -2786.79052734375, "logps/rejected": -2629.072265625, "loss": 18.5548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.84619140625, "rewards/margins": -4.968662261962891, "rewards/rejected": -168.87753295898438, "step": 27270 }, { "epoch": 1.58, "grad_norm": 0.007288196124136448, "learning_rate": 0.0004750183830643601, "logits/chosen": -15.790240287780762, "logits/rejected": -16.60552978515625, "logps/chosen": -2771.713134765625, "logps/rejected": -2658.186767578125, "loss": 15.9951, "rewards/accuracies": 0.5, "rewards/chosen": -167.5372314453125, "rewards/margins": -8.771586418151855, "rewards/rejected": -158.76564025878906, "step": 27280 }, { "epoch": 1.58, "grad_norm": 0.4708609879016876, "learning_rate": 0.0004748248771237277, "logits/chosen": -15.83813190460205, "logits/rejected": -16.502044677734375, "logps/chosen": -2648.40966796875, "logps/rejected": -2666.41064453125, "loss": 0.4915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -192.2183837890625, "rewards/margins": 4.236297607421875, "rewards/rejected": -196.45468139648438, "step": 27290 }, { "epoch": 1.58, "grad_norm": 52.14430236816406, "learning_rate": 0.00047463137118309537, "logits/chosen": -15.335638046264648, "logits/rejected": -15.481389045715332, "logps/chosen": -2345.868408203125, "logps/rejected": -2019.9466552734375, "loss": 1.6301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.84776306152344, "rewards/margins": 20.374053955078125, "rewards/rejected": -147.22183227539062, "step": 27300 }, { "epoch": 1.58, "grad_norm": 0.01360158622264862, "learning_rate": 0.000474437865242463, "logits/chosen": -15.151067733764648, "logits/rejected": -15.687443733215332, "logps/chosen": -2637.957763671875, "logps/rejected": -2370.77392578125, "loss": 7.2045, "rewards/accuracies": 0.5, "rewards/chosen": -162.4012908935547, "rewards/margins": -1.911230444908142, "rewards/rejected": -160.4900665283203, "step": 27310 }, { "epoch": 1.58, "grad_norm": 31.04189682006836, "learning_rate": 0.00047424435930183055, "logits/chosen": -14.879365921020508, "logits/rejected": -15.1829195022583, "logps/chosen": -2222.246337890625, "logps/rejected": -2585.62646484375, "loss": 9.9988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.5057373046875, "rewards/margins": -0.18052978813648224, "rewards/rejected": -158.32522583007812, "step": 27320 }, { "epoch": 1.58, "grad_norm": 64.98638153076172, "learning_rate": 0.00047405085336119817, "logits/chosen": -17.07155990600586, "logits/rejected": -17.405384063720703, "logps/chosen": -2677.609619140625, "logps/rejected": -2635.6328125, "loss": 4.9869, "rewards/accuracies": 0.5, "rewards/chosen": -153.11138916015625, "rewards/margins": -1.4789009094238281, "rewards/rejected": -151.63247680664062, "step": 27330 }, { "epoch": 1.58, "grad_norm": 0.08083053678274155, "learning_rate": 0.0004738573474205658, "logits/chosen": -17.468759536743164, "logits/rejected": -17.64230728149414, "logps/chosen": -3169.728271484375, "logps/rejected": -2555.475341796875, "loss": 5.1932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -117.63395690917969, "rewards/margins": 5.072545051574707, "rewards/rejected": -122.70650482177734, "step": 27340 }, { "epoch": 1.58, "grad_norm": 0.022366434335708618, "learning_rate": 0.00047366384147993346, "logits/chosen": -17.243770599365234, "logits/rejected": -18.83859634399414, "logps/chosen": -3002.20751953125, "logps/rejected": -2871.82275390625, "loss": 20.0133, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -190.79595947265625, "rewards/margins": -8.200410842895508, "rewards/rejected": -182.59555053710938, "step": 27350 }, { "epoch": 1.58, "grad_norm": 86.75557708740234, "learning_rate": 0.0004734703355393011, "logits/chosen": -19.998178482055664, "logits/rejected": -19.473669052124023, "logps/chosen": -2602.7861328125, "logps/rejected": -2417.71142578125, "loss": 13.1775, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -196.00253295898438, "rewards/margins": -11.685506820678711, "rewards/rejected": -184.31704711914062, "step": 27360 }, { "epoch": 1.58, "grad_norm": 0.0015324377454817295, "learning_rate": 0.0004732768295986687, "logits/chosen": -18.426658630371094, "logits/rejected": -18.28834342956543, "logps/chosen": -2638.99560546875, "logps/rejected": -2497.439208984375, "loss": 13.0417, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -192.2953338623047, "rewards/margins": -6.0803937911987305, "rewards/rejected": -186.21493530273438, "step": 27370 }, { "epoch": 1.58, "grad_norm": 122.79135131835938, "learning_rate": 0.0004730833236580363, "logits/chosen": -18.7381591796875, "logits/rejected": -20.926010131835938, "logps/chosen": -2284.73388671875, "logps/rejected": -2351.771240234375, "loss": 2.3388, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.05207824707031, "rewards/margins": 10.985057830810547, "rewards/rejected": -134.03713989257812, "step": 27380 }, { "epoch": 1.59, "grad_norm": 472.005126953125, "learning_rate": 0.0004728898177174039, "logits/chosen": -18.100914001464844, "logits/rejected": -19.91510581970215, "logps/chosen": -2944.10693359375, "logps/rejected": -2539.859130859375, "loss": 9.6668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -220.79714965820312, "rewards/margins": -1.1264011859893799, "rewards/rejected": -219.67074584960938, "step": 27390 }, { "epoch": 1.59, "grad_norm": 3.7607712745666504, "learning_rate": 0.0004726963117767716, "logits/chosen": -12.125334739685059, "logits/rejected": -11.85079288482666, "logps/chosen": -3118.169677734375, "logps/rejected": -3012.04150390625, "loss": 4.133, "rewards/accuracies": 0.5, "rewards/chosen": -57.82929611206055, "rewards/margins": 1.8346874713897705, "rewards/rejected": -59.66399002075195, "step": 27400 }, { "epoch": 1.59, "grad_norm": 14.232063293457031, "learning_rate": 0.0004725028058361392, "logits/chosen": -20.001142501831055, "logits/rejected": -20.784814834594727, "logps/chosen": -2511.9765625, "logps/rejected": -2287.91650390625, "loss": 1.8243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -183.6366424560547, "rewards/margins": 17.470352172851562, "rewards/rejected": -201.10699462890625, "step": 27410 }, { "epoch": 1.59, "grad_norm": 78.72351837158203, "learning_rate": 0.00047230929989550683, "logits/chosen": -13.592109680175781, "logits/rejected": -13.586636543273926, "logps/chosen": -2533.34326171875, "logps/rejected": -2898.063232421875, "loss": 8.6265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -73.2026138305664, "rewards/margins": 7.46548318862915, "rewards/rejected": -80.6680908203125, "step": 27420 }, { "epoch": 1.59, "grad_norm": 18.10085105895996, "learning_rate": 0.0004721157939548744, "logits/chosen": -15.084686279296875, "logits/rejected": -14.725570678710938, "logps/chosen": -2421.16943359375, "logps/rejected": -2593.77880859375, "loss": 14.877, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -125.8410873413086, "rewards/margins": -13.754399299621582, "rewards/rejected": -112.086669921875, "step": 27430 }, { "epoch": 1.59, "grad_norm": 1.665483784143041e-14, "learning_rate": 0.000471922288014242, "logits/chosen": -16.064044952392578, "logits/rejected": -18.121662139892578, "logps/chosen": -2283.195068359375, "logps/rejected": -2218.371337890625, "loss": 5.0512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -192.89480590820312, "rewards/margins": 9.610162734985352, "rewards/rejected": -202.50497436523438, "step": 27440 }, { "epoch": 1.59, "grad_norm": 62.14944076538086, "learning_rate": 0.0004717287820736097, "logits/chosen": -16.546823501586914, "logits/rejected": -17.821308135986328, "logps/chosen": -2736.43212890625, "logps/rejected": -2687.912353515625, "loss": 2.5551, "rewards/accuracies": 0.5, "rewards/chosen": -233.32553100585938, "rewards/margins": 2.8997740745544434, "rewards/rejected": -236.22531127929688, "step": 27450 }, { "epoch": 1.59, "grad_norm": 101.59368133544922, "learning_rate": 0.0004715352761329773, "logits/chosen": -16.89352798461914, "logits/rejected": -18.40207290649414, "logps/chosen": -2283.92041015625, "logps/rejected": -2302.1044921875, "loss": 1.861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.43162536621094, "rewards/margins": 17.152538299560547, "rewards/rejected": -205.5841522216797, "step": 27460 }, { "epoch": 1.59, "grad_norm": 0.31061631441116333, "learning_rate": 0.0004713417701923449, "logits/chosen": -15.561311721801758, "logits/rejected": -16.202960968017578, "logps/chosen": -2785.762451171875, "logps/rejected": -2798.276611328125, "loss": 4.1257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.99830627441406, "rewards/margins": 8.371359825134277, "rewards/rejected": -162.3696746826172, "step": 27470 }, { "epoch": 1.59, "grad_norm": 161.91116333007812, "learning_rate": 0.00047114826425171254, "logits/chosen": -14.738123893737793, "logits/rejected": -16.058961868286133, "logps/chosen": -3175.81787109375, "logps/rejected": -3017.973388671875, "loss": 18.2882, "rewards/accuracies": 0.5, "rewards/chosen": -89.65105438232422, "rewards/margins": -4.76220178604126, "rewards/rejected": -84.88884735107422, "step": 27480 }, { "epoch": 1.59, "grad_norm": 0.044978685677051544, "learning_rate": 0.00047095475831108015, "logits/chosen": -16.09091567993164, "logits/rejected": -15.949061393737793, "logps/chosen": -2929.36083984375, "logps/rejected": -3047.956298828125, "loss": 6.8423, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -89.0583724975586, "rewards/margins": -0.6651307344436646, "rewards/rejected": -88.39325714111328, "step": 27490 }, { "epoch": 1.59, "grad_norm": 0.000351306633092463, "learning_rate": 0.00047076125237044777, "logits/chosen": -15.955365180969238, "logits/rejected": -16.224016189575195, "logps/chosen": -3054.62939453125, "logps/rejected": -2953.0810546875, "loss": 3.0383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.88714599609375, "rewards/margins": 2.1715030670166016, "rewards/rejected": -160.0586395263672, "step": 27500 }, { "epoch": 1.59, "grad_norm": 2.4804148779367097e-06, "learning_rate": 0.00047056774642981544, "logits/chosen": -15.900352478027344, "logits/rejected": -16.73370361328125, "logps/chosen": -2850.781982421875, "logps/rejected": -2724.9873046875, "loss": 3.2848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.03753662109375, "rewards/margins": 22.48308753967285, "rewards/rejected": -126.5206298828125, "step": 27510 }, { "epoch": 1.59, "grad_norm": 136.99200439453125, "learning_rate": 0.00047037424048918306, "logits/chosen": -14.768241882324219, "logits/rejected": -16.135135650634766, "logps/chosen": -2912.8603515625, "logps/rejected": -2288.896484375, "loss": 3.9573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.93094635009766, "rewards/margins": 13.405977249145508, "rewards/rejected": -121.33692932128906, "step": 27520 }, { "epoch": 1.59, "grad_norm": 0.0048347776755690575, "learning_rate": 0.0004701807345485507, "logits/chosen": -22.155248641967773, "logits/rejected": -23.48849105834961, "logps/chosen": -2397.518310546875, "logps/rejected": -2624.36279296875, "loss": 0.5976, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.8599395751953, "rewards/margins": 28.278247833251953, "rewards/rejected": -196.13816833496094, "step": 27530 }, { "epoch": 1.59, "grad_norm": 1.9294454034479713e-08, "learning_rate": 0.00046998722860791824, "logits/chosen": -17.703365325927734, "logits/rejected": -17.608623504638672, "logps/chosen": -2994.48681640625, "logps/rejected": -2777.890380859375, "loss": 3.7086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -182.84158325195312, "rewards/margins": 15.145666122436523, "rewards/rejected": -197.9872589111328, "step": 27540 }, { "epoch": 1.59, "grad_norm": 87.53982543945312, "learning_rate": 0.00046979372266728586, "logits/chosen": -21.106969833374023, "logits/rejected": -23.891128540039062, "logps/chosen": -2757.452880859375, "logps/rejected": -2757.0556640625, "loss": 2.8557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.14064025878906, "rewards/margins": 6.117984771728516, "rewards/rejected": -185.25863647460938, "step": 27550 }, { "epoch": 1.6, "grad_norm": 52.595726013183594, "learning_rate": 0.0004696002167266535, "logits/chosen": -17.49942398071289, "logits/rejected": -18.72466278076172, "logps/chosen": -2774.159912109375, "logps/rejected": -2709.2822265625, "loss": 20.2636, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -147.89173889160156, "rewards/margins": -18.106239318847656, "rewards/rejected": -129.78549194335938, "step": 27560 }, { "epoch": 1.6, "grad_norm": 86.64825439453125, "learning_rate": 0.00046940671078602114, "logits/chosen": -16.169841766357422, "logits/rejected": -16.458240509033203, "logps/chosen": -2908.921875, "logps/rejected": -2696.267822265625, "loss": 4.0531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -156.1244659423828, "rewards/margins": 12.24123764038086, "rewards/rejected": -168.36569213867188, "step": 27570 }, { "epoch": 1.6, "grad_norm": 81.09327697753906, "learning_rate": 0.00046921320484538876, "logits/chosen": -16.032215118408203, "logits/rejected": -16.622568130493164, "logps/chosen": -2485.16162109375, "logps/rejected": -2914.95068359375, "loss": 7.8756, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.13595581054688, "rewards/margins": -0.32970428466796875, "rewards/rejected": -135.80624389648438, "step": 27580 }, { "epoch": 1.6, "grad_norm": 3.8048547139624134e-05, "learning_rate": 0.0004690196989047564, "logits/chosen": -15.627090454101562, "logits/rejected": -19.233531951904297, "logps/chosen": -3044.73388671875, "logps/rejected": -2895.265869140625, "loss": 2.074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.56930541992188, "rewards/margins": 6.97965145111084, "rewards/rejected": -166.5489501953125, "step": 27590 }, { "epoch": 1.6, "grad_norm": 138.0214080810547, "learning_rate": 0.000468826192964124, "logits/chosen": -17.758502960205078, "logits/rejected": -19.079851150512695, "logps/chosen": -2804.947265625, "logps/rejected": -2589.14892578125, "loss": 4.9544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.34683227539062, "rewards/margins": 4.815802097320557, "rewards/rejected": -143.16262817382812, "step": 27600 }, { "epoch": 1.6, "grad_norm": 99.09574890136719, "learning_rate": 0.00046863268702349167, "logits/chosen": -18.320451736450195, "logits/rejected": -18.152904510498047, "logps/chosen": -2241.54296875, "logps/rejected": -2390.546142578125, "loss": 1.6433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.1753692626953, "rewards/margins": 3.948765277862549, "rewards/rejected": -174.12411499023438, "step": 27610 }, { "epoch": 1.6, "grad_norm": 0.0011690190294757485, "learning_rate": 0.0004684391810828593, "logits/chosen": -15.03795051574707, "logits/rejected": -15.472517013549805, "logps/chosen": -2672.72900390625, "logps/rejected": -2671.5625, "loss": 9.0313, "rewards/accuracies": 0.5, "rewards/chosen": -153.89512634277344, "rewards/margins": 6.504904270172119, "rewards/rejected": -160.4000244140625, "step": 27620 }, { "epoch": 1.6, "grad_norm": 1.5303534382837825e-05, "learning_rate": 0.0004682456751422269, "logits/chosen": -17.180198669433594, "logits/rejected": -20.05377197265625, "logps/chosen": -2948.653076171875, "logps/rejected": -2753.302978515625, "loss": 19.7174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.73829650878906, "rewards/margins": -9.736991882324219, "rewards/rejected": -181.00131225585938, "step": 27630 }, { "epoch": 1.6, "grad_norm": 2.833854434751079e-14, "learning_rate": 0.00046805216920159446, "logits/chosen": -20.36829948425293, "logits/rejected": -20.09051513671875, "logps/chosen": -2856.2099609375, "logps/rejected": -2704.65869140625, "loss": 15.9341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -212.4430694580078, "rewards/margins": -3.9750938415527344, "rewards/rejected": -208.46798706054688, "step": 27640 }, { "epoch": 1.6, "grad_norm": 0.01135601382702589, "learning_rate": 0.0004678586632609621, "logits/chosen": -17.368017196655273, "logits/rejected": -18.877553939819336, "logps/chosen": -2761.49560546875, "logps/rejected": -2837.24560546875, "loss": 9.1601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.2512664794922, "rewards/margins": 0.5776607394218445, "rewards/rejected": -175.82891845703125, "step": 27650 }, { "epoch": 1.6, "grad_norm": 31.13336181640625, "learning_rate": 0.00046766515732032975, "logits/chosen": -19.649517059326172, "logits/rejected": -19.30859375, "logps/chosen": -2388.27197265625, "logps/rejected": -2325.062255859375, "loss": 1.8045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.91151428222656, "rewards/margins": 4.470355987548828, "rewards/rejected": -200.38186645507812, "step": 27660 }, { "epoch": 1.6, "grad_norm": 9.035921096801758, "learning_rate": 0.00046747165137969737, "logits/chosen": -17.416759490966797, "logits/rejected": -18.472246170043945, "logps/chosen": -2714.16259765625, "logps/rejected": -2595.62451171875, "loss": 21.9612, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.51316833496094, "rewards/margins": -19.185319900512695, "rewards/rejected": -135.32785034179688, "step": 27670 }, { "epoch": 1.6, "grad_norm": 50.585418701171875, "learning_rate": 0.000467278145439065, "logits/chosen": -16.556926727294922, "logits/rejected": -16.843280792236328, "logps/chosen": -2287.769287109375, "logps/rejected": -2667.659423828125, "loss": 7.5679, "rewards/accuracies": 0.5, "rewards/chosen": -160.41897583007812, "rewards/margins": 6.293772220611572, "rewards/rejected": -166.71275329589844, "step": 27680 }, { "epoch": 1.6, "grad_norm": 58.8082389831543, "learning_rate": 0.0004670846394984326, "logits/chosen": -13.932184219360352, "logits/rejected": -15.508729934692383, "logps/chosen": -3451.436767578125, "logps/rejected": -3341.19140625, "loss": 18.6117, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -143.1194610595703, "rewards/margins": -17.919513702392578, "rewards/rejected": -125.19993591308594, "step": 27690 }, { "epoch": 1.6, "grad_norm": 77.14796447753906, "learning_rate": 0.0004668911335578002, "logits/chosen": -19.323158264160156, "logits/rejected": -18.291545867919922, "logps/chosen": -2649.34423828125, "logps/rejected": -2676.86865234375, "loss": 6.4355, "rewards/accuracies": 0.5, "rewards/chosen": -203.3838653564453, "rewards/margins": 0.27393990755081177, "rewards/rejected": -203.6577911376953, "step": 27700 }, { "epoch": 1.6, "grad_norm": 58.175148010253906, "learning_rate": 0.00046669762761716784, "logits/chosen": -17.515806198120117, "logits/rejected": -18.703527450561523, "logps/chosen": -2325.183837890625, "logps/rejected": -2243.322509765625, "loss": 3.4017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -196.43789672851562, "rewards/margins": 2.60298228263855, "rewards/rejected": -199.0408935546875, "step": 27710 }, { "epoch": 1.6, "grad_norm": 0.009582610800862312, "learning_rate": 0.0004665041216765355, "logits/chosen": -15.820643424987793, "logits/rejected": -18.379648208618164, "logps/chosen": -3098.00146484375, "logps/rejected": -2890.81787109375, "loss": 5.0856, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -157.30917358398438, "rewards/margins": -2.3950624465942383, "rewards/rejected": -154.91412353515625, "step": 27720 }, { "epoch": 1.61, "grad_norm": 0.0006020761211402714, "learning_rate": 0.00046631061573590313, "logits/chosen": -18.303695678710938, "logits/rejected": -19.255809783935547, "logps/chosen": -2369.17578125, "logps/rejected": -2431.843017578125, "loss": 7.4986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -176.52944946289062, "rewards/margins": 4.545392990112305, "rewards/rejected": -181.07485961914062, "step": 27730 }, { "epoch": 1.61, "grad_norm": 3.985256677614679e-13, "learning_rate": 0.00046611710979527074, "logits/chosen": -16.21103858947754, "logits/rejected": -16.589387893676758, "logps/chosen": -2773.607177734375, "logps/rejected": -2651.49267578125, "loss": 1.3238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -140.26820373535156, "rewards/margins": 12.595806121826172, "rewards/rejected": -152.86398315429688, "step": 27740 }, { "epoch": 1.61, "grad_norm": 2.700645696657178e-12, "learning_rate": 0.0004659236038546383, "logits/chosen": -17.362449645996094, "logits/rejected": -18.32296371459961, "logps/chosen": -2786.799560546875, "logps/rejected": -2471.305908203125, "loss": 13.1038, "rewards/accuracies": 0.5, "rewards/chosen": -177.61390686035156, "rewards/margins": -1.219659447669983, "rewards/rejected": -176.39425659179688, "step": 27750 }, { "epoch": 1.61, "grad_norm": 64.5507583618164, "learning_rate": 0.0004657300979140059, "logits/chosen": -17.342731475830078, "logits/rejected": -17.30244255065918, "logps/chosen": -2556.849365234375, "logps/rejected": -2616.19287109375, "loss": 4.5827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -111.3052749633789, "rewards/margins": 29.505615234375, "rewards/rejected": -140.81089782714844, "step": 27760 }, { "epoch": 1.61, "grad_norm": 62.48872756958008, "learning_rate": 0.0004655365919733736, "logits/chosen": -17.510631561279297, "logits/rejected": -18.08560562133789, "logps/chosen": -3005.917236328125, "logps/rejected": -2865.7783203125, "loss": 2.6086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.9217529296875, "rewards/margins": 4.630892276763916, "rewards/rejected": -195.55264282226562, "step": 27770 }, { "epoch": 1.61, "grad_norm": 31.265792846679688, "learning_rate": 0.0004653430860327412, "logits/chosen": -16.38150978088379, "logits/rejected": -16.564180374145508, "logps/chosen": -2739.341796875, "logps/rejected": -2531.778076171875, "loss": 5.9184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.73382568359375, "rewards/margins": 1.086553931236267, "rewards/rejected": -133.82037353515625, "step": 27780 }, { "epoch": 1.61, "grad_norm": 55.35999298095703, "learning_rate": 0.00046514958009210883, "logits/chosen": -18.343708038330078, "logits/rejected": -18.7193546295166, "logps/chosen": -2980.35302734375, "logps/rejected": -3168.24609375, "loss": 1.9606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -226.1863555908203, "rewards/margins": 18.02884864807129, "rewards/rejected": -244.21524047851562, "step": 27790 }, { "epoch": 1.61, "grad_norm": 0.06713487952947617, "learning_rate": 0.00046495607415147645, "logits/chosen": -15.565519332885742, "logits/rejected": -15.415826797485352, "logps/chosen": -3191.560546875, "logps/rejected": -3134.11865234375, "loss": 2.9453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.198486328125, "rewards/margins": 11.433491706848145, "rewards/rejected": -220.63198852539062, "step": 27800 }, { "epoch": 1.61, "grad_norm": 203.17840576171875, "learning_rate": 0.00046476256821084406, "logits/chosen": -16.311899185180664, "logits/rejected": -16.237722396850586, "logps/chosen": -3164.4736328125, "logps/rejected": -3229.84814453125, "loss": 10.023, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -196.02920532226562, "rewards/margins": -4.4255690574646, "rewards/rejected": -191.6036376953125, "step": 27810 }, { "epoch": 1.61, "grad_norm": 7.576020699673336e-09, "learning_rate": 0.00046456906227021174, "logits/chosen": -17.116315841674805, "logits/rejected": -18.646291732788086, "logps/chosen": -3109.843994140625, "logps/rejected": -2885.69580078125, "loss": 2.6589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -161.20114135742188, "rewards/margins": 13.154261589050293, "rewards/rejected": -174.3553924560547, "step": 27820 }, { "epoch": 1.61, "grad_norm": 1.829916850237508e-15, "learning_rate": 0.00046437555632957935, "logits/chosen": -16.516481399536133, "logits/rejected": -15.569259643554688, "logps/chosen": -2811.057373046875, "logps/rejected": -2939.09814453125, "loss": 7.0343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.80558013916016, "rewards/margins": 0.12219123542308807, "rewards/rejected": -118.92774963378906, "step": 27830 }, { "epoch": 1.61, "grad_norm": 0.4397110044956207, "learning_rate": 0.00046418205038894697, "logits/chosen": -15.87427043914795, "logits/rejected": -15.641159057617188, "logps/chosen": -2838.583984375, "logps/rejected": -2784.633544921875, "loss": 8.661, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -187.89340209960938, "rewards/margins": -2.633509874343872, "rewards/rejected": -185.2598876953125, "step": 27840 }, { "epoch": 1.61, "grad_norm": 1.5678530931472778, "learning_rate": 0.0004639885444483146, "logits/chosen": -14.633501052856445, "logits/rejected": -14.426678657531738, "logps/chosen": -3110.00634765625, "logps/rejected": -2718.87060546875, "loss": 1.7166, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -132.070556640625, "rewards/margins": 3.159627914428711, "rewards/rejected": -135.23019409179688, "step": 27850 }, { "epoch": 1.61, "grad_norm": 143.29641723632812, "learning_rate": 0.00046379503850768215, "logits/chosen": -16.473848342895508, "logits/rejected": -17.399761199951172, "logps/chosen": -3177.09521484375, "logps/rejected": -2775.897216796875, "loss": 6.7626, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -106.94688415527344, "rewards/margins": -2.3207030296325684, "rewards/rejected": -104.62617492675781, "step": 27860 }, { "epoch": 1.61, "grad_norm": 7.1817240715026855, "learning_rate": 0.0004636015325670498, "logits/chosen": -17.699251174926758, "logits/rejected": -18.11783218383789, "logps/chosen": -2989.71044921875, "logps/rejected": -2740.694580078125, "loss": 18.1173, "rewards/accuracies": 0.5, "rewards/chosen": -224.02163696289062, "rewards/margins": -8.092761039733887, "rewards/rejected": -215.9288787841797, "step": 27870 }, { "epoch": 1.61, "grad_norm": 68.22712707519531, "learning_rate": 0.00046340802662641744, "logits/chosen": -17.57354164123535, "logits/rejected": -17.590747833251953, "logps/chosen": -2974.691650390625, "logps/rejected": -2980.073974609375, "loss": 6.7609, "rewards/accuracies": 0.5, "rewards/chosen": -196.45651245117188, "rewards/margins": -1.503512978553772, "rewards/rejected": -194.95297241210938, "step": 27880 }, { "epoch": 1.61, "grad_norm": 1.987278210435761e-06, "learning_rate": 0.00046321452068578506, "logits/chosen": -14.965497970581055, "logits/rejected": -14.919459342956543, "logps/chosen": -2687.2626953125, "logps/rejected": -2599.16552734375, "loss": 3.1194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.50999450683594, "rewards/margins": 12.352337837219238, "rewards/rejected": -170.86233520507812, "step": 27890 }, { "epoch": 1.61, "grad_norm": 1.3814294106850866e-05, "learning_rate": 0.0004630210147451527, "logits/chosen": -12.675105094909668, "logits/rejected": -12.88923168182373, "logps/chosen": -3387.098388671875, "logps/rejected": -3307.451904296875, "loss": 1.3486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.6104736328125, "rewards/margins": 12.816957473754883, "rewards/rejected": -159.42742919921875, "step": 27900 }, { "epoch": 1.62, "grad_norm": 80.3752670288086, "learning_rate": 0.0004628275088045203, "logits/chosen": -16.179481506347656, "logits/rejected": -16.969470977783203, "logps/chosen": -2651.96728515625, "logps/rejected": -2659.64013671875, "loss": 4.9264, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -133.99708557128906, "rewards/margins": -0.5769832730293274, "rewards/rejected": -133.42010498046875, "step": 27910 }, { "epoch": 1.62, "grad_norm": 8.301440175273456e-06, "learning_rate": 0.0004626340028638879, "logits/chosen": -15.241785049438477, "logits/rejected": -15.558466911315918, "logps/chosen": -2903.57666015625, "logps/rejected": -2794.72509765625, "loss": 5.912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.4291534423828, "rewards/margins": 9.336542129516602, "rewards/rejected": -183.7656707763672, "step": 27920 }, { "epoch": 1.62, "grad_norm": 5.028536113438169e-14, "learning_rate": 0.0004624404969232556, "logits/chosen": -15.29798412322998, "logits/rejected": -15.58961009979248, "logps/chosen": -2685.828125, "logps/rejected": -2397.508056640625, "loss": 2.199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -90.69084930419922, "rewards/margins": 9.598798751831055, "rewards/rejected": -100.2896499633789, "step": 27930 }, { "epoch": 1.62, "grad_norm": 5.794523167423904e-05, "learning_rate": 0.0004622469909826232, "logits/chosen": -16.144712448120117, "logits/rejected": -16.000667572021484, "logps/chosen": -2500.5625, "logps/rejected": -2234.235107421875, "loss": 11.9233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.31478881835938, "rewards/margins": 12.233390808105469, "rewards/rejected": -146.5481719970703, "step": 27940 }, { "epoch": 1.62, "grad_norm": 1.3814591831584622e-11, "learning_rate": 0.0004620534850419908, "logits/chosen": -16.486019134521484, "logits/rejected": -17.22258758544922, "logps/chosen": -2903.702392578125, "logps/rejected": -2812.234130859375, "loss": 29.2182, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -187.640380859375, "rewards/margins": -24.034358978271484, "rewards/rejected": -163.6060333251953, "step": 27950 }, { "epoch": 1.62, "grad_norm": 0.0015187683748081326, "learning_rate": 0.00046185997910135843, "logits/chosen": -15.467744827270508, "logits/rejected": -14.605239868164062, "logps/chosen": -2863.7724609375, "logps/rejected": -2745.97900390625, "loss": 3.2114, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.23367309570312, "rewards/margins": 0.16081209480762482, "rewards/rejected": -137.3944854736328, "step": 27960 }, { "epoch": 1.62, "grad_norm": 183.14541625976562, "learning_rate": 0.000461666473160726, "logits/chosen": -18.556106567382812, "logits/rejected": -19.207168579101562, "logps/chosen": -2867.39013671875, "logps/rejected": -2754.387939453125, "loss": 9.8167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.24349975585938, "rewards/margins": 2.269524335861206, "rewards/rejected": -151.51303100585938, "step": 27970 }, { "epoch": 1.62, "grad_norm": 55.01988983154297, "learning_rate": 0.00046147296722009367, "logits/chosen": -16.401897430419922, "logits/rejected": -16.186817169189453, "logps/chosen": -3008.21630859375, "logps/rejected": -2859.155029296875, "loss": 17.7473, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -156.45794677734375, "rewards/margins": -13.464950561523438, "rewards/rejected": -142.99298095703125, "step": 27980 }, { "epoch": 1.62, "grad_norm": 0.0469706654548645, "learning_rate": 0.0004612794612794613, "logits/chosen": -16.213871002197266, "logits/rejected": -16.588092803955078, "logps/chosen": -2957.458251953125, "logps/rejected": -2911.755615234375, "loss": 5.8622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.97914123535156, "rewards/margins": 9.785627365112305, "rewards/rejected": -201.76475524902344, "step": 27990 }, { "epoch": 1.62, "grad_norm": 0.006166231352835894, "learning_rate": 0.0004610859553388289, "logits/chosen": -18.315860748291016, "logits/rejected": -19.069332122802734, "logps/chosen": -2938.35888671875, "logps/rejected": -2786.81787109375, "loss": 16.0852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -228.7654571533203, "rewards/margins": -9.29126262664795, "rewards/rejected": -219.4741973876953, "step": 28000 }, { "epoch": 1.62, "grad_norm": 27.461538314819336, "learning_rate": 0.0004608924493981965, "logits/chosen": -14.078512191772461, "logits/rejected": -14.181236267089844, "logps/chosen": -3421.192626953125, "logps/rejected": -3311.43115234375, "loss": 4.6028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.78445434570312, "rewards/margins": 2.144598960876465, "rewards/rejected": -161.92906188964844, "step": 28010 }, { "epoch": 1.62, "grad_norm": 1.176849603652954, "learning_rate": 0.00046069894345756413, "logits/chosen": -17.596576690673828, "logits/rejected": -18.083812713623047, "logps/chosen": -2517.424072265625, "logps/rejected": -2734.32568359375, "loss": 1.5921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -186.05613708496094, "rewards/margins": 10.90071964263916, "rewards/rejected": -196.95684814453125, "step": 28020 }, { "epoch": 1.62, "grad_norm": 2.3101884294440822e-10, "learning_rate": 0.0004605054375169318, "logits/chosen": -17.5357723236084, "logits/rejected": -18.378541946411133, "logps/chosen": -2686.006591796875, "logps/rejected": -2619.97119140625, "loss": 6.5037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.80026245117188, "rewards/margins": 5.3021650314331055, "rewards/rejected": -165.10240173339844, "step": 28030 }, { "epoch": 1.62, "grad_norm": 1.0901398658752441, "learning_rate": 0.0004603119315762994, "logits/chosen": -18.613548278808594, "logits/rejected": -19.224855422973633, "logps/chosen": -2814.579345703125, "logps/rejected": -2742.618408203125, "loss": 22.0472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.1383056640625, "rewards/margins": -8.765247344970703, "rewards/rejected": -170.373046875, "step": 28040 }, { "epoch": 1.62, "grad_norm": 0.0, "learning_rate": 0.00046011842563566704, "logits/chosen": -17.3115177154541, "logits/rejected": -17.243024826049805, "logps/chosen": -3208.042236328125, "logps/rejected": -2701.1220703125, "loss": 8.3237, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -179.54681396484375, "rewards/margins": 1.121769666671753, "rewards/rejected": -180.6685791015625, "step": 28050 }, { "epoch": 1.62, "grad_norm": 57.00117874145508, "learning_rate": 0.00045992491969503466, "logits/chosen": -15.7767972946167, "logits/rejected": -15.690324783325195, "logps/chosen": -2777.2275390625, "logps/rejected": -2599.15283203125, "loss": 3.742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.69235229492188, "rewards/margins": 3.5172648429870605, "rewards/rejected": -155.2096405029297, "step": 28060 }, { "epoch": 1.62, "grad_norm": 0.6085678935050964, "learning_rate": 0.0004597314137544023, "logits/chosen": -16.64187240600586, "logits/rejected": -16.26374053955078, "logps/chosen": -2809.974853515625, "logps/rejected": -2522.3798828125, "loss": 3.0965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.64991760253906, "rewards/margins": 10.618644714355469, "rewards/rejected": -196.26856994628906, "step": 28070 }, { "epoch": 1.63, "grad_norm": 0.02394140139222145, "learning_rate": 0.0004595379078137699, "logits/chosen": -17.668407440185547, "logits/rejected": -17.55241584777832, "logps/chosen": -2658.218017578125, "logps/rejected": -2383.9296875, "loss": 1.7172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.63571166992188, "rewards/margins": 4.505831718444824, "rewards/rejected": -174.1415252685547, "step": 28080 }, { "epoch": 1.63, "grad_norm": 0.00017986520833801478, "learning_rate": 0.0004593444018731375, "logits/chosen": -15.823883056640625, "logits/rejected": -15.68211555480957, "logps/chosen": -2839.713623046875, "logps/rejected": -2728.50244140625, "loss": 15.3891, "rewards/accuracies": 0.5, "rewards/chosen": -94.61793518066406, "rewards/margins": -8.425861358642578, "rewards/rejected": -86.19207000732422, "step": 28090 }, { "epoch": 1.63, "grad_norm": 7.482271671295166, "learning_rate": 0.0004591508959325051, "logits/chosen": -16.99359130859375, "logits/rejected": -16.611331939697266, "logps/chosen": -3142.301025390625, "logps/rejected": -3053.75830078125, "loss": 9.0923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.96865844726562, "rewards/margins": 4.289862155914307, "rewards/rejected": -199.25851440429688, "step": 28100 }, { "epoch": 1.63, "grad_norm": 158.60646057128906, "learning_rate": 0.00045895738999187274, "logits/chosen": -18.65938949584961, "logits/rejected": -18.603099822998047, "logps/chosen": -2553.23876953125, "logps/rejected": -2363.419189453125, "loss": 4.5312, "rewards/accuracies": 0.5, "rewards/chosen": -182.15780639648438, "rewards/margins": 10.138354301452637, "rewards/rejected": -192.29615783691406, "step": 28110 }, { "epoch": 1.63, "grad_norm": 4.4509288699146055e-08, "learning_rate": 0.00045876388405124036, "logits/chosen": -17.090011596679688, "logits/rejected": -17.01367950439453, "logps/chosen": -3065.21044921875, "logps/rejected": -2732.035400390625, "loss": 4.0354, "rewards/accuracies": 0.5, "rewards/chosen": -145.58657836914062, "rewards/margins": 12.514795303344727, "rewards/rejected": -158.10137939453125, "step": 28120 }, { "epoch": 1.63, "grad_norm": 48.48650360107422, "learning_rate": 0.000458570378110608, "logits/chosen": -17.333559036254883, "logits/rejected": -17.371273040771484, "logps/chosen": -2969.2490234375, "logps/rejected": -2717.097900390625, "loss": 4.0146, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -190.92236328125, "rewards/margins": -1.5930019617080688, "rewards/rejected": -189.32937622070312, "step": 28130 }, { "epoch": 1.63, "grad_norm": 4.714355782198254e-06, "learning_rate": 0.00045837687216997565, "logits/chosen": -17.444982528686523, "logits/rejected": -18.375160217285156, "logps/chosen": -2951.14501953125, "logps/rejected": -2902.80712890625, "loss": 5.8394, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.98760986328125, "rewards/margins": 2.9353508949279785, "rewards/rejected": -184.92294311523438, "step": 28140 }, { "epoch": 1.63, "grad_norm": 2.396945086857727e-09, "learning_rate": 0.00045818336622934327, "logits/chosen": -16.549020767211914, "logits/rejected": -17.646564483642578, "logps/chosen": -2303.26123046875, "logps/rejected": -2260.82568359375, "loss": 23.7251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.86785888671875, "rewards/margins": -3.4053680896759033, "rewards/rejected": -117.4625015258789, "step": 28150 }, { "epoch": 1.63, "grad_norm": 0.40857893228530884, "learning_rate": 0.0004579898602887109, "logits/chosen": -15.011322021484375, "logits/rejected": -16.096750259399414, "logps/chosen": -3126.49853515625, "logps/rejected": -2960.571044921875, "loss": 1.4042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -169.36280822753906, "rewards/margins": 4.455364227294922, "rewards/rejected": -173.8181610107422, "step": 28160 }, { "epoch": 1.63, "grad_norm": 2.2621067887786117e-13, "learning_rate": 0.0004577963543480785, "logits/chosen": -19.20281982421875, "logits/rejected": -18.43547821044922, "logps/chosen": -2506.80908203125, "logps/rejected": -2491.577392578125, "loss": 7.3778, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -213.98294067382812, "rewards/margins": 0.3659873902797699, "rewards/rejected": -214.3489227294922, "step": 28170 }, { "epoch": 1.63, "grad_norm": 0.00027671997668221593, "learning_rate": 0.0004576028484074461, "logits/chosen": -14.194526672363281, "logits/rejected": -15.197209358215332, "logps/chosen": -3096.725830078125, "logps/rejected": -2903.07568359375, "loss": 1.5205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -105.2052230834961, "rewards/margins": 13.939964294433594, "rewards/rejected": -119.14517974853516, "step": 28180 }, { "epoch": 1.63, "grad_norm": 0.00013432068226393312, "learning_rate": 0.00045740934246681373, "logits/chosen": -12.572689056396484, "logits/rejected": -12.516586303710938, "logps/chosen": -3091.219970703125, "logps/rejected": -2871.550048828125, "loss": 3.8887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.219263076782227, "rewards/margins": 8.22901725769043, "rewards/rejected": -31.44827651977539, "step": 28190 }, { "epoch": 1.63, "grad_norm": 3.099438217191164e-08, "learning_rate": 0.00045721583652618135, "logits/chosen": -16.858112335205078, "logits/rejected": -17.826385498046875, "logps/chosen": -3224.306884765625, "logps/rejected": -2841.241455078125, "loss": 3.7696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.36251831054688, "rewards/margins": 6.261305332183838, "rewards/rejected": -174.6238250732422, "step": 28200 }, { "epoch": 1.63, "grad_norm": 45.29963302612305, "learning_rate": 0.00045702233058554897, "logits/chosen": -19.694950103759766, "logits/rejected": -19.73958969116211, "logps/chosen": -2794.90380859375, "logps/rejected": -2693.980224609375, "loss": 4.8524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -206.1373291015625, "rewards/margins": 3.3609752655029297, "rewards/rejected": -209.49832153320312, "step": 28210 }, { "epoch": 1.63, "grad_norm": 44.33808898925781, "learning_rate": 0.0004568288246449166, "logits/chosen": -17.2276554107666, "logits/rejected": -18.74003791809082, "logps/chosen": -2816.83154296875, "logps/rejected": -2641.74462890625, "loss": 14.7058, "rewards/accuracies": 0.5, "rewards/chosen": -167.24632263183594, "rewards/margins": -9.168645858764648, "rewards/rejected": -158.0776824951172, "step": 28220 }, { "epoch": 1.63, "grad_norm": 2.4272478124642838e-11, "learning_rate": 0.0004566353187042842, "logits/chosen": -17.701337814331055, "logits/rejected": -18.08087158203125, "logps/chosen": -2705.0458984375, "logps/rejected": -2589.55419921875, "loss": 5.7232, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.32489013671875, "rewards/margins": 1.7203285694122314, "rewards/rejected": -184.0452117919922, "step": 28230 }, { "epoch": 1.63, "grad_norm": 20.44982147216797, "learning_rate": 0.0004564418127636519, "logits/chosen": -16.963790893554688, "logits/rejected": -17.02304458618164, "logps/chosen": -2678.387451171875, "logps/rejected": -2500.97802734375, "loss": 12.5933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.8507080078125, "rewards/margins": -7.600551605224609, "rewards/rejected": -158.25015258789062, "step": 28240 }, { "epoch": 1.64, "grad_norm": 110.2278823852539, "learning_rate": 0.0004562483068230195, "logits/chosen": -18.078807830810547, "logits/rejected": -18.32603645324707, "logps/chosen": -2560.6484375, "logps/rejected": -2618.776611328125, "loss": 22.4947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.59988403320312, "rewards/margins": -18.86224365234375, "rewards/rejected": -159.73764038085938, "step": 28250 }, { "epoch": 1.64, "grad_norm": 0.3106825649738312, "learning_rate": 0.0004560548008823871, "logits/chosen": -16.426204681396484, "logits/rejected": -16.469600677490234, "logps/chosen": -2784.6259765625, "logps/rejected": -2662.048828125, "loss": 6.7173, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -167.69651794433594, "rewards/margins": -0.8551734089851379, "rewards/rejected": -166.84136962890625, "step": 28260 }, { "epoch": 1.64, "grad_norm": 30.357563018798828, "learning_rate": 0.0004558612949417547, "logits/chosen": -15.21125316619873, "logits/rejected": -15.25201416015625, "logps/chosen": -3075.88134765625, "logps/rejected": -2731.36376953125, "loss": 15.9004, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -171.43246459960938, "rewards/margins": -13.352285385131836, "rewards/rejected": -158.08016967773438, "step": 28270 }, { "epoch": 1.64, "grad_norm": 1.807445926260698e-08, "learning_rate": 0.00045566778900112234, "logits/chosen": -19.00128173828125, "logits/rejected": -20.827367782592773, "logps/chosen": -2575.214111328125, "logps/rejected": -2401.953857421875, "loss": 10.7326, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.98118591308594, "rewards/margins": -4.758833885192871, "rewards/rejected": -168.2223358154297, "step": 28280 }, { "epoch": 1.64, "grad_norm": 1.0699844360351562, "learning_rate": 0.00045547428306049, "logits/chosen": -18.19675064086914, "logits/rejected": -18.503923416137695, "logps/chosen": -3017.12109375, "logps/rejected": -2895.184326171875, "loss": 8.431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -183.58453369140625, "rewards/margins": 3.5392653942108154, "rewards/rejected": -187.12379455566406, "step": 28290 }, { "epoch": 1.64, "grad_norm": 77.28662109375, "learning_rate": 0.0004552807771198576, "logits/chosen": -18.103010177612305, "logits/rejected": -18.440326690673828, "logps/chosen": -2788.42822265625, "logps/rejected": -2455.10009765625, "loss": 9.8245, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -189.2115020751953, "rewards/margins": -6.513760566711426, "rewards/rejected": -182.69772338867188, "step": 28300 }, { "epoch": 1.64, "grad_norm": 97.5425033569336, "learning_rate": 0.0004550872711792252, "logits/chosen": -17.26065444946289, "logits/rejected": -17.73639678955078, "logps/chosen": -2427.875244140625, "logps/rejected": -2363.149169921875, "loss": 9.7769, "rewards/accuracies": 0.5, "rewards/chosen": -209.25833129882812, "rewards/margins": -3.413412570953369, "rewards/rejected": -205.8448944091797, "step": 28310 }, { "epoch": 1.64, "grad_norm": 4.709999084472656, "learning_rate": 0.0004548937652385928, "logits/chosen": -15.696393013000488, "logits/rejected": -15.89990520477295, "logps/chosen": -2859.6748046875, "logps/rejected": -2836.247314453125, "loss": 10.215, "rewards/accuracies": 0.5, "rewards/chosen": -155.6575469970703, "rewards/margins": -6.661366939544678, "rewards/rejected": -148.99618530273438, "step": 28320 }, { "epoch": 1.64, "grad_norm": 2.5671141147613525, "learning_rate": 0.00045470025929796043, "logits/chosen": -18.502117156982422, "logits/rejected": -19.8186092376709, "logps/chosen": -2708.526123046875, "logps/rejected": -2611.5869140625, "loss": 0.5847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.86602783203125, "rewards/margins": 8.727124214172363, "rewards/rejected": -181.5931396484375, "step": 28330 }, { "epoch": 1.64, "grad_norm": 0.00041961672832258046, "learning_rate": 0.0004545067533573281, "logits/chosen": -18.585641860961914, "logits/rejected": -19.14168930053711, "logps/chosen": -2496.20263671875, "logps/rejected": -2477.578857421875, "loss": 2.2258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.168701171875, "rewards/margins": 8.291677474975586, "rewards/rejected": -165.4603729248047, "step": 28340 }, { "epoch": 1.64, "grad_norm": 126.32775115966797, "learning_rate": 0.0004543132474166957, "logits/chosen": -15.03771686553955, "logits/rejected": -14.782516479492188, "logps/chosen": -2858.696533203125, "logps/rejected": -2622.29931640625, "loss": 4.9166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -105.28651428222656, "rewards/margins": 5.44364070892334, "rewards/rejected": -110.73014831542969, "step": 28350 }, { "epoch": 1.64, "grad_norm": 1.4282494076311991e-09, "learning_rate": 0.00045411974147606334, "logits/chosen": -18.501922607421875, "logits/rejected": -18.870264053344727, "logps/chosen": -2933.774169921875, "logps/rejected": -2743.87744140625, "loss": 4.0967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -184.08059692382812, "rewards/margins": 12.913421630859375, "rewards/rejected": -196.99400329589844, "step": 28360 }, { "epoch": 1.64, "grad_norm": 37.26958465576172, "learning_rate": 0.00045392623553543095, "logits/chosen": -15.9149808883667, "logits/rejected": -15.956583023071289, "logps/chosen": -2897.27099609375, "logps/rejected": -3009.8671875, "loss": 7.5737, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.7025146484375, "rewards/margins": -2.300010919570923, "rewards/rejected": -152.40249633789062, "step": 28370 }, { "epoch": 1.64, "grad_norm": 8.642017364501953, "learning_rate": 0.00045373272959479857, "logits/chosen": -15.929784774780273, "logits/rejected": -17.337656021118164, "logps/chosen": -3044.26123046875, "logps/rejected": -2773.87646484375, "loss": 1.5336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -68.95206451416016, "rewards/margins": 29.424413681030273, "rewards/rejected": -98.37648010253906, "step": 28380 }, { "epoch": 1.64, "grad_norm": 2.7781416633843037e-07, "learning_rate": 0.0004535392236541662, "logits/chosen": -16.839494705200195, "logits/rejected": -17.5467472076416, "logps/chosen": -2907.845458984375, "logps/rejected": -3147.762451171875, "loss": 6.0372, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.49288940429688, "rewards/margins": 6.294314384460449, "rewards/rejected": -175.78720092773438, "step": 28390 }, { "epoch": 1.64, "grad_norm": 0.00344503833912313, "learning_rate": 0.00045334571771353386, "logits/chosen": -17.361488342285156, "logits/rejected": -17.496532440185547, "logps/chosen": -2319.93359375, "logps/rejected": -2316.615478515625, "loss": 2.9346, "rewards/accuracies": 0.5, "rewards/chosen": -166.72964477539062, "rewards/margins": 3.4889893531799316, "rewards/rejected": -170.21859741210938, "step": 28400 }, { "epoch": 1.64, "grad_norm": 19.295421600341797, "learning_rate": 0.0004531522117729014, "logits/chosen": -13.399495124816895, "logits/rejected": -13.821510314941406, "logps/chosen": -3001.18603515625, "logps/rejected": -2359.33056640625, "loss": 15.0745, "rewards/accuracies": 0.5, "rewards/chosen": -136.2581329345703, "rewards/margins": -5.082542419433594, "rewards/rejected": -131.1755828857422, "step": 28410 }, { "epoch": 1.65, "grad_norm": 2.94562226173245e-13, "learning_rate": 0.00045295870583226904, "logits/chosen": -16.711763381958008, "logits/rejected": -17.699398040771484, "logps/chosen": -2740.27880859375, "logps/rejected": -2379.68115234375, "loss": 2.2111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.6283721923828, "rewards/margins": 8.165392875671387, "rewards/rejected": -181.79376220703125, "step": 28420 }, { "epoch": 1.65, "grad_norm": 15.499690055847168, "learning_rate": 0.00045276519989163666, "logits/chosen": -16.8765926361084, "logits/rejected": -16.874719619750977, "logps/chosen": -2569.925537109375, "logps/rejected": -2606.18115234375, "loss": 2.4014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.55226135253906, "rewards/margins": 16.459999084472656, "rewards/rejected": -197.0122528076172, "step": 28430 }, { "epoch": 1.65, "grad_norm": 71.19256591796875, "learning_rate": 0.00045257169395100427, "logits/chosen": -15.712750434875488, "logits/rejected": -16.18406105041504, "logps/chosen": -2542.739990234375, "logps/rejected": -2470.02490234375, "loss": 6.0874, "rewards/accuracies": 0.5, "rewards/chosen": -148.68202209472656, "rewards/margins": 0.23703384399414062, "rewards/rejected": -148.9190673828125, "step": 28440 }, { "epoch": 1.65, "grad_norm": 1.406212124521744e-08, "learning_rate": 0.00045237818801037194, "logits/chosen": -16.69571876525879, "logits/rejected": -16.83167266845703, "logps/chosen": -2447.35498046875, "logps/rejected": -2459.280029296875, "loss": 1.7112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -208.876708984375, "rewards/margins": 5.950006008148193, "rewards/rejected": -214.8267364501953, "step": 28450 }, { "epoch": 1.65, "grad_norm": 18.121549606323242, "learning_rate": 0.00045218468206973956, "logits/chosen": -15.00657844543457, "logits/rejected": -14.897351264953613, "logps/chosen": -2935.49462890625, "logps/rejected": -2710.54150390625, "loss": 17.2298, "rewards/accuracies": 0.5, "rewards/chosen": -148.05169677734375, "rewards/margins": -12.536757469177246, "rewards/rejected": -135.5149383544922, "step": 28460 }, { "epoch": 1.65, "grad_norm": 8.163612365722656, "learning_rate": 0.0004519911761291072, "logits/chosen": -18.30097007751465, "logits/rejected": -19.397258758544922, "logps/chosen": -2595.54296875, "logps/rejected": -2564.893310546875, "loss": 3.7515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -185.2590789794922, "rewards/margins": -0.18909625709056854, "rewards/rejected": -185.0699920654297, "step": 28470 }, { "epoch": 1.65, "grad_norm": 16.93852424621582, "learning_rate": 0.0004517976701884748, "logits/chosen": -15.79731559753418, "logits/rejected": -15.836563110351562, "logps/chosen": -2493.6552734375, "logps/rejected": -2483.7265625, "loss": 1.761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.4764404296875, "rewards/margins": 4.3097076416015625, "rewards/rejected": -149.78614807128906, "step": 28480 }, { "epoch": 1.65, "grad_norm": 0.1392735093832016, "learning_rate": 0.0004516041642478424, "logits/chosen": -16.19487190246582, "logits/rejected": -16.32891082763672, "logps/chosen": -2736.239990234375, "logps/rejected": -2843.22998046875, "loss": 4.046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.31956481933594, "rewards/margins": 6.484144687652588, "rewards/rejected": -175.80374145507812, "step": 28490 }, { "epoch": 1.65, "grad_norm": 13.0844144821167, "learning_rate": 0.0004514106583072101, "logits/chosen": -17.877849578857422, "logits/rejected": -18.096229553222656, "logps/chosen": -2512.416015625, "logps/rejected": -2286.327392578125, "loss": 19.0295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -167.57846069335938, "rewards/margins": -8.655475616455078, "rewards/rejected": -158.92300415039062, "step": 28500 }, { "epoch": 1.65, "grad_norm": 0.10208005458116531, "learning_rate": 0.0004512171523665777, "logits/chosen": -14.360333442687988, "logits/rejected": -14.326991081237793, "logps/chosen": -2488.62109375, "logps/rejected": -2516.50439453125, "loss": 2.6124, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -94.22428894042969, "rewards/margins": 7.980248928070068, "rewards/rejected": -102.20453643798828, "step": 28510 }, { "epoch": 1.65, "grad_norm": 61.23680877685547, "learning_rate": 0.00045102364642594526, "logits/chosen": -15.899566650390625, "logits/rejected": -18.661376953125, "logps/chosen": -2911.102783203125, "logps/rejected": -2808.0068359375, "loss": 25.0482, "rewards/accuracies": 0.5, "rewards/chosen": -156.70855712890625, "rewards/margins": -18.372417449951172, "rewards/rejected": -138.33615112304688, "step": 28520 }, { "epoch": 1.65, "grad_norm": 3.688367699369172e-15, "learning_rate": 0.0004508301404853129, "logits/chosen": -17.60675048828125, "logits/rejected": -17.681808471679688, "logps/chosen": -2711.675048828125, "logps/rejected": -2456.87548828125, "loss": 3.336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -140.43203735351562, "rewards/margins": 12.169889450073242, "rewards/rejected": -152.60195922851562, "step": 28530 }, { "epoch": 1.65, "grad_norm": 114.2137680053711, "learning_rate": 0.0004506366345446805, "logits/chosen": -16.669204711914062, "logits/rejected": -16.162296295166016, "logps/chosen": -2501.71435546875, "logps/rejected": -2506.626953125, "loss": 9.0151, "rewards/accuracies": 0.5, "rewards/chosen": -90.10932922363281, "rewards/margins": 1.0196692943572998, "rewards/rejected": -91.12899780273438, "step": 28540 }, { "epoch": 1.65, "grad_norm": 58.1205940246582, "learning_rate": 0.00045044312860404817, "logits/chosen": -15.727025032043457, "logits/rejected": -15.945890426635742, "logps/chosen": -3139.264892578125, "logps/rejected": -2493.572998046875, "loss": 26.9881, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.500244140625, "rewards/margins": -12.30107593536377, "rewards/rejected": -112.19918060302734, "step": 28550 }, { "epoch": 1.65, "grad_norm": 67.57653045654297, "learning_rate": 0.0004502496226634158, "logits/chosen": -15.819190979003906, "logits/rejected": -15.780351638793945, "logps/chosen": -2849.24755859375, "logps/rejected": -2567.1337890625, "loss": 6.6218, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -163.65240478515625, "rewards/margins": 10.457280158996582, "rewards/rejected": -174.10968017578125, "step": 28560 }, { "epoch": 1.65, "grad_norm": 96.30073547363281, "learning_rate": 0.0004500561167227834, "logits/chosen": -15.428865432739258, "logits/rejected": -16.208415985107422, "logps/chosen": -2905.32568359375, "logps/rejected": -2445.04443359375, "loss": 7.1945, "rewards/accuracies": 0.5, "rewards/chosen": -87.04197692871094, "rewards/margins": 10.06214427947998, "rewards/rejected": -97.1041259765625, "step": 28570 }, { "epoch": 1.65, "grad_norm": 0.6277463436126709, "learning_rate": 0.000449862610782151, "logits/chosen": -14.553645133972168, "logits/rejected": -14.22282886505127, "logps/chosen": -2378.5400390625, "logps/rejected": -2519.775146484375, "loss": 3.8541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.74806213378906, "rewards/margins": 2.843743085861206, "rewards/rejected": -137.59178161621094, "step": 28580 }, { "epoch": 1.65, "grad_norm": 41.58224868774414, "learning_rate": 0.00044966910484151864, "logits/chosen": -14.465944290161133, "logits/rejected": -14.852041244506836, "logps/chosen": -2609.881103515625, "logps/rejected": -2638.895263671875, "loss": 1.4943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.27487182617188, "rewards/margins": 12.500468254089355, "rewards/rejected": -189.77536010742188, "step": 28590 }, { "epoch": 1.66, "grad_norm": 102.85394287109375, "learning_rate": 0.00044947559890088626, "logits/chosen": -12.56580924987793, "logits/rejected": -12.517398834228516, "logps/chosen": -2852.800537109375, "logps/rejected": -3076.356689453125, "loss": 14.6318, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -77.3061752319336, "rewards/margins": 1.8034645318984985, "rewards/rejected": -79.1096420288086, "step": 28600 }, { "epoch": 1.66, "grad_norm": 59.9183464050293, "learning_rate": 0.00044928209296025393, "logits/chosen": -12.772860527038574, "logits/rejected": -12.46212100982666, "logps/chosen": -2862.982177734375, "logps/rejected": -2630.7939453125, "loss": 7.0182, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -154.35928344726562, "rewards/margins": -3.5122666358947754, "rewards/rejected": -150.84701538085938, "step": 28610 }, { "epoch": 1.66, "grad_norm": 0.007661894429475069, "learning_rate": 0.00044908858701962154, "logits/chosen": -12.636494636535645, "logits/rejected": -12.811367988586426, "logps/chosen": -3059.510986328125, "logps/rejected": -2941.2275390625, "loss": 1.1833, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -136.83627319335938, "rewards/margins": 11.05646800994873, "rewards/rejected": -147.89273071289062, "step": 28620 }, { "epoch": 1.66, "grad_norm": 88.91912841796875, "learning_rate": 0.0004488950810789891, "logits/chosen": -15.375093460083008, "logits/rejected": -15.664495468139648, "logps/chosen": -2578.239013671875, "logps/rejected": -2537.697265625, "loss": 18.363, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -149.40440368652344, "rewards/margins": -0.34529417753219604, "rewards/rejected": -149.05911254882812, "step": 28630 }, { "epoch": 1.66, "grad_norm": 1.051547560138033e-08, "learning_rate": 0.0004487015751383567, "logits/chosen": -13.967143058776855, "logits/rejected": -14.419891357421875, "logps/chosen": -2686.489501953125, "logps/rejected": -2562.35791015625, "loss": 5.6579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.4016876220703, "rewards/margins": 8.923449516296387, "rewards/rejected": -157.3251190185547, "step": 28640 }, { "epoch": 1.66, "grad_norm": 44.375640869140625, "learning_rate": 0.00044850806919772434, "logits/chosen": -14.87153148651123, "logits/rejected": -14.775288581848145, "logps/chosen": -2578.650634765625, "logps/rejected": -2436.78857421875, "loss": 2.0841, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -126.47447204589844, "rewards/margins": 25.647851943969727, "rewards/rejected": -152.122314453125, "step": 28650 }, { "epoch": 1.66, "grad_norm": 80.80133056640625, "learning_rate": 0.000448314563257092, "logits/chosen": -15.38453483581543, "logits/rejected": -16.02054214477539, "logps/chosen": -2798.6845703125, "logps/rejected": -2635.885498046875, "loss": 16.0972, "rewards/accuracies": 0.5, "rewards/chosen": -109.2973861694336, "rewards/margins": -11.995893478393555, "rewards/rejected": -97.30149841308594, "step": 28660 }, { "epoch": 1.66, "grad_norm": 0.2792450785636902, "learning_rate": 0.00044812105731645963, "logits/chosen": -17.164016723632812, "logits/rejected": -17.426626205444336, "logps/chosen": -2775.54931640625, "logps/rejected": -2570.706787109375, "loss": 7.6597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.9853515625, "rewards/margins": -3.9313950538635254, "rewards/rejected": -165.053955078125, "step": 28670 }, { "epoch": 1.66, "grad_norm": 59.31539535522461, "learning_rate": 0.00044792755137582725, "logits/chosen": -14.583099365234375, "logits/rejected": -14.654935836791992, "logps/chosen": -2976.446533203125, "logps/rejected": -2888.68115234375, "loss": 10.7702, "rewards/accuracies": 0.5, "rewards/chosen": -117.79359436035156, "rewards/margins": -0.1471717804670334, "rewards/rejected": -117.64642333984375, "step": 28680 }, { "epoch": 1.66, "grad_norm": 3.230269242197881e-14, "learning_rate": 0.00044773404543519487, "logits/chosen": -19.083290100097656, "logits/rejected": -19.96026611328125, "logps/chosen": -2509.42236328125, "logps/rejected": -2529.121826171875, "loss": 8.66, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -186.9440155029297, "rewards/margins": 6.954073905944824, "rewards/rejected": -193.8980712890625, "step": 28690 }, { "epoch": 1.66, "grad_norm": 79.78903198242188, "learning_rate": 0.0004475405394945625, "logits/chosen": -14.238889694213867, "logits/rejected": -14.866724967956543, "logps/chosen": -3069.77197265625, "logps/rejected": -2955.188232421875, "loss": 3.4582, "rewards/accuracies": 0.5, "rewards/chosen": -154.68386840820312, "rewards/margins": 5.830147743225098, "rewards/rejected": -160.51400756835938, "step": 28700 }, { "epoch": 1.66, "grad_norm": 35.66518783569336, "learning_rate": 0.00044734703355393015, "logits/chosen": -16.62724494934082, "logits/rejected": -16.471012115478516, "logps/chosen": -2612.85693359375, "logps/rejected": -2721.388916015625, "loss": 1.244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -219.74722290039062, "rewards/margins": 23.908676147460938, "rewards/rejected": -243.65591430664062, "step": 28710 }, { "epoch": 1.66, "grad_norm": 473.7662658691406, "learning_rate": 0.00044715352761329777, "logits/chosen": -17.33135414123535, "logits/rejected": -18.4636173248291, "logps/chosen": -2446.330322265625, "logps/rejected": -2378.68017578125, "loss": 18.4936, "rewards/accuracies": 0.5, "rewards/chosen": -181.2833251953125, "rewards/margins": -6.705801486968994, "rewards/rejected": -174.57752990722656, "step": 28720 }, { "epoch": 1.66, "grad_norm": 12.197683334350586, "learning_rate": 0.0004469600216726654, "logits/chosen": -14.97424602508545, "logits/rejected": -16.35298728942871, "logps/chosen": -2739.52978515625, "logps/rejected": -2913.26416015625, "loss": 7.898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.67347717285156, "rewards/margins": 0.9111831784248352, "rewards/rejected": -124.58467102050781, "step": 28730 }, { "epoch": 1.66, "grad_norm": 109.4236831665039, "learning_rate": 0.00044676651573203295, "logits/chosen": -13.01915168762207, "logits/rejected": -13.065943717956543, "logps/chosen": -3048.785888671875, "logps/rejected": -3035.649658203125, "loss": 4.235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.70622253417969, "rewards/margins": 3.2095038890838623, "rewards/rejected": -112.91572570800781, "step": 28740 }, { "epoch": 1.66, "grad_norm": 0.018854539841413498, "learning_rate": 0.00044657300979140057, "logits/chosen": -11.795205116271973, "logits/rejected": -11.856903076171875, "logps/chosen": -3134.36767578125, "logps/rejected": -2725.763671875, "loss": 1.0552, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -71.19120025634766, "rewards/margins": 19.146793365478516, "rewards/rejected": -90.33799743652344, "step": 28750 }, { "epoch": 1.66, "grad_norm": 85.0307388305664, "learning_rate": 0.00044637950385076824, "logits/chosen": -14.961047172546387, "logits/rejected": -15.483294486999512, "logps/chosen": -3201.60888671875, "logps/rejected": -2992.68017578125, "loss": 6.8238, "rewards/accuracies": 0.5, "rewards/chosen": -166.82957458496094, "rewards/margins": -2.4768872261047363, "rewards/rejected": -164.3527069091797, "step": 28760 }, { "epoch": 1.67, "grad_norm": 8.179985046386719, "learning_rate": 0.00044618599791013586, "logits/chosen": -18.83025550842285, "logits/rejected": -19.1065616607666, "logps/chosen": -2594.858154296875, "logps/rejected": -2528.22265625, "loss": 10.6603, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -195.53672790527344, "rewards/margins": -6.23977518081665, "rewards/rejected": -189.29696655273438, "step": 28770 }, { "epoch": 1.67, "grad_norm": 0.0019140404183417559, "learning_rate": 0.0004459924919695035, "logits/chosen": -17.292573928833008, "logits/rejected": -17.798110961914062, "logps/chosen": -2482.469482421875, "logps/rejected": -2700.280029296875, "loss": 0.368, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -193.96578979492188, "rewards/margins": 26.910919189453125, "rewards/rejected": -220.87667846679688, "step": 28780 }, { "epoch": 1.67, "grad_norm": 0.0033093993552029133, "learning_rate": 0.0004457989860288711, "logits/chosen": -16.241756439208984, "logits/rejected": -15.73950481414795, "logps/chosen": -3109.621826171875, "logps/rejected": -3049.453857421875, "loss": 6.3173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -268.46697998046875, "rewards/margins": 3.4101791381835938, "rewards/rejected": -271.87713623046875, "step": 28790 }, { "epoch": 1.67, "grad_norm": 3.1479241847991943, "learning_rate": 0.0004456054800882387, "logits/chosen": -15.515302658081055, "logits/rejected": -16.060161590576172, "logps/chosen": -2715.06103515625, "logps/rejected": -2796.950439453125, "loss": 6.6138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.40248107910156, "rewards/margins": 1.1828018426895142, "rewards/rejected": -191.5852813720703, "step": 28800 }, { "epoch": 1.67, "grad_norm": 1.9591867683743658e-08, "learning_rate": 0.0004454119741476063, "logits/chosen": -13.409368515014648, "logits/rejected": -13.343365669250488, "logps/chosen": -2796.02783203125, "logps/rejected": -2739.670654296875, "loss": 4.0461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.28004455566406, "rewards/margins": 5.650820255279541, "rewards/rejected": -159.9308624267578, "step": 28810 }, { "epoch": 1.67, "grad_norm": 0.0008267749799415469, "learning_rate": 0.000445218468206974, "logits/chosen": -12.826948165893555, "logits/rejected": -14.786962509155273, "logps/chosen": -2999.491455078125, "logps/rejected": -2992.30126953125, "loss": 8.6082, "rewards/accuracies": 0.5, "rewards/chosen": -137.38037109375, "rewards/margins": -2.3578267097473145, "rewards/rejected": -135.02255249023438, "step": 28820 }, { "epoch": 1.67, "grad_norm": 102.15542602539062, "learning_rate": 0.0004450249622663416, "logits/chosen": -14.414782524108887, "logits/rejected": -14.312841415405273, "logps/chosen": -2720.38818359375, "logps/rejected": -2588.62451171875, "loss": 10.6168, "rewards/accuracies": 0.5, "rewards/chosen": -141.01473999023438, "rewards/margins": -1.9744819402694702, "rewards/rejected": -139.04026794433594, "step": 28830 }, { "epoch": 1.67, "grad_norm": 3.793222180803468e-09, "learning_rate": 0.00044483145632570923, "logits/chosen": -15.2645902633667, "logits/rejected": -15.3665132522583, "logps/chosen": -2938.388671875, "logps/rejected": -2561.3466796875, "loss": 3.97, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.82774353027344, "rewards/margins": 19.803142547607422, "rewards/rejected": -165.63088989257812, "step": 28840 }, { "epoch": 1.67, "grad_norm": 29.426437377929688, "learning_rate": 0.0004446379503850768, "logits/chosen": -14.682035446166992, "logits/rejected": -14.824041366577148, "logps/chosen": -2666.773681640625, "logps/rejected": -2692.77587890625, "loss": 8.4408, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.23292541503906, "rewards/margins": -3.323455333709717, "rewards/rejected": -153.9094696044922, "step": 28850 }, { "epoch": 1.67, "grad_norm": 62.858177185058594, "learning_rate": 0.0004444444444444444, "logits/chosen": -14.538686752319336, "logits/rejected": -14.25940990447998, "logps/chosen": -2576.921142578125, "logps/rejected": -2548.152587890625, "loss": 4.7505, "rewards/accuracies": 0.5, "rewards/chosen": -173.5047149658203, "rewards/margins": 1.3574588298797607, "rewards/rejected": -174.86219787597656, "step": 28860 }, { "epoch": 1.67, "grad_norm": 3.7335269424676465e-12, "learning_rate": 0.0004442509385038121, "logits/chosen": -13.517240524291992, "logits/rejected": -13.139615058898926, "logps/chosen": -2900.7724609375, "logps/rejected": -2754.53515625, "loss": 4.7647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.54531860351562, "rewards/margins": 3.2989563941955566, "rewards/rejected": -147.84426879882812, "step": 28870 }, { "epoch": 1.67, "grad_norm": 23.896028518676758, "learning_rate": 0.0004440574325631797, "logits/chosen": -17.82016944885254, "logits/rejected": -17.937232971191406, "logps/chosen": -2478.09033203125, "logps/rejected": -2474.385009765625, "loss": 2.7923, "rewards/accuracies": 0.5, "rewards/chosen": -221.2941436767578, "rewards/margins": 0.6991928219795227, "rewards/rejected": -221.9933319091797, "step": 28880 }, { "epoch": 1.67, "grad_norm": 19.855775833129883, "learning_rate": 0.0004438639266225473, "logits/chosen": -13.406089782714844, "logits/rejected": -13.120572090148926, "logps/chosen": -2700.62939453125, "logps/rejected": -2568.704345703125, "loss": 2.8955, "rewards/accuracies": 0.5, "rewards/chosen": -153.5581817626953, "rewards/margins": 4.493217468261719, "rewards/rejected": -158.0513916015625, "step": 28890 }, { "epoch": 1.67, "grad_norm": 1.8488798048110766e-08, "learning_rate": 0.00044367042068191493, "logits/chosen": -13.138768196105957, "logits/rejected": -13.347882270812988, "logps/chosen": -2165.23046875, "logps/rejected": -2267.943603515625, "loss": 5.9488, "rewards/accuracies": 0.5, "rewards/chosen": -141.13241577148438, "rewards/margins": 20.393810272216797, "rewards/rejected": -161.52621459960938, "step": 28900 }, { "epoch": 1.67, "grad_norm": 0.0, "learning_rate": 0.00044347691474128255, "logits/chosen": -13.203877449035645, "logits/rejected": -13.090746879577637, "logps/chosen": -2744.913818359375, "logps/rejected": -2463.546142578125, "loss": 11.2004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.27174377441406, "rewards/margins": 4.42866849899292, "rewards/rejected": -137.70042419433594, "step": 28910 }, { "epoch": 1.67, "grad_norm": 77.27840423583984, "learning_rate": 0.0004432834088006502, "logits/chosen": -13.509676933288574, "logits/rejected": -14.765172004699707, "logps/chosen": -3024.78173828125, "logps/rejected": -2870.032958984375, "loss": 16.5244, "rewards/accuracies": 0.5, "rewards/chosen": -128.74783325195312, "rewards/margins": -6.996977806091309, "rewards/rejected": -121.7508544921875, "step": 28920 }, { "epoch": 1.67, "grad_norm": 0.00264383340254426, "learning_rate": 0.00044308990286001784, "logits/chosen": -15.82275390625, "logits/rejected": -15.829195022583008, "logps/chosen": -1788.1253662109375, "logps/rejected": -2157.838623046875, "loss": 7.353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.95773315429688, "rewards/margins": 16.518028259277344, "rewards/rejected": -155.4757537841797, "step": 28930 }, { "epoch": 1.68, "grad_norm": 0.19536438584327698, "learning_rate": 0.00044289639691938546, "logits/chosen": -15.030522346496582, "logits/rejected": -15.443120002746582, "logps/chosen": -2579.148193359375, "logps/rejected": -2488.568359375, "loss": 18.5373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.12423706054688, "rewards/margins": -8.530938148498535, "rewards/rejected": -150.59327697753906, "step": 28940 }, { "epoch": 1.68, "grad_norm": 66.52987670898438, "learning_rate": 0.0004427028909787531, "logits/chosen": -16.170995712280273, "logits/rejected": -16.67654800415039, "logps/chosen": -2523.51220703125, "logps/rejected": -2501.008544921875, "loss": 3.6622, "rewards/accuracies": 0.5, "rewards/chosen": -217.3779754638672, "rewards/margins": 5.509341239929199, "rewards/rejected": -222.88735961914062, "step": 28950 }, { "epoch": 1.68, "grad_norm": 38.468074798583984, "learning_rate": 0.00044250938503812064, "logits/chosen": -16.947898864746094, "logits/rejected": -17.282787322998047, "logps/chosen": -2466.57275390625, "logps/rejected": -2391.997802734375, "loss": 11.2301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.9890899658203, "rewards/margins": 1.0882446765899658, "rewards/rejected": -211.0773468017578, "step": 28960 }, { "epoch": 1.68, "grad_norm": 40.996238708496094, "learning_rate": 0.0004423158790974883, "logits/chosen": -14.786256790161133, "logits/rejected": -15.076936721801758, "logps/chosen": -2333.3125, "logps/rejected": -2368.89697265625, "loss": 5.385, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.86105346679688, "rewards/margins": -1.427808403968811, "rewards/rejected": -139.43325805664062, "step": 28970 }, { "epoch": 1.68, "grad_norm": 0.9556378722190857, "learning_rate": 0.0004421223731568559, "logits/chosen": -14.794885635375977, "logits/rejected": -15.04932689666748, "logps/chosen": -2342.219482421875, "logps/rejected": -2347.304931640625, "loss": 5.9441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.7234649658203, "rewards/margins": 2.218895673751831, "rewards/rejected": -132.94235229492188, "step": 28980 }, { "epoch": 1.68, "grad_norm": 78.59923553466797, "learning_rate": 0.00044192886721622354, "logits/chosen": -16.79229736328125, "logits/rejected": -16.22701644897461, "logps/chosen": -2859.23876953125, "logps/rejected": -2752.74072265625, "loss": 24.9844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -129.7240753173828, "rewards/margins": -16.71917152404785, "rewards/rejected": -113.0049057006836, "step": 28990 }, { "epoch": 1.68, "grad_norm": 0.0003253129543736577, "learning_rate": 0.00044173536127559116, "logits/chosen": -18.346384048461914, "logits/rejected": -17.033855438232422, "logps/chosen": -2625.491455078125, "logps/rejected": -2637.47216796875, "loss": 3.4551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.1607208251953, "rewards/margins": 5.391284465789795, "rewards/rejected": -203.55201721191406, "step": 29000 }, { "epoch": 1.68, "grad_norm": 60.546810150146484, "learning_rate": 0.0004415418553349588, "logits/chosen": -14.400197982788086, "logits/rejected": -14.33533763885498, "logps/chosen": -3264.205810546875, "logps/rejected": -2924.1328125, "loss": 7.6356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.00363159179688, "rewards/margins": -0.7948524355888367, "rewards/rejected": -179.20877075195312, "step": 29010 }, { "epoch": 1.68, "grad_norm": 58.30916213989258, "learning_rate": 0.00044134834939432645, "logits/chosen": -19.02271842956543, "logits/rejected": -21.022565841674805, "logps/chosen": -3000.746337890625, "logps/rejected": -2594.80078125, "loss": 5.5958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.34695434570312, "rewards/margins": 4.5488786697387695, "rewards/rejected": -182.89584350585938, "step": 29020 }, { "epoch": 1.68, "grad_norm": 1.143275696335877e-09, "learning_rate": 0.00044115484345369407, "logits/chosen": -15.074453353881836, "logits/rejected": -15.810476303100586, "logps/chosen": -2854.710205078125, "logps/rejected": -2956.94873046875, "loss": 5.7578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.68946838378906, "rewards/margins": 9.501195907592773, "rewards/rejected": -139.19065856933594, "step": 29030 }, { "epoch": 1.68, "grad_norm": 2.9761430102857086e-15, "learning_rate": 0.0004409613375130617, "logits/chosen": -13.41240119934082, "logits/rejected": -13.485217094421387, "logps/chosen": -2824.75244140625, "logps/rejected": -2630.014404296875, "loss": 3.5379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -129.2854766845703, "rewards/margins": 8.377659797668457, "rewards/rejected": -137.6631317138672, "step": 29040 }, { "epoch": 1.68, "grad_norm": 0.7114080190658569, "learning_rate": 0.0004407678315724293, "logits/chosen": -16.25569725036621, "logits/rejected": -17.420207977294922, "logps/chosen": -2527.02392578125, "logps/rejected": -2501.758056640625, "loss": 5.9697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.05323791503906, "rewards/margins": 3.233734130859375, "rewards/rejected": -191.28697204589844, "step": 29050 }, { "epoch": 1.68, "grad_norm": 165.63058471679688, "learning_rate": 0.00044057432563179686, "logits/chosen": -16.34756851196289, "logits/rejected": -17.169374465942383, "logps/chosen": -2625.000732421875, "logps/rejected": -2420.4345703125, "loss": 15.324, "rewards/accuracies": 0.5, "rewards/chosen": -191.78463745117188, "rewards/margins": -9.739542007446289, "rewards/rejected": -182.0451202392578, "step": 29060 }, { "epoch": 1.68, "grad_norm": 2.846137848513169e-20, "learning_rate": 0.0004403808196911645, "logits/chosen": -14.404176712036133, "logits/rejected": -14.938333511352539, "logps/chosen": -2964.6025390625, "logps/rejected": -2743.724853515625, "loss": 6.5471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.80690002441406, "rewards/margins": 7.657105445861816, "rewards/rejected": -96.46400451660156, "step": 29070 }, { "epoch": 1.68, "grad_norm": 1.3203892876845202e-06, "learning_rate": 0.00044018731375053215, "logits/chosen": -16.93549919128418, "logits/rejected": -17.38907241821289, "logps/chosen": -2531.092041015625, "logps/rejected": -2708.04296875, "loss": 4.9976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.8197784423828, "rewards/margins": 6.046004295349121, "rewards/rejected": -204.8657684326172, "step": 29080 }, { "epoch": 1.68, "grad_norm": 49.50768280029297, "learning_rate": 0.00043999380780989977, "logits/chosen": -15.144113540649414, "logits/rejected": -15.451817512512207, "logps/chosen": -2854.40771484375, "logps/rejected": -2741.74072265625, "loss": 4.3921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.52685546875, "rewards/margins": 13.440515518188477, "rewards/rejected": -193.96737670898438, "step": 29090 }, { "epoch": 1.68, "grad_norm": 5.889135650249955e-07, "learning_rate": 0.0004398003018692674, "logits/chosen": -17.28517723083496, "logits/rejected": -18.182422637939453, "logps/chosen": -2752.12939453125, "logps/rejected": -2516.80908203125, "loss": 0.8815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -140.44822692871094, "rewards/margins": 9.214821815490723, "rewards/rejected": -149.66305541992188, "step": 29100 }, { "epoch": 1.68, "grad_norm": 172.55532836914062, "learning_rate": 0.000439606795928635, "logits/chosen": -18.100574493408203, "logits/rejected": -18.668384552001953, "logps/chosen": -2284.865234375, "logps/rejected": -2353.694091796875, "loss": 24.4551, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.5661163330078, "rewards/margins": -16.287450790405273, "rewards/rejected": -175.27867126464844, "step": 29110 }, { "epoch": 1.69, "grad_norm": 1.1202867024808237e-16, "learning_rate": 0.0004394132899880026, "logits/chosen": -16.255159378051758, "logits/rejected": -15.975323677062988, "logps/chosen": -2861.470947265625, "logps/rejected": -2265.41357421875, "loss": 7.304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -110.15159606933594, "rewards/margins": 5.754995346069336, "rewards/rejected": -115.90660095214844, "step": 29120 }, { "epoch": 1.69, "grad_norm": 0.0005107178585603833, "learning_rate": 0.0004392197840473703, "logits/chosen": -15.64490032196045, "logits/rejected": -16.117895126342773, "logps/chosen": -2982.845947265625, "logps/rejected": -3065.324462890625, "loss": 6.1103, "rewards/accuracies": 0.5, "rewards/chosen": -205.69577026367188, "rewards/margins": 1.2947769165039062, "rewards/rejected": -206.99057006835938, "step": 29130 }, { "epoch": 1.69, "grad_norm": 15.937470436096191, "learning_rate": 0.0004390262781067379, "logits/chosen": -19.654001235961914, "logits/rejected": -20.948169708251953, "logps/chosen": -2287.752685546875, "logps/rejected": -2239.556640625, "loss": 6.255, "rewards/accuracies": 0.5, "rewards/chosen": -168.5733184814453, "rewards/margins": -3.02458119392395, "rewards/rejected": -165.54873657226562, "step": 29140 }, { "epoch": 1.69, "grad_norm": 7.36983711249195e-05, "learning_rate": 0.0004388327721661055, "logits/chosen": -21.086524963378906, "logits/rejected": -21.62782859802246, "logps/chosen": -2355.626708984375, "logps/rejected": -2345.77685546875, "loss": 7.1252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -212.36807250976562, "rewards/margins": -0.17497864365577698, "rewards/rejected": -212.19308471679688, "step": 29150 }, { "epoch": 1.69, "grad_norm": 14.31105899810791, "learning_rate": 0.00043863926622547314, "logits/chosen": -18.918426513671875, "logits/rejected": -17.686389923095703, "logps/chosen": -2566.17822265625, "logps/rejected": -2470.70703125, "loss": 7.1223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.5944595336914, "rewards/margins": 12.65599250793457, "rewards/rejected": -137.25045776367188, "step": 29160 }, { "epoch": 1.69, "grad_norm": 155.64479064941406, "learning_rate": 0.0004384457602848407, "logits/chosen": -17.379188537597656, "logits/rejected": -17.140066146850586, "logps/chosen": -3027.031494140625, "logps/rejected": -3128.893798828125, "loss": 13.306, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -137.0618896484375, "rewards/margins": -11.868097305297852, "rewards/rejected": -125.19380187988281, "step": 29170 }, { "epoch": 1.69, "grad_norm": 1.138738571171416e-05, "learning_rate": 0.0004382522543442084, "logits/chosen": -17.93975257873535, "logits/rejected": -17.984643936157227, "logps/chosen": -2902.76708984375, "logps/rejected": -3192.10302734375, "loss": 4.6298, "rewards/accuracies": 0.5, "rewards/chosen": -137.9897918701172, "rewards/margins": 2.2035293579101562, "rewards/rejected": -140.19332885742188, "step": 29180 }, { "epoch": 1.69, "grad_norm": 140.5467987060547, "learning_rate": 0.000438058748403576, "logits/chosen": -19.63772201538086, "logits/rejected": -20.685955047607422, "logps/chosen": -2653.587158203125, "logps/rejected": -2445.85791015625, "loss": 21.8651, "rewards/accuracies": 0.5, "rewards/chosen": -232.1505584716797, "rewards/margins": -19.36744499206543, "rewards/rejected": -212.78311157226562, "step": 29190 }, { "epoch": 1.69, "grad_norm": 115.41877746582031, "learning_rate": 0.0004378652424629436, "logits/chosen": -16.425321578979492, "logits/rejected": -16.522823333740234, "logps/chosen": -3270.240234375, "logps/rejected": -2980.243408203125, "loss": 10.519, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -205.5387725830078, "rewards/margins": -4.576659202575684, "rewards/rejected": -200.9621124267578, "step": 29200 }, { "epoch": 1.69, "grad_norm": 1.5963680198183283e-05, "learning_rate": 0.00043767173652231123, "logits/chosen": -20.200557708740234, "logits/rejected": -22.36166763305664, "logps/chosen": -2786.98876953125, "logps/rejected": -2686.04248046875, "loss": 0.7271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -201.83889770507812, "rewards/margins": 10.231425285339355, "rewards/rejected": -212.0703125, "step": 29210 }, { "epoch": 1.69, "grad_norm": 4.24069046545128e-09, "learning_rate": 0.00043747823058167885, "logits/chosen": -19.400279998779297, "logits/rejected": -21.0640811920166, "logps/chosen": -2916.7021484375, "logps/rejected": -2932.36474609375, "loss": 2.3881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -256.75811767578125, "rewards/margins": 4.380332946777344, "rewards/rejected": -261.13848876953125, "step": 29220 }, { "epoch": 1.69, "grad_norm": 16.355134963989258, "learning_rate": 0.0004372847246410465, "logits/chosen": -19.83034324645996, "logits/rejected": -22.49814224243164, "logps/chosen": -3165.70166015625, "logps/rejected": -3106.440185546875, "loss": 6.0732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.26589965820312, "rewards/margins": -1.0522186756134033, "rewards/rejected": -204.2136688232422, "step": 29230 }, { "epoch": 1.69, "grad_norm": 4.2969766633760287e-10, "learning_rate": 0.00043709121870041414, "logits/chosen": -16.550827026367188, "logits/rejected": -17.4414119720459, "logps/chosen": -2792.109130859375, "logps/rejected": -2635.952392578125, "loss": 4.4379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.6799774169922, "rewards/margins": 10.873565673828125, "rewards/rejected": -184.55355834960938, "step": 29240 }, { "epoch": 1.69, "grad_norm": 120.50589752197266, "learning_rate": 0.00043689771275978175, "logits/chosen": -20.083667755126953, "logits/rejected": -18.612903594970703, "logps/chosen": -2751.97265625, "logps/rejected": -2787.323974609375, "loss": 14.6188, "rewards/accuracies": 0.5, "rewards/chosen": -213.5507049560547, "rewards/margins": 1.589359998703003, "rewards/rejected": -215.1400909423828, "step": 29250 }, { "epoch": 1.69, "grad_norm": 82.43992614746094, "learning_rate": 0.00043670420681914937, "logits/chosen": -18.328081130981445, "logits/rejected": -18.901273727416992, "logps/chosen": -2650.4814453125, "logps/rejected": -2726.3984375, "loss": 7.2199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.61752319335938, "rewards/margins": 16.183446884155273, "rewards/rejected": -175.8009490966797, "step": 29260 }, { "epoch": 1.69, "grad_norm": 4.990346496924758e-05, "learning_rate": 0.000436510700878517, "logits/chosen": -18.79376220703125, "logits/rejected": -19.62682342529297, "logps/chosen": -2572.91650390625, "logps/rejected": -2377.38916015625, "loss": 14.9447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.7478790283203, "rewards/margins": -11.2348051071167, "rewards/rejected": -146.51307678222656, "step": 29270 }, { "epoch": 1.69, "grad_norm": 52.0565071105957, "learning_rate": 0.00043631719493788455, "logits/chosen": -17.64309310913086, "logits/rejected": -17.101709365844727, "logps/chosen": -2650.29736328125, "logps/rejected": -2484.10205078125, "loss": 6.5289, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -185.73204040527344, "rewards/margins": -0.13905735313892365, "rewards/rejected": -185.59300231933594, "step": 29280 }, { "epoch": 1.7, "grad_norm": 23.152904510498047, "learning_rate": 0.0004361236889972522, "logits/chosen": -18.166088104248047, "logits/rejected": -19.04949951171875, "logps/chosen": -2232.03369140625, "logps/rejected": -1880.15625, "loss": 36.1164, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -139.20223999023438, "rewards/margins": -19.98366928100586, "rewards/rejected": -119.21858215332031, "step": 29290 }, { "epoch": 1.7, "grad_norm": 4.3238474063400645e-06, "learning_rate": 0.00043593018305661984, "logits/chosen": -17.484617233276367, "logits/rejected": -19.25492286682129, "logps/chosen": -2930.39013671875, "logps/rejected": -2699.161376953125, "loss": 5.6747, "rewards/accuracies": 0.5, "rewards/chosen": -172.8714141845703, "rewards/margins": 2.618881940841675, "rewards/rejected": -175.49029541015625, "step": 29300 }, { "epoch": 1.7, "grad_norm": 5.27013311869785e-15, "learning_rate": 0.00043573667711598746, "logits/chosen": -15.219711303710938, "logits/rejected": -15.14434814453125, "logps/chosen": -2986.331787109375, "logps/rejected": -3029.269775390625, "loss": 1.355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.6922607421875, "rewards/margins": 12.010457992553711, "rewards/rejected": -133.70272827148438, "step": 29310 }, { "epoch": 1.7, "grad_norm": 0.0, "learning_rate": 0.0004355431711753551, "logits/chosen": -20.565210342407227, "logits/rejected": -24.205541610717773, "logps/chosen": -2535.09765625, "logps/rejected": -2300.4951171875, "loss": 19.0603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.01834106445312, "rewards/margins": -9.546974182128906, "rewards/rejected": -190.47134399414062, "step": 29320 }, { "epoch": 1.7, "grad_norm": 79.63816833496094, "learning_rate": 0.0004353496652347227, "logits/chosen": -17.555042266845703, "logits/rejected": -17.49709129333496, "logps/chosen": -2444.698486328125, "logps/rejected": -2470.651123046875, "loss": 13.4295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -202.4774169921875, "rewards/margins": 5.294731140136719, "rewards/rejected": -207.7721405029297, "step": 29330 }, { "epoch": 1.7, "grad_norm": 0.00016582111129537225, "learning_rate": 0.00043515615929409036, "logits/chosen": -19.545047760009766, "logits/rejected": -20.355331420898438, "logps/chosen": -2641.14453125, "logps/rejected": -2771.442138671875, "loss": 1.362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.3415985107422, "rewards/margins": 10.58032512664795, "rewards/rejected": -191.9219207763672, "step": 29340 }, { "epoch": 1.7, "grad_norm": 87.57831573486328, "learning_rate": 0.000434962653353458, "logits/chosen": -17.388412475585938, "logits/rejected": -19.587984085083008, "logps/chosen": -2639.86767578125, "logps/rejected": -2255.44482421875, "loss": 10.6159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.061279296875, "rewards/margins": 0.8697589635848999, "rewards/rejected": -160.93101501464844, "step": 29350 }, { "epoch": 1.7, "grad_norm": 93.81424713134766, "learning_rate": 0.0004347691474128256, "logits/chosen": -18.073211669921875, "logits/rejected": -17.927146911621094, "logps/chosen": -2576.53271484375, "logps/rejected": -2647.080810546875, "loss": 9.8169, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -142.6478729248047, "rewards/margins": -7.523750305175781, "rewards/rejected": -135.12411499023438, "step": 29360 }, { "epoch": 1.7, "grad_norm": 3.239323531634142e-10, "learning_rate": 0.0004345756414721932, "logits/chosen": -20.439861297607422, "logits/rejected": -20.15654945373535, "logps/chosen": -3012.83154296875, "logps/rejected": -3099.96044921875, "loss": 1.4808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -150.45919799804688, "rewards/margins": 20.66305160522461, "rewards/rejected": -171.12228393554688, "step": 29370 }, { "epoch": 1.7, "grad_norm": 2.1707453470298788e-06, "learning_rate": 0.00043438213553156083, "logits/chosen": -17.63571548461914, "logits/rejected": -17.883197784423828, "logps/chosen": -3029.17822265625, "logps/rejected": -2713.41162109375, "loss": 5.9734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.42106628417969, "rewards/margins": 9.107797622680664, "rewards/rejected": -136.52886962890625, "step": 29380 }, { "epoch": 1.7, "grad_norm": 0.14140182733535767, "learning_rate": 0.00043418862959092845, "logits/chosen": -19.314231872558594, "logits/rejected": -19.021282196044922, "logps/chosen": -2599.6923828125, "logps/rejected": -2490.054443359375, "loss": 10.1718, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -188.3145294189453, "rewards/margins": -5.9333624839782715, "rewards/rejected": -182.38116455078125, "step": 29390 }, { "epoch": 1.7, "grad_norm": 2.1080636978149414, "learning_rate": 0.00043399512365029606, "logits/chosen": -17.384716033935547, "logits/rejected": -17.178377151489258, "logps/chosen": -2989.958984375, "logps/rejected": -2628.9208984375, "loss": 15.7076, "rewards/accuracies": 0.5, "rewards/chosen": -159.3492889404297, "rewards/margins": -9.224649429321289, "rewards/rejected": -150.1246337890625, "step": 29400 }, { "epoch": 1.7, "grad_norm": 0.18108470737934113, "learning_rate": 0.0004338016177096637, "logits/chosen": -18.454395294189453, "logits/rejected": -18.790651321411133, "logps/chosen": -2399.32275390625, "logps/rejected": -2605.265380859375, "loss": 7.0022, "rewards/accuracies": 0.5, "rewards/chosen": -175.23953247070312, "rewards/margins": 7.451027870178223, "rewards/rejected": -182.6905517578125, "step": 29410 }, { "epoch": 1.7, "grad_norm": 87.08147430419922, "learning_rate": 0.0004336081117690313, "logits/chosen": -15.544652938842773, "logits/rejected": -15.448951721191406, "logps/chosen": -2701.346923828125, "logps/rejected": -2674.622802734375, "loss": 10.2542, "rewards/accuracies": 0.5, "rewards/chosen": -130.39584350585938, "rewards/margins": -5.868574619293213, "rewards/rejected": -124.52729797363281, "step": 29420 }, { "epoch": 1.7, "grad_norm": 66.07235717773438, "learning_rate": 0.0004334146058283989, "logits/chosen": -17.167905807495117, "logits/rejected": -17.244216918945312, "logps/chosen": -3081.640380859375, "logps/rejected": -2916.34130859375, "loss": 16.3831, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -231.18453979492188, "rewards/margins": -11.671072006225586, "rewards/rejected": -219.5134735107422, "step": 29430 }, { "epoch": 1.7, "grad_norm": 0.05913485214114189, "learning_rate": 0.0004332210998877666, "logits/chosen": -15.676976203918457, "logits/rejected": -15.628461837768555, "logps/chosen": -2831.28271484375, "logps/rejected": -2396.93994140625, "loss": 2.0485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -115.59212493896484, "rewards/margins": 13.148971557617188, "rewards/rejected": -128.74110412597656, "step": 29440 }, { "epoch": 1.7, "grad_norm": 79.03762817382812, "learning_rate": 0.0004330275939471342, "logits/chosen": -17.985023498535156, "logits/rejected": -17.93486213684082, "logps/chosen": -2940.65283203125, "logps/rejected": -2580.464111328125, "loss": 3.7737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.81602478027344, "rewards/margins": 4.9428510665893555, "rewards/rejected": -157.75888061523438, "step": 29450 }, { "epoch": 1.71, "grad_norm": 8.126968785505712e-17, "learning_rate": 0.0004328340880065018, "logits/chosen": -14.966588973999023, "logits/rejected": -15.141268730163574, "logps/chosen": -3196.598876953125, "logps/rejected": -2821.16796875, "loss": 7.3049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.26380920410156, "rewards/margins": 10.15546989440918, "rewards/rejected": -178.41928100585938, "step": 29460 }, { "epoch": 1.71, "grad_norm": 0.7361806035041809, "learning_rate": 0.00043264058206586944, "logits/chosen": -17.309478759765625, "logits/rejected": -16.844440460205078, "logps/chosen": -3135.787841796875, "logps/rejected": -3028.79443359375, "loss": 7.5718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.5911407470703, "rewards/margins": -0.01922759972512722, "rewards/rejected": -161.5718994140625, "step": 29470 }, { "epoch": 1.71, "grad_norm": 16.556949615478516, "learning_rate": 0.00043244707612523706, "logits/chosen": -19.294034957885742, "logits/rejected": -19.254106521606445, "logps/chosen": -2940.318603515625, "logps/rejected": -2753.10302734375, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": -190.0826416015625, "rewards/margins": 10.379247665405273, "rewards/rejected": -200.4619140625, "step": 29480 }, { "epoch": 1.71, "grad_norm": 79.35773468017578, "learning_rate": 0.0004322535701846047, "logits/chosen": -21.51319122314453, "logits/rejected": -21.538414001464844, "logps/chosen": -2734.33154296875, "logps/rejected": -2739.55908203125, "loss": 1.7581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.47964477539062, "rewards/margins": 7.606942176818848, "rewards/rejected": -176.08657836914062, "step": 29490 }, { "epoch": 1.71, "grad_norm": 0.0009971453109756112, "learning_rate": 0.0004320600642439723, "logits/chosen": -16.042037963867188, "logits/rejected": -16.279457092285156, "logps/chosen": -3028.793701171875, "logps/rejected": -3081.611572265625, "loss": 4.3217, "rewards/accuracies": 0.5, "rewards/chosen": -133.60694885253906, "rewards/margins": 6.106722831726074, "rewards/rejected": -139.7136688232422, "step": 29500 }, { "epoch": 1.71, "grad_norm": 0.002291029319167137, "learning_rate": 0.0004318665583033399, "logits/chosen": -19.722949981689453, "logits/rejected": -19.90110206604004, "logps/chosen": -2788.684814453125, "logps/rejected": -2775.77490234375, "loss": 2.4559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -202.221923828125, "rewards/margins": 8.910235404968262, "rewards/rejected": -211.1321563720703, "step": 29510 }, { "epoch": 1.71, "grad_norm": 0.5473893880844116, "learning_rate": 0.0004316730523627075, "logits/chosen": -17.89992904663086, "logits/rejected": -18.009700775146484, "logps/chosen": -2807.723876953125, "logps/rejected": -2765.87744140625, "loss": 6.9318, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -247.4668731689453, "rewards/margins": -0.3436233401298523, "rewards/rejected": -247.1232147216797, "step": 29520 }, { "epoch": 1.71, "grad_norm": 0.0, "learning_rate": 0.00043147954642207514, "logits/chosen": -17.60830307006836, "logits/rejected": -17.372608184814453, "logps/chosen": -2888.48486328125, "logps/rejected": -2704.86962890625, "loss": 4.661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.33810424804688, "rewards/margins": 11.754961967468262, "rewards/rejected": -141.09304809570312, "step": 29530 }, { "epoch": 1.71, "grad_norm": 165.71849060058594, "learning_rate": 0.00043128604048144276, "logits/chosen": -17.639751434326172, "logits/rejected": -17.4445858001709, "logps/chosen": -2613.205078125, "logps/rejected": -2743.363037109375, "loss": 12.8168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -166.76235961914062, "rewards/margins": -2.2840664386749268, "rewards/rejected": -164.47830200195312, "step": 29540 }, { "epoch": 1.71, "grad_norm": 0.005224208347499371, "learning_rate": 0.00043109253454081043, "logits/chosen": -19.33913230895996, "logits/rejected": -22.285676956176758, "logps/chosen": -2848.03515625, "logps/rejected": -2991.10107421875, "loss": 3.0723, "rewards/accuracies": 0.5, "rewards/chosen": -180.76719665527344, "rewards/margins": 12.459638595581055, "rewards/rejected": -193.22683715820312, "step": 29550 }, { "epoch": 1.71, "grad_norm": 84.92870330810547, "learning_rate": 0.00043089902860017805, "logits/chosen": -14.712504386901855, "logits/rejected": -16.667556762695312, "logps/chosen": -3084.202392578125, "logps/rejected": -2999.701904296875, "loss": 11.185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -124.61174011230469, "rewards/margins": -3.4407143592834473, "rewards/rejected": -121.17103576660156, "step": 29560 }, { "epoch": 1.71, "grad_norm": 31.783153533935547, "learning_rate": 0.00043070552265954567, "logits/chosen": -12.756414413452148, "logits/rejected": -12.794305801391602, "logps/chosen": -2375.76171875, "logps/rejected": -2432.30322265625, "loss": 33.346, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -79.6422348022461, "rewards/margins": -22.3776912689209, "rewards/rejected": -57.2645378112793, "step": 29570 }, { "epoch": 1.71, "grad_norm": 76.78700256347656, "learning_rate": 0.0004305120167189133, "logits/chosen": -14.113720893859863, "logits/rejected": -14.075006484985352, "logps/chosen": -3184.1328125, "logps/rejected": -3118.73486328125, "loss": 7.5579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.92219543457031, "rewards/margins": 4.867390155792236, "rewards/rejected": -131.78958129882812, "step": 29580 }, { "epoch": 1.71, "grad_norm": 5.361738431020058e-08, "learning_rate": 0.0004303185107782809, "logits/chosen": -18.377294540405273, "logits/rejected": -18.571514129638672, "logps/chosen": -2853.75, "logps/rejected": -2758.197265625, "loss": 8.0927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.13014221191406, "rewards/margins": 4.344775199890137, "rewards/rejected": -158.47491455078125, "step": 29590 }, { "epoch": 1.71, "grad_norm": 50.106101989746094, "learning_rate": 0.00043012500483764857, "logits/chosen": -18.96265411376953, "logits/rejected": -18.818458557128906, "logps/chosen": -2308.486083984375, "logps/rejected": -2057.90869140625, "loss": 2.6946, "rewards/accuracies": 0.5, "rewards/chosen": -173.01992797851562, "rewards/margins": 6.686028480529785, "rewards/rejected": -179.70594787597656, "step": 29600 }, { "epoch": 1.71, "grad_norm": 33.67255401611328, "learning_rate": 0.00042993149889701613, "logits/chosen": -16.069719314575195, "logits/rejected": -16.037395477294922, "logps/chosen": -2493.983154296875, "logps/rejected": -2466.58447265625, "loss": 4.9666, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -212.0922088623047, "rewards/margins": 1.8509247303009033, "rewards/rejected": -213.9431610107422, "step": 29610 }, { "epoch": 1.71, "grad_norm": 3.735057835935196e-12, "learning_rate": 0.00042973799295638375, "logits/chosen": -14.384989738464355, "logits/rejected": -14.295492172241211, "logps/chosen": -2786.398193359375, "logps/rejected": -3024.22119140625, "loss": 6.4738, "rewards/accuracies": 0.5, "rewards/chosen": -147.23367309570312, "rewards/margins": 4.991313457489014, "rewards/rejected": -152.2249755859375, "step": 29620 }, { "epoch": 1.72, "grad_norm": 32.29466247558594, "learning_rate": 0.00042954448701575137, "logits/chosen": -15.78071403503418, "logits/rejected": -15.081487655639648, "logps/chosen": -2731.223876953125, "logps/rejected": -2591.3916015625, "loss": 1.3258, "rewards/accuracies": 0.5, "rewards/chosen": -196.33053588867188, "rewards/margins": 2.9323127269744873, "rewards/rejected": -199.26284790039062, "step": 29630 }, { "epoch": 1.72, "grad_norm": 13.966795921325684, "learning_rate": 0.000429350981075119, "logits/chosen": -15.529080390930176, "logits/rejected": -15.57183837890625, "logps/chosen": -2543.87646484375, "logps/rejected": -2683.369140625, "loss": 6.7089, "rewards/accuracies": 0.5, "rewards/chosen": -160.6672821044922, "rewards/margins": -1.4714972972869873, "rewards/rejected": -159.19578552246094, "step": 29640 }, { "epoch": 1.72, "grad_norm": 1.6296005885853282e-14, "learning_rate": 0.00042915747513448666, "logits/chosen": -12.919517517089844, "logits/rejected": -12.519207000732422, "logps/chosen": -3004.03271484375, "logps/rejected": -2896.97509765625, "loss": 2.6531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -83.79914855957031, "rewards/margins": 18.86696434020996, "rewards/rejected": -102.6661148071289, "step": 29650 }, { "epoch": 1.72, "grad_norm": 6.961637077354185e-14, "learning_rate": 0.0004289639691938543, "logits/chosen": -14.011774063110352, "logits/rejected": -13.718046188354492, "logps/chosen": -2971.61279296875, "logps/rejected": -3272.52978515625, "loss": 4.4742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.44906616210938, "rewards/margins": 14.440240859985352, "rewards/rejected": -152.88931274414062, "step": 29660 }, { "epoch": 1.72, "grad_norm": 108.33418273925781, "learning_rate": 0.0004287704632532219, "logits/chosen": -14.872637748718262, "logits/rejected": -15.371731758117676, "logps/chosen": -3079.3857421875, "logps/rejected": -2903.760009765625, "loss": 18.3392, "rewards/accuracies": 0.5, "rewards/chosen": -160.53863525390625, "rewards/margins": -6.353938102722168, "rewards/rejected": -154.18470764160156, "step": 29670 }, { "epoch": 1.72, "grad_norm": 2.722137658577788e-19, "learning_rate": 0.0004285769573125895, "logits/chosen": -18.893444061279297, "logits/rejected": -19.881189346313477, "logps/chosen": -2603.75439453125, "logps/rejected": -2761.325927734375, "loss": 7.5246, "rewards/accuracies": 0.5, "rewards/chosen": -235.73623657226562, "rewards/margins": 6.257936000823975, "rewards/rejected": -241.99417114257812, "step": 29680 }, { "epoch": 1.72, "grad_norm": 7.107912551873596e-07, "learning_rate": 0.0004283834513719571, "logits/chosen": -15.395840644836426, "logits/rejected": -15.609827995300293, "logps/chosen": -3824.024169921875, "logps/rejected": -3515.93115234375, "loss": 21.4562, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -228.47103881835938, "rewards/margins": -18.886709213256836, "rewards/rejected": -209.58432006835938, "step": 29690 }, { "epoch": 1.72, "grad_norm": 4.2999656027671584e-13, "learning_rate": 0.00042818994543132474, "logits/chosen": -17.601606369018555, "logits/rejected": -18.129497528076172, "logps/chosen": -2878.2568359375, "logps/rejected": -2986.7587890625, "loss": 8.6248, "rewards/accuracies": 0.5, "rewards/chosen": -155.6350860595703, "rewards/margins": -3.3769924640655518, "rewards/rejected": -152.2581024169922, "step": 29700 }, { "epoch": 1.72, "grad_norm": 85.84759521484375, "learning_rate": 0.0004279964394906924, "logits/chosen": -15.347326278686523, "logits/rejected": -15.190399169921875, "logps/chosen": -2932.125732421875, "logps/rejected": -2920.680908203125, "loss": 11.2892, "rewards/accuracies": 0.5, "rewards/chosen": -159.73040771484375, "rewards/margins": -5.942091941833496, "rewards/rejected": -153.78829956054688, "step": 29710 }, { "epoch": 1.72, "grad_norm": 46.498191833496094, "learning_rate": 0.00042780293355006, "logits/chosen": -19.65108871459961, "logits/rejected": -19.1384334564209, "logps/chosen": -2546.99560546875, "logps/rejected": -2537.758056640625, "loss": 18.4814, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.14987182617188, "rewards/margins": -14.870887756347656, "rewards/rejected": -174.2789764404297, "step": 29720 }, { "epoch": 1.72, "grad_norm": 8.213976343540708e-07, "learning_rate": 0.0004276094276094276, "logits/chosen": -16.02899742126465, "logits/rejected": -15.922319412231445, "logps/chosen": -3041.38330078125, "logps/rejected": -2763.738037109375, "loss": 6.3979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -196.45913696289062, "rewards/margins": -0.7422893643379211, "rewards/rejected": -195.7168426513672, "step": 29730 }, { "epoch": 1.72, "grad_norm": 10.972434997558594, "learning_rate": 0.0004274159216687952, "logits/chosen": -17.990713119506836, "logits/rejected": -18.15608024597168, "logps/chosen": -2794.064453125, "logps/rejected": -2700.273681640625, "loss": 15.3581, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.23550415039062, "rewards/margins": -11.1867036819458, "rewards/rejected": -163.04876708984375, "step": 29740 }, { "epoch": 1.72, "grad_norm": 40.40620040893555, "learning_rate": 0.00042722241572816283, "logits/chosen": -15.478643417358398, "logits/rejected": -15.423315048217773, "logps/chosen": -3373.507080078125, "logps/rejected": -2625.10693359375, "loss": 18.775, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -126.55220031738281, "rewards/margins": -16.67739486694336, "rewards/rejected": -109.87479400634766, "step": 29750 }, { "epoch": 1.72, "grad_norm": 4.0177122045959024e-11, "learning_rate": 0.0004270289097875305, "logits/chosen": -19.724931716918945, "logits/rejected": -19.818798065185547, "logps/chosen": -2429.41064453125, "logps/rejected": -2311.09033203125, "loss": 13.7339, "rewards/accuracies": 0.5, "rewards/chosen": -192.5171661376953, "rewards/margins": -3.521014451980591, "rewards/rejected": -188.9961700439453, "step": 29760 }, { "epoch": 1.72, "grad_norm": 156.7999267578125, "learning_rate": 0.0004268354038468981, "logits/chosen": -19.087825775146484, "logits/rejected": -19.316923141479492, "logps/chosen": -2461.733642578125, "logps/rejected": -2432.493896484375, "loss": 0.9962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.64523315429688, "rewards/margins": 6.804808139801025, "rewards/rejected": -219.45004272460938, "step": 29770 }, { "epoch": 1.72, "grad_norm": 0.0, "learning_rate": 0.00042664189790626573, "logits/chosen": -18.99354362487793, "logits/rejected": -19.20309829711914, "logps/chosen": -2462.383544921875, "logps/rejected": -2081.993408203125, "loss": 12.2657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.3716278076172, "rewards/margins": 16.450557708740234, "rewards/rejected": -153.8221893310547, "step": 29780 }, { "epoch": 1.72, "grad_norm": 3.970094253347156e-10, "learning_rate": 0.00042644839196563335, "logits/chosen": -19.668415069580078, "logits/rejected": -19.535778045654297, "logps/chosen": -2465.036376953125, "logps/rejected": -2422.0634765625, "loss": 3.8744, "rewards/accuracies": 0.5, "rewards/chosen": -177.57907104492188, "rewards/margins": 2.467684507369995, "rewards/rejected": -180.0467529296875, "step": 29790 }, { "epoch": 1.72, "grad_norm": 103.94325256347656, "learning_rate": 0.00042625488602500097, "logits/chosen": -15.682594299316406, "logits/rejected": -15.468591690063477, "logps/chosen": -2349.095458984375, "logps/rejected": -2027.952392578125, "loss": 9.0285, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -92.74219512939453, "rewards/margins": -5.505009651184082, "rewards/rejected": -87.2371826171875, "step": 29800 }, { "epoch": 1.73, "grad_norm": 108.9102783203125, "learning_rate": 0.00042606138008436864, "logits/chosen": -16.498807907104492, "logits/rejected": -16.242422103881836, "logps/chosen": -2583.826904296875, "logps/rejected": -2379.606201171875, "loss": 14.4571, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -121.65461730957031, "rewards/margins": -11.061939239501953, "rewards/rejected": -110.59269714355469, "step": 29810 }, { "epoch": 1.73, "grad_norm": 52.75043869018555, "learning_rate": 0.00042586787414373626, "logits/chosen": -16.622705459594727, "logits/rejected": -17.577903747558594, "logps/chosen": -3044.57177734375, "logps/rejected": -2816.05029296875, "loss": 0.7167, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.77328491210938, "rewards/margins": 16.502674102783203, "rewards/rejected": -171.27597045898438, "step": 29820 }, { "epoch": 1.73, "grad_norm": 1.2330850784891823e-09, "learning_rate": 0.0004256743682031038, "logits/chosen": -14.84803581237793, "logits/rejected": -15.21168327331543, "logps/chosen": -3013.55908203125, "logps/rejected": -2902.38330078125, "loss": 6.2757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -114.41026306152344, "rewards/margins": 8.857661247253418, "rewards/rejected": -123.26790618896484, "step": 29830 }, { "epoch": 1.73, "grad_norm": 97.91629028320312, "learning_rate": 0.00042548086226247144, "logits/chosen": -14.775431632995605, "logits/rejected": -14.732966423034668, "logps/chosen": -2700.3466796875, "logps/rejected": -2445.03076171875, "loss": 4.5225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.48653411865234, "rewards/margins": 8.70697021484375, "rewards/rejected": -112.1935043334961, "step": 29840 }, { "epoch": 1.73, "grad_norm": 0.0, "learning_rate": 0.00042528735632183906, "logits/chosen": -15.720754623413086, "logits/rejected": -15.416887283325195, "logps/chosen": -2701.02099609375, "logps/rejected": -2600.702392578125, "loss": 12.7766, "rewards/accuracies": 0.5, "rewards/chosen": -128.76626586914062, "rewards/margins": -2.8192317485809326, "rewards/rejected": -125.9470443725586, "step": 29850 }, { "epoch": 1.73, "grad_norm": 0.0, "learning_rate": 0.0004250938503812067, "logits/chosen": -17.199508666992188, "logits/rejected": -17.527359008789062, "logps/chosen": -2852.399658203125, "logps/rejected": -2869.845458984375, "loss": 3.6628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -243.05105590820312, "rewards/margins": 12.992756843566895, "rewards/rejected": -256.0437927246094, "step": 29860 }, { "epoch": 1.73, "grad_norm": 84.73897552490234, "learning_rate": 0.00042490034444057434, "logits/chosen": -14.541862487792969, "logits/rejected": -14.375575065612793, "logps/chosen": -2789.618408203125, "logps/rejected": -2784.09765625, "loss": 7.9184, "rewards/accuracies": 0.5, "rewards/chosen": -83.26651000976562, "rewards/margins": 0.48645132780075073, "rewards/rejected": -83.7529525756836, "step": 29870 }, { "epoch": 1.73, "grad_norm": 6.425934314727783, "learning_rate": 0.00042470683849994196, "logits/chosen": -16.797447204589844, "logits/rejected": -16.793901443481445, "logps/chosen": -2634.330078125, "logps/rejected": -2528.8291015625, "loss": 16.5077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -162.1026153564453, "rewards/margins": -9.609735488891602, "rewards/rejected": -152.49288940429688, "step": 29880 }, { "epoch": 1.73, "grad_norm": 106.7979965209961, "learning_rate": 0.0004245133325593096, "logits/chosen": -18.009967803955078, "logits/rejected": -17.907869338989258, "logps/chosen": -2754.046630859375, "logps/rejected": -2580.642333984375, "loss": 8.4014, "rewards/accuracies": 0.5, "rewards/chosen": -161.12185668945312, "rewards/margins": -3.1461281776428223, "rewards/rejected": -157.97572326660156, "step": 29890 }, { "epoch": 1.73, "grad_norm": 120.56829833984375, "learning_rate": 0.0004243198266186772, "logits/chosen": -16.930133819580078, "logits/rejected": -16.753442764282227, "logps/chosen": -2830.469970703125, "logps/rejected": -2566.204345703125, "loss": 2.7974, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.63427734375, "rewards/margins": 2.705038547515869, "rewards/rejected": -173.3393096923828, "step": 29900 }, { "epoch": 1.73, "grad_norm": 29.193716049194336, "learning_rate": 0.00042412632067804487, "logits/chosen": -15.732516288757324, "logits/rejected": -15.95195484161377, "logps/chosen": -2985.333251953125, "logps/rejected": -2787.26318359375, "loss": 19.4206, "rewards/accuracies": 0.5, "rewards/chosen": -101.9979019165039, "rewards/margins": -13.59423828125, "rewards/rejected": -88.40365600585938, "step": 29910 }, { "epoch": 1.73, "grad_norm": 1.531086802482605, "learning_rate": 0.0004239328147374125, "logits/chosen": -18.528255462646484, "logits/rejected": -18.126453399658203, "logps/chosen": -3048.86865234375, "logps/rejected": -3023.68017578125, "loss": 6.0996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -162.96328735351562, "rewards/margins": -1.9639238119125366, "rewards/rejected": -160.99937438964844, "step": 29920 }, { "epoch": 1.73, "grad_norm": 6.562660217285156, "learning_rate": 0.0004237393087967801, "logits/chosen": -13.708547592163086, "logits/rejected": -13.724800109863281, "logps/chosen": -3147.08837890625, "logps/rejected": -2811.79833984375, "loss": 0.3759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -53.454803466796875, "rewards/margins": 17.584665298461914, "rewards/rejected": -71.03947448730469, "step": 29930 }, { "epoch": 1.73, "grad_norm": 0.040617216378450394, "learning_rate": 0.00042354580285614766, "logits/chosen": -17.0540828704834, "logits/rejected": -17.095773696899414, "logps/chosen": -2854.31396484375, "logps/rejected": -2718.84765625, "loss": 3.1268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.009765625, "rewards/margins": 10.924959182739258, "rewards/rejected": -179.93472290039062, "step": 29940 }, { "epoch": 1.73, "grad_norm": 6.517838357922301e-08, "learning_rate": 0.0004233522969155153, "logits/chosen": -14.813821792602539, "logits/rejected": -14.778470039367676, "logps/chosen": -3003.825927734375, "logps/rejected": -2826.754150390625, "loss": 5.091, "rewards/accuracies": 0.5, "rewards/chosen": -105.9144287109375, "rewards/margins": 2.2954890727996826, "rewards/rejected": -108.20991516113281, "step": 29950 }, { "epoch": 1.73, "grad_norm": 126.39244079589844, "learning_rate": 0.0004231587909748829, "logits/chosen": -19.07247543334961, "logits/rejected": -18.83395767211914, "logps/chosen": -2852.451171875, "logps/rejected": -2765.296142578125, "loss": 2.1356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.294921875, "rewards/margins": 17.953317642211914, "rewards/rejected": -202.2482452392578, "step": 29960 }, { "epoch": 1.73, "grad_norm": 1.959437646859019e-11, "learning_rate": 0.00042296528503425057, "logits/chosen": -19.534387588500977, "logits/rejected": -19.789583206176758, "logps/chosen": -2690.41064453125, "logps/rejected": -2535.270751953125, "loss": 13.2083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.87469482421875, "rewards/margins": -7.589009761810303, "rewards/rejected": -169.28567504882812, "step": 29970 }, { "epoch": 1.74, "grad_norm": 0.434876948595047, "learning_rate": 0.0004227717790936182, "logits/chosen": -19.506505966186523, "logits/rejected": -19.306949615478516, "logps/chosen": -2529.93212890625, "logps/rejected": -2608.630126953125, "loss": 0.6407, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -178.69259643554688, "rewards/margins": 12.260152816772461, "rewards/rejected": -190.95272827148438, "step": 29980 }, { "epoch": 1.74, "grad_norm": 80.64807891845703, "learning_rate": 0.0004225782731529858, "logits/chosen": -18.331256866455078, "logits/rejected": -17.946063995361328, "logps/chosen": -3043.416748046875, "logps/rejected": -2534.99853515625, "loss": 6.8735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.92855834960938, "rewards/margins": 7.7922773361206055, "rewards/rejected": -162.72084045410156, "step": 29990 }, { "epoch": 1.74, "grad_norm": 1.2329766832408495e-05, "learning_rate": 0.0004223847672123534, "logits/chosen": -20.294452667236328, "logits/rejected": -22.531787872314453, "logps/chosen": -2827.708251953125, "logps/rejected": -2713.80615234375, "loss": 3.0313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.5365447998047, "rewards/margins": 5.421609401702881, "rewards/rejected": -210.95816040039062, "step": 30000 }, { "epoch": 1.74, "grad_norm": 1.6510602781536012e-17, "learning_rate": 0.00042219126127172104, "logits/chosen": -17.784147262573242, "logits/rejected": -17.842098236083984, "logps/chosen": -2741.857666015625, "logps/rejected": -2770.91455078125, "loss": 2.5026, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -157.42019653320312, "rewards/margins": 14.919614791870117, "rewards/rejected": -172.33981323242188, "step": 30010 }, { "epoch": 1.74, "grad_norm": 37.51365661621094, "learning_rate": 0.0004219977553310887, "logits/chosen": -15.073602676391602, "logits/rejected": -14.96882152557373, "logps/chosen": -2722.128662109375, "logps/rejected": -2567.791015625, "loss": 2.1348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.29859924316406, "rewards/margins": 7.015763759613037, "rewards/rejected": -98.31436920166016, "step": 30020 }, { "epoch": 1.74, "grad_norm": 6.945999331264829e-10, "learning_rate": 0.0004218042493904563, "logits/chosen": -17.666881561279297, "logits/rejected": -17.906726837158203, "logps/chosen": -2536.126220703125, "logps/rejected": -2532.62060546875, "loss": 2.9757, "rewards/accuracies": 0.5, "rewards/chosen": -215.1952362060547, "rewards/margins": 7.849737644195557, "rewards/rejected": -223.0449676513672, "step": 30030 }, { "epoch": 1.74, "grad_norm": 65.13168334960938, "learning_rate": 0.00042161074344982394, "logits/chosen": -14.732049942016602, "logits/rejected": -14.7639799118042, "logps/chosen": -2534.03857421875, "logps/rejected": -2568.50390625, "loss": 6.6738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -98.20173645019531, "rewards/margins": -2.6886844635009766, "rewards/rejected": -95.51304626464844, "step": 30040 }, { "epoch": 1.74, "grad_norm": 74.8260269165039, "learning_rate": 0.0004214172375091915, "logits/chosen": -15.825457572937012, "logits/rejected": -16.09609603881836, "logps/chosen": -2570.502685546875, "logps/rejected": -2227.01171875, "loss": 13.6179, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -188.65504455566406, "rewards/margins": -7.137826442718506, "rewards/rejected": -181.5172119140625, "step": 30050 }, { "epoch": 1.74, "grad_norm": 86.75362396240234, "learning_rate": 0.0004212237315685591, "logits/chosen": -19.966632843017578, "logits/rejected": -20.493127822875977, "logps/chosen": -2944.571533203125, "logps/rejected": -2469.33837890625, "loss": 5.2966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.66629028320312, "rewards/margins": 6.979581356048584, "rewards/rejected": -161.6458740234375, "step": 30060 }, { "epoch": 1.74, "grad_norm": 5.6845643643665955e-12, "learning_rate": 0.0004210302256279268, "logits/chosen": -18.861183166503906, "logits/rejected": -19.0775089263916, "logps/chosen": -3087.693359375, "logps/rejected": -2751.38330078125, "loss": 2.5321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.5152587890625, "rewards/margins": 15.386598587036133, "rewards/rejected": -213.90185546875, "step": 30070 }, { "epoch": 1.74, "grad_norm": 67.7591781616211, "learning_rate": 0.0004208367196872944, "logits/chosen": -16.648386001586914, "logits/rejected": -16.61369514465332, "logps/chosen": -2591.412109375, "logps/rejected": -2605.4296875, "loss": 21.6918, "rewards/accuracies": 0.5, "rewards/chosen": -163.51541137695312, "rewards/margins": -13.078542709350586, "rewards/rejected": -150.43687438964844, "step": 30080 }, { "epoch": 1.74, "grad_norm": 135.26339721679688, "learning_rate": 0.00042064321374666203, "logits/chosen": -17.855220794677734, "logits/rejected": -17.440805435180664, "logps/chosen": -2706.61962890625, "logps/rejected": -2543.275390625, "loss": 16.153, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -199.680419921875, "rewards/margins": -14.503602981567383, "rewards/rejected": -185.1768341064453, "step": 30090 }, { "epoch": 1.74, "grad_norm": 0.00022679820540361106, "learning_rate": 0.00042044970780602965, "logits/chosen": -14.091153144836426, "logits/rejected": -14.449705123901367, "logps/chosen": -2543.40869140625, "logps/rejected": -2296.671630859375, "loss": 10.6732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -84.044677734375, "rewards/margins": 2.8655283451080322, "rewards/rejected": -86.91020202636719, "step": 30100 }, { "epoch": 1.74, "grad_norm": 6.1315978427956e-07, "learning_rate": 0.00042025620186539726, "logits/chosen": -16.772663116455078, "logits/rejected": -16.461008071899414, "logps/chosen": -2871.84814453125, "logps/rejected": -2821.66845703125, "loss": 10.4822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.58560180664062, "rewards/margins": 2.070049285888672, "rewards/rejected": -186.65567016601562, "step": 30110 }, { "epoch": 1.74, "grad_norm": 87.71565246582031, "learning_rate": 0.00042006269592476494, "logits/chosen": -15.970515251159668, "logits/rejected": -15.833231925964355, "logps/chosen": -2639.11865234375, "logps/rejected": -2816.907470703125, "loss": 5.1314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.8285369873047, "rewards/margins": 1.9088188409805298, "rewards/rejected": -143.73736572265625, "step": 30120 }, { "epoch": 1.74, "grad_norm": 80.8104019165039, "learning_rate": 0.00041986918998413255, "logits/chosen": -14.302816390991211, "logits/rejected": -14.531064987182617, "logps/chosen": -2941.415283203125, "logps/rejected": -2915.981201171875, "loss": 7.9905, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -145.18736267089844, "rewards/margins": -4.6970391273498535, "rewards/rejected": -140.49032592773438, "step": 30130 }, { "epoch": 1.74, "grad_norm": 9.298746753199022e-16, "learning_rate": 0.00041967568404350017, "logits/chosen": -20.567672729492188, "logits/rejected": -21.8245792388916, "logps/chosen": -2562.404052734375, "logps/rejected": -2383.478515625, "loss": 13.4127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -186.166015625, "rewards/margins": -4.921999931335449, "rewards/rejected": -181.24398803710938, "step": 30140 }, { "epoch": 1.75, "grad_norm": 0.7608640789985657, "learning_rate": 0.0004194821781028678, "logits/chosen": -18.170244216918945, "logits/rejected": -17.88829803466797, "logps/chosen": -2617.762939453125, "logps/rejected": -2476.04931640625, "loss": 1.953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.66085815429688, "rewards/margins": 10.50587272644043, "rewards/rejected": -165.16673278808594, "step": 30150 }, { "epoch": 1.75, "grad_norm": 18.931264877319336, "learning_rate": 0.00041928867216223535, "logits/chosen": -16.528675079345703, "logits/rejected": -16.610998153686523, "logps/chosen": -2699.296875, "logps/rejected": -2541.52783203125, "loss": 2.2404, "rewards/accuracies": 0.5, "rewards/chosen": -107.05073547363281, "rewards/margins": 1.7491319179534912, "rewards/rejected": -108.79988861083984, "step": 30160 }, { "epoch": 1.75, "grad_norm": 7.963134862620791e-08, "learning_rate": 0.00041909516622160297, "logits/chosen": -15.906522750854492, "logits/rejected": -15.429611206054688, "logps/chosen": -2939.181640625, "logps/rejected": -1984.892578125, "loss": 14.2111, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.6192169189453, "rewards/margins": -4.367531776428223, "rewards/rejected": -133.25167846679688, "step": 30170 }, { "epoch": 1.75, "grad_norm": 29.748594284057617, "learning_rate": 0.00041890166028097064, "logits/chosen": -17.341289520263672, "logits/rejected": -17.781570434570312, "logps/chosen": -2983.2001953125, "logps/rejected": -2728.639404296875, "loss": 15.7765, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -152.92111206054688, "rewards/margins": -13.2945556640625, "rewards/rejected": -139.62655639648438, "step": 30180 }, { "epoch": 1.75, "grad_norm": 104.45953369140625, "learning_rate": 0.00041870815434033826, "logits/chosen": -17.579456329345703, "logits/rejected": -17.188705444335938, "logps/chosen": -2828.96728515625, "logps/rejected": -2848.06103515625, "loss": 15.0339, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -193.16299438476562, "rewards/margins": -13.701177597045898, "rewards/rejected": -179.46182250976562, "step": 30190 }, { "epoch": 1.75, "grad_norm": 98.56288146972656, "learning_rate": 0.0004185146483997059, "logits/chosen": -16.04555892944336, "logits/rejected": -16.604307174682617, "logps/chosen": -2880.01171875, "logps/rejected": -2708.359130859375, "loss": 3.1082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.69625854492188, "rewards/margins": 4.069783687591553, "rewards/rejected": -145.76605224609375, "step": 30200 }, { "epoch": 1.75, "grad_norm": 0.11193791031837463, "learning_rate": 0.0004183211424590735, "logits/chosen": -19.792985916137695, "logits/rejected": -19.801069259643555, "logps/chosen": -2599.778564453125, "logps/rejected": -2616.00634765625, "loss": 1.7955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.1746063232422, "rewards/margins": 5.4377641677856445, "rewards/rejected": -202.61236572265625, "step": 30210 }, { "epoch": 1.75, "grad_norm": 28.905603408813477, "learning_rate": 0.0004181276365184411, "logits/chosen": -14.977119445800781, "logits/rejected": -15.64702033996582, "logps/chosen": -2918.24462890625, "logps/rejected": -2707.096435546875, "loss": 17.3927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -145.20217895507812, "rewards/margins": -11.197826385498047, "rewards/rejected": -134.00436401367188, "step": 30220 }, { "epoch": 1.75, "grad_norm": 109.099853515625, "learning_rate": 0.0004179341305778088, "logits/chosen": -14.147061347961426, "logits/rejected": -14.025712966918945, "logps/chosen": -3327.35205078125, "logps/rejected": -2809.657958984375, "loss": 13.5201, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -178.75357055664062, "rewards/margins": -5.740786552429199, "rewards/rejected": -173.0127716064453, "step": 30230 }, { "epoch": 1.75, "grad_norm": 28.249820709228516, "learning_rate": 0.0004177406246371764, "logits/chosen": -15.506538391113281, "logits/rejected": -15.928133964538574, "logps/chosen": -3049.134033203125, "logps/rejected": -2945.857421875, "loss": 7.1452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -133.23146057128906, "rewards/margins": -0.3199991285800934, "rewards/rejected": -132.9114532470703, "step": 30240 }, { "epoch": 1.75, "grad_norm": 257.65936279296875, "learning_rate": 0.000417547118696544, "logits/chosen": -14.8496675491333, "logits/rejected": -14.602992057800293, "logps/chosen": -2885.935546875, "logps/rejected": -3041.15478515625, "loss": 5.4526, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.56884765625, "rewards/margins": -3.889437198638916, "rewards/rejected": -132.67941284179688, "step": 30250 }, { "epoch": 1.75, "grad_norm": 175.52647399902344, "learning_rate": 0.00041735361275591163, "logits/chosen": -17.759048461914062, "logits/rejected": -18.035297393798828, "logps/chosen": -2550.465087890625, "logps/rejected": -2455.201904296875, "loss": 1.015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.2773895263672, "rewards/margins": 4.467319965362549, "rewards/rejected": -171.74472045898438, "step": 30260 }, { "epoch": 1.75, "grad_norm": 2.4464900493621826, "learning_rate": 0.0004171601068152792, "logits/chosen": -20.26560401916504, "logits/rejected": -20.467973709106445, "logps/chosen": -2277.429443359375, "logps/rejected": -2207.70263671875, "loss": 13.05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.03439331054688, "rewards/margins": -3.4114341735839844, "rewards/rejected": -196.6229705810547, "step": 30270 }, { "epoch": 1.75, "grad_norm": 80.1262435913086, "learning_rate": 0.00041696660087464687, "logits/chosen": -16.820755004882812, "logits/rejected": -17.56528091430664, "logps/chosen": -2584.68701171875, "logps/rejected": -2493.30078125, "loss": 14.2543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -182.79733276367188, "rewards/margins": -1.7131130695343018, "rewards/rejected": -181.084228515625, "step": 30280 }, { "epoch": 1.75, "grad_norm": 2.4659392833709717, "learning_rate": 0.0004167730949340145, "logits/chosen": -15.730015754699707, "logits/rejected": -15.870150566101074, "logps/chosen": -3036.0146484375, "logps/rejected": -2736.961669921875, "loss": 4.8458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -222.33712768554688, "rewards/margins": 6.025417327880859, "rewards/rejected": -228.362548828125, "step": 30290 }, { "epoch": 1.75, "grad_norm": 6.720820426940918, "learning_rate": 0.0004165795889933821, "logits/chosen": -14.840646743774414, "logits/rejected": -15.650945663452148, "logps/chosen": -3251.53955078125, "logps/rejected": -2234.53466796875, "loss": 5.9445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.76527404785156, "rewards/margins": 5.365632057189941, "rewards/rejected": -174.13088989257812, "step": 30300 }, { "epoch": 1.75, "grad_norm": 8.865149066228373e-12, "learning_rate": 0.0004163860830527497, "logits/chosen": -14.98144245147705, "logits/rejected": -16.138113021850586, "logps/chosen": -2790.035400390625, "logps/rejected": -2725.63037109375, "loss": 4.416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.29718017578125, "rewards/margins": 23.489482879638672, "rewards/rejected": -200.78665161132812, "step": 30310 }, { "epoch": 1.76, "grad_norm": 205.55662536621094, "learning_rate": 0.00041619257711211733, "logits/chosen": -16.9327392578125, "logits/rejected": -17.44561195373535, "logps/chosen": -2754.589599609375, "logps/rejected": -2499.782470703125, "loss": 17.342, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.20457458496094, "rewards/margins": -4.816524505615234, "rewards/rejected": -174.3880615234375, "step": 30320 }, { "epoch": 1.76, "grad_norm": 1.5248444080352783, "learning_rate": 0.000415999071171485, "logits/chosen": -16.310556411743164, "logits/rejected": -16.193572998046875, "logps/chosen": -2665.348876953125, "logps/rejected": -2541.597412109375, "loss": 10.0085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.6036834716797, "rewards/margins": -3.809704303741455, "rewards/rejected": -170.79396057128906, "step": 30330 }, { "epoch": 1.76, "grad_norm": 2.243960107382566e-15, "learning_rate": 0.0004158055652308526, "logits/chosen": -11.197434425354004, "logits/rejected": -11.119063377380371, "logps/chosen": -3450.01953125, "logps/rejected": -2155.156982421875, "loss": 1.7663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.131284713745117, "rewards/margins": 17.52324867248535, "rewards/rejected": -32.65453338623047, "step": 30340 }, { "epoch": 1.76, "grad_norm": 126.79450225830078, "learning_rate": 0.00041561205929022024, "logits/chosen": -15.201461791992188, "logits/rejected": -15.135889053344727, "logps/chosen": -2621.16943359375, "logps/rejected": -2752.8701171875, "loss": 7.0157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.35772705078125, "rewards/margins": 1.035194754600525, "rewards/rejected": -126.39290618896484, "step": 30350 }, { "epoch": 1.76, "grad_norm": 0.10496117919683456, "learning_rate": 0.00041541855334958786, "logits/chosen": -16.16292953491211, "logits/rejected": -15.98869514465332, "logps/chosen": -2938.494140625, "logps/rejected": -2837.03076171875, "loss": 0.8878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.41879272460938, "rewards/margins": 13.887275695800781, "rewards/rejected": -160.30606079101562, "step": 30360 }, { "epoch": 1.76, "grad_norm": 0.21056891977787018, "learning_rate": 0.0004152250474089555, "logits/chosen": -18.393802642822266, "logits/rejected": -17.637378692626953, "logps/chosen": -2832.385986328125, "logps/rejected": -2647.259765625, "loss": 3.0148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.4778289794922, "rewards/margins": 13.958099365234375, "rewards/rejected": -187.43592834472656, "step": 30370 }, { "epoch": 1.76, "grad_norm": 52.57563781738281, "learning_rate": 0.00041503154146832304, "logits/chosen": -17.380840301513672, "logits/rejected": -17.71807098388672, "logps/chosen": -2919.88525390625, "logps/rejected": -2521.79150390625, "loss": 18.4649, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -149.90847778320312, "rewards/margins": -11.709091186523438, "rewards/rejected": -138.19937133789062, "step": 30380 }, { "epoch": 1.76, "grad_norm": 50.88251876831055, "learning_rate": 0.0004148380355276907, "logits/chosen": -19.08930015563965, "logits/rejected": -18.245040893554688, "logps/chosen": -2877.11474609375, "logps/rejected": -2862.501220703125, "loss": 4.7194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -209.7887420654297, "rewards/margins": 0.28090134263038635, "rewards/rejected": -210.06967163085938, "step": 30390 }, { "epoch": 1.76, "grad_norm": 53.433773040771484, "learning_rate": 0.0004146445295870583, "logits/chosen": -15.19794750213623, "logits/rejected": -14.90527629852295, "logps/chosen": -2985.42724609375, "logps/rejected": -2542.970703125, "loss": 4.7619, "rewards/accuracies": 0.5, "rewards/chosen": -126.13236999511719, "rewards/margins": 6.381928443908691, "rewards/rejected": -132.51429748535156, "step": 30400 }, { "epoch": 1.76, "grad_norm": 0.0, "learning_rate": 0.00041445102364642594, "logits/chosen": -17.411258697509766, "logits/rejected": -17.199514389038086, "logps/chosen": -3016.35009765625, "logps/rejected": -2533.14599609375, "loss": 11.5192, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -147.4717254638672, "rewards/margins": 2.50775146484375, "rewards/rejected": -149.97947692871094, "step": 30410 }, { "epoch": 1.76, "grad_norm": 89.18170928955078, "learning_rate": 0.00041425751770579356, "logits/chosen": -15.40345573425293, "logits/rejected": -15.370678901672363, "logps/chosen": -2650.177734375, "logps/rejected": -2506.326171875, "loss": 7.8037, "rewards/accuracies": 0.5, "rewards/chosen": -136.498291015625, "rewards/margins": 4.477115631103516, "rewards/rejected": -140.97543334960938, "step": 30420 }, { "epoch": 1.76, "grad_norm": 4.693371295928955, "learning_rate": 0.0004140640117651612, "logits/chosen": -15.502375602722168, "logits/rejected": -15.401878356933594, "logps/chosen": -2250.860107421875, "logps/rejected": -2310.669189453125, "loss": 2.4444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -112.82472229003906, "rewards/margins": 19.202899932861328, "rewards/rejected": -132.02761840820312, "step": 30430 }, { "epoch": 1.76, "grad_norm": 54.06076431274414, "learning_rate": 0.00041387050582452885, "logits/chosen": -15.73405933380127, "logits/rejected": -15.659662246704102, "logps/chosen": -2383.72412109375, "logps/rejected": -2483.610107421875, "loss": 6.2763, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -128.61294555664062, "rewards/margins": -2.0150198936462402, "rewards/rejected": -126.5979232788086, "step": 30440 }, { "epoch": 1.76, "grad_norm": 129.43215942382812, "learning_rate": 0.00041367699988389647, "logits/chosen": -14.99969482421875, "logits/rejected": -14.92724323272705, "logps/chosen": -2815.877197265625, "logps/rejected": -2269.54248046875, "loss": 29.0019, "rewards/accuracies": 0.5, "rewards/chosen": -139.36082458496094, "rewards/margins": -18.527149200439453, "rewards/rejected": -120.83366394042969, "step": 30450 }, { "epoch": 1.76, "grad_norm": 2.3319953607048305e-11, "learning_rate": 0.0004134834939432641, "logits/chosen": -15.731951713562012, "logits/rejected": -15.980201721191406, "logps/chosen": -2295.11474609375, "logps/rejected": -2568.52734375, "loss": 1.6514, "rewards/accuracies": 0.5, "rewards/chosen": -152.36106872558594, "rewards/margins": 5.52469539642334, "rewards/rejected": -157.88577270507812, "step": 30460 }, { "epoch": 1.76, "grad_norm": 0.00013261870481073856, "learning_rate": 0.0004132899880026317, "logits/chosen": -14.627519607543945, "logits/rejected": -14.793627738952637, "logps/chosen": -3177.415283203125, "logps/rejected": -3020.8193359375, "loss": 8.318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.97190856933594, "rewards/margins": 3.1141726970672607, "rewards/rejected": -129.08607482910156, "step": 30470 }, { "epoch": 1.76, "grad_norm": 9.315749048255384e-05, "learning_rate": 0.0004130964820619993, "logits/chosen": -12.79470443725586, "logits/rejected": -12.66602611541748, "logps/chosen": -3246.403076171875, "logps/rejected": -2820.682373046875, "loss": 16.6902, "rewards/accuracies": 0.5, "rewards/chosen": -103.0059814453125, "rewards/margins": -7.008180141448975, "rewards/rejected": -95.997802734375, "step": 30480 }, { "epoch": 1.76, "grad_norm": 0.38561367988586426, "learning_rate": 0.00041290297612136693, "logits/chosen": -17.768863677978516, "logits/rejected": -17.94143295288086, "logps/chosen": -3034.714111328125, "logps/rejected": -3046.7763671875, "loss": 7.1573, "rewards/accuracies": 0.5, "rewards/chosen": -177.8765869140625, "rewards/margins": -1.2063030004501343, "rewards/rejected": -176.6702880859375, "step": 30490 }, { "epoch": 1.77, "grad_norm": 104.2564468383789, "learning_rate": 0.00041270947018073455, "logits/chosen": -15.352895736694336, "logits/rejected": -15.312932968139648, "logps/chosen": -2934.03857421875, "logps/rejected": -2529.319091796875, "loss": 12.2158, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -114.2359390258789, "rewards/margins": -9.35784912109375, "rewards/rejected": -104.87808990478516, "step": 30500 }, { "epoch": 1.77, "grad_norm": 0.2523304224014282, "learning_rate": 0.00041251596424010217, "logits/chosen": -15.40868091583252, "logits/rejected": -15.134069442749023, "logps/chosen": -2652.44287109375, "logps/rejected": -2881.17333984375, "loss": 5.4438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.51383972167969, "rewards/margins": 2.450103282928467, "rewards/rejected": -125.96394348144531, "step": 30510 }, { "epoch": 1.77, "grad_norm": 0.007190486881881952, "learning_rate": 0.0004123224582994698, "logits/chosen": -13.147997856140137, "logits/rejected": -12.867033004760742, "logps/chosen": -3355.302001953125, "logps/rejected": -2645.816162109375, "loss": 13.2523, "rewards/accuracies": 0.5, "rewards/chosen": -112.11075592041016, "rewards/margins": 9.2252836227417, "rewards/rejected": -121.33604431152344, "step": 30520 }, { "epoch": 1.77, "grad_norm": 89.58148956298828, "learning_rate": 0.0004121289523588374, "logits/chosen": -12.516956329345703, "logits/rejected": -13.005073547363281, "logps/chosen": -3103.232177734375, "logps/rejected": -2821.28955078125, "loss": 16.6524, "rewards/accuracies": 0.5, "rewards/chosen": -103.27494049072266, "rewards/margins": -9.180304527282715, "rewards/rejected": -94.09463500976562, "step": 30530 }, { "epoch": 1.77, "grad_norm": 0.3022933900356293, "learning_rate": 0.0004119354464182051, "logits/chosen": -15.668081283569336, "logits/rejected": -15.926678657531738, "logps/chosen": -2982.55224609375, "logps/rejected": -2848.16748046875, "loss": 0.5974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.55101013183594, "rewards/margins": 4.488341331481934, "rewards/rejected": -158.0393524169922, "step": 30540 }, { "epoch": 1.77, "grad_norm": 62.50892639160156, "learning_rate": 0.0004117419404775727, "logits/chosen": -18.499675750732422, "logits/rejected": -19.411231994628906, "logps/chosen": -2606.112548828125, "logps/rejected": -2452.288330078125, "loss": 2.6355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.0611572265625, "rewards/margins": 16.80299949645996, "rewards/rejected": -206.8641357421875, "step": 30550 }, { "epoch": 1.77, "grad_norm": 0.4261592924594879, "learning_rate": 0.0004115484345369403, "logits/chosen": -17.451358795166016, "logits/rejected": -17.229129791259766, "logps/chosen": -2965.57958984375, "logps/rejected": -2619.908447265625, "loss": 2.3888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.5962142944336, "rewards/margins": 32.815311431884766, "rewards/rejected": -158.41152954101562, "step": 30560 }, { "epoch": 1.77, "grad_norm": 3.6984504052740596e-19, "learning_rate": 0.0004113549285963079, "logits/chosen": -16.60853385925293, "logits/rejected": -16.022342681884766, "logps/chosen": -2946.88134765625, "logps/rejected": -2925.69287109375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -139.27780151367188, "rewards/margins": 20.194255828857422, "rewards/rejected": -159.4720458984375, "step": 30570 }, { "epoch": 1.77, "grad_norm": 2.079418182373047, "learning_rate": 0.00041116142265567554, "logits/chosen": -19.61271858215332, "logits/rejected": -20.37567710876465, "logps/chosen": -2408.85400390625, "logps/rejected": -2213.88134765625, "loss": 5.9101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -152.35787963867188, "rewards/margins": -1.995659589767456, "rewards/rejected": -150.3622283935547, "step": 30580 }, { "epoch": 1.77, "grad_norm": 38.660011291503906, "learning_rate": 0.0004109679167150431, "logits/chosen": -18.7518367767334, "logits/rejected": -20.438928604125977, "logps/chosen": -2503.57080078125, "logps/rejected": -2531.379150390625, "loss": 4.5264, "rewards/accuracies": 0.5, "rewards/chosen": -174.6894073486328, "rewards/margins": 5.514653205871582, "rewards/rejected": -180.2040557861328, "step": 30590 }, { "epoch": 1.77, "grad_norm": 31.9985294342041, "learning_rate": 0.0004107744107744108, "logits/chosen": -16.10824966430664, "logits/rejected": -16.316675186157227, "logps/chosen": -2805.26025390625, "logps/rejected": -2705.13037109375, "loss": 2.9486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -107.92713928222656, "rewards/margins": 10.462183952331543, "rewards/rejected": -118.38932037353516, "step": 30600 }, { "epoch": 1.77, "grad_norm": 51.57191467285156, "learning_rate": 0.0004105809048337784, "logits/chosen": -17.040435791015625, "logits/rejected": -17.50628089904785, "logps/chosen": -2987.478271484375, "logps/rejected": -2693.360107421875, "loss": 5.2051, "rewards/accuracies": 0.5, "rewards/chosen": -197.9112091064453, "rewards/margins": 0.08090992271900177, "rewards/rejected": -197.9921112060547, "step": 30610 }, { "epoch": 1.77, "grad_norm": 60.23801040649414, "learning_rate": 0.000410387398893146, "logits/chosen": -17.02420425415039, "logits/rejected": -17.0806827545166, "logps/chosen": -2775.09423828125, "logps/rejected": -2666.91650390625, "loss": 5.5741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.97903442382812, "rewards/margins": -0.23981475830078125, "rewards/rejected": -170.7392120361328, "step": 30620 }, { "epoch": 1.77, "grad_norm": 1.566620314377709e-12, "learning_rate": 0.00041019389295251363, "logits/chosen": -16.303802490234375, "logits/rejected": -16.08913803100586, "logps/chosen": -2533.19287109375, "logps/rejected": -2222.10009765625, "loss": 7.5542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -126.70603942871094, "rewards/margins": 7.656754493713379, "rewards/rejected": -134.36279296875, "step": 30630 }, { "epoch": 1.77, "grad_norm": 8.361948132783243e-20, "learning_rate": 0.00041000038701188125, "logits/chosen": -17.640499114990234, "logits/rejected": -17.708011627197266, "logps/chosen": -2621.66552734375, "logps/rejected": -2591.876708984375, "loss": 3.2442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.96340942382812, "rewards/margins": 12.482159614562988, "rewards/rejected": -166.44557189941406, "step": 30640 }, { "epoch": 1.77, "grad_norm": 30.350391387939453, "learning_rate": 0.0004098068810712489, "logits/chosen": -15.803506851196289, "logits/rejected": -16.202653884887695, "logps/chosen": -2624.547119140625, "logps/rejected": -2585.59326171875, "loss": 9.1359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -147.67453002929688, "rewards/margins": -3.290088653564453, "rewards/rejected": -144.3844451904297, "step": 30650 }, { "epoch": 1.77, "grad_norm": 1.989998388296499e-09, "learning_rate": 0.00040961337513061653, "logits/chosen": -20.633081436157227, "logits/rejected": -20.3928279876709, "logps/chosen": -2776.044189453125, "logps/rejected": -2144.51025390625, "loss": 41.2088, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -207.2335968017578, "rewards/margins": -35.11433029174805, "rewards/rejected": -172.1192626953125, "step": 30660 }, { "epoch": 1.78, "grad_norm": 6.5048483093022025e-12, "learning_rate": 0.00040941986918998415, "logits/chosen": -18.774442672729492, "logits/rejected": -19.191896438598633, "logps/chosen": -2498.540283203125, "logps/rejected": -2416.720458984375, "loss": 7.4243, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -146.32302856445312, "rewards/margins": 0.5611938238143921, "rewards/rejected": -146.88424682617188, "step": 30670 }, { "epoch": 1.78, "grad_norm": 87.42044067382812, "learning_rate": 0.00040922636324935177, "logits/chosen": -13.577413558959961, "logits/rejected": -13.718976974487305, "logps/chosen": -3007.528076171875, "logps/rejected": -2653.53857421875, "loss": 10.8845, "rewards/accuracies": 0.5, "rewards/chosen": -92.30068969726562, "rewards/margins": 0.528119683265686, "rewards/rejected": -92.82881164550781, "step": 30680 }, { "epoch": 1.78, "grad_norm": 63.81200408935547, "learning_rate": 0.0004090328573087194, "logits/chosen": -17.516401290893555, "logits/rejected": -17.33510971069336, "logps/chosen": -2594.32763671875, "logps/rejected": -2206.97607421875, "loss": 8.3822, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.9281005859375, "rewards/margins": 1.3107631206512451, "rewards/rejected": -159.23887634277344, "step": 30690 }, { "epoch": 1.78, "grad_norm": 1.5038477613416035e-05, "learning_rate": 0.00040883935136808706, "logits/chosen": -15.249902725219727, "logits/rejected": -14.948053359985352, "logps/chosen": -2717.889892578125, "logps/rejected": -2879.90625, "loss": 5.3597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.77490234375, "rewards/margins": 9.629899978637695, "rewards/rejected": -131.40480041503906, "step": 30700 }, { "epoch": 1.78, "grad_norm": 62.77493667602539, "learning_rate": 0.0004086458454274546, "logits/chosen": -16.218563079833984, "logits/rejected": -16.191425323486328, "logps/chosen": -2622.658935546875, "logps/rejected": -2643.42919921875, "loss": 5.3356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -154.34024047851562, "rewards/margins": 5.516722679138184, "rewards/rejected": -159.85696411132812, "step": 30710 }, { "epoch": 1.78, "grad_norm": 93.96603393554688, "learning_rate": 0.00040845233948682224, "logits/chosen": -13.517064094543457, "logits/rejected": -13.545320510864258, "logps/chosen": -3082.0673828125, "logps/rejected": -2908.417236328125, "loss": 2.702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.52117919921875, "rewards/margins": 6.086508750915527, "rewards/rejected": -199.60768127441406, "step": 30720 }, { "epoch": 1.78, "grad_norm": 1.7950640085473424e-07, "learning_rate": 0.00040825883354618986, "logits/chosen": -14.611854553222656, "logits/rejected": -14.715303421020508, "logps/chosen": -2910.06787109375, "logps/rejected": -2823.813720703125, "loss": 4.8413, "rewards/accuracies": 0.5, "rewards/chosen": -173.27017211914062, "rewards/margins": 0.2844018042087555, "rewards/rejected": -173.55458068847656, "step": 30730 }, { "epoch": 1.78, "grad_norm": 0.02504279464483261, "learning_rate": 0.00040806532760555747, "logits/chosen": -16.43269920349121, "logits/rejected": -16.871171951293945, "logps/chosen": -2721.52783203125, "logps/rejected": -2467.71142578125, "loss": 0.8803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.15716552734375, "rewards/margins": 10.397924423217773, "rewards/rejected": -208.55508422851562, "step": 30740 }, { "epoch": 1.78, "grad_norm": 0.07237151265144348, "learning_rate": 0.00040787182166492514, "logits/chosen": -15.715283393859863, "logits/rejected": -16.118921279907227, "logps/chosen": -2550.015869140625, "logps/rejected": -2503.05712890625, "loss": 2.3471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.084228515625, "rewards/margins": 6.217211723327637, "rewards/rejected": -183.30142211914062, "step": 30750 }, { "epoch": 1.78, "grad_norm": 0.0430399551987648, "learning_rate": 0.00040767831572429276, "logits/chosen": -16.138896942138672, "logits/rejected": -16.360292434692383, "logps/chosen": -2816.705078125, "logps/rejected": -2584.4140625, "loss": 3.1227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.18325805664062, "rewards/margins": 8.083005905151367, "rewards/rejected": -218.2662353515625, "step": 30760 }, { "epoch": 1.78, "grad_norm": 73.0302505493164, "learning_rate": 0.0004074848097836604, "logits/chosen": -14.502874374389648, "logits/rejected": -14.415003776550293, "logps/chosen": -2836.8466796875, "logps/rejected": -2388.19970703125, "loss": 19.6535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -140.16302490234375, "rewards/margins": -0.5697418451309204, "rewards/rejected": -139.59329223632812, "step": 30770 }, { "epoch": 1.78, "grad_norm": 1.0089033998701442e-14, "learning_rate": 0.000407291303843028, "logits/chosen": -14.058502197265625, "logits/rejected": -14.0760498046875, "logps/chosen": -2708.50830078125, "logps/rejected": -2304.916259765625, "loss": 2.9087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -127.92793273925781, "rewards/margins": 14.996699333190918, "rewards/rejected": -142.92462158203125, "step": 30780 }, { "epoch": 1.78, "grad_norm": 0.18008925020694733, "learning_rate": 0.0004070977979023956, "logits/chosen": -15.241693496704102, "logits/rejected": -15.4752197265625, "logps/chosen": -2586.67041015625, "logps/rejected": -2516.333251953125, "loss": 4.0101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.36380004882812, "rewards/margins": 1.9289499521255493, "rewards/rejected": -156.29275512695312, "step": 30790 }, { "epoch": 1.78, "grad_norm": 2.559598613061098e-07, "learning_rate": 0.0004069042919617633, "logits/chosen": -13.936111450195312, "logits/rejected": -13.737451553344727, "logps/chosen": -2547.4892578125, "logps/rejected": -2535.630126953125, "loss": 8.2812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -142.93556213378906, "rewards/margins": -0.982235312461853, "rewards/rejected": -141.9533233642578, "step": 30800 }, { "epoch": 1.78, "grad_norm": 140.68797302246094, "learning_rate": 0.0004067107860211309, "logits/chosen": -15.568857192993164, "logits/rejected": -15.21113395690918, "logps/chosen": -2412.96728515625, "logps/rejected": -2259.43505859375, "loss": 2.954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.55929565429688, "rewards/margins": 12.961004257202148, "rewards/rejected": -190.52032470703125, "step": 30810 }, { "epoch": 1.78, "grad_norm": 0.0009339528041891754, "learning_rate": 0.00040651728008049846, "logits/chosen": -14.91472053527832, "logits/rejected": -14.59422779083252, "logps/chosen": -2599.36572265625, "logps/rejected": -2441.875, "loss": 2.8716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.29469299316406, "rewards/margins": 9.891931533813477, "rewards/rejected": -155.18661499023438, "step": 30820 }, { "epoch": 1.78, "grad_norm": 35.93717575073242, "learning_rate": 0.0004063237741398661, "logits/chosen": -14.067181587219238, "logits/rejected": -14.042455673217773, "logps/chosen": -2354.64697265625, "logps/rejected": -2239.6083984375, "loss": 7.5868, "rewards/accuracies": 0.5, "rewards/chosen": -106.91264343261719, "rewards/margins": -0.30287933349609375, "rewards/rejected": -106.60975646972656, "step": 30830 }, { "epoch": 1.79, "grad_norm": 0.006475809961557388, "learning_rate": 0.0004061302681992337, "logits/chosen": -12.01591968536377, "logits/rejected": -11.962137222290039, "logps/chosen": -3112.846435546875, "logps/rejected": -2667.23681640625, "loss": 1.579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -58.355499267578125, "rewards/margins": 14.065610885620117, "rewards/rejected": -72.42110443115234, "step": 30840 }, { "epoch": 1.79, "grad_norm": 1.1154807078028739e-09, "learning_rate": 0.0004059367622586013, "logits/chosen": -15.13068962097168, "logits/rejected": -15.904536247253418, "logps/chosen": -2967.254638671875, "logps/rejected": -2814.47802734375, "loss": 21.6548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.5366973876953, "rewards/margins": -8.437273025512695, "rewards/rejected": -140.09942626953125, "step": 30850 }, { "epoch": 1.79, "grad_norm": 66.0483627319336, "learning_rate": 0.000405743256317969, "logits/chosen": -15.905660629272461, "logits/rejected": -16.449668884277344, "logps/chosen": -2187.284423828125, "logps/rejected": -2191.19580078125, "loss": 8.3675, "rewards/accuracies": 0.5, "rewards/chosen": -143.85458374023438, "rewards/margins": 12.814679145812988, "rewards/rejected": -156.66925048828125, "step": 30860 }, { "epoch": 1.79, "grad_norm": 221.6697540283203, "learning_rate": 0.0004055497503773366, "logits/chosen": -17.387975692749023, "logits/rejected": -18.31478500366211, "logps/chosen": -2554.2421875, "logps/rejected": -2710.84375, "loss": 18.3517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.74417114257812, "rewards/margins": -12.417057991027832, "rewards/rejected": -146.3271026611328, "step": 30870 }, { "epoch": 1.79, "grad_norm": 0.0006581300403922796, "learning_rate": 0.0004053562444367042, "logits/chosen": -14.694246292114258, "logits/rejected": -14.792137145996094, "logps/chosen": -2762.908935546875, "logps/rejected": -2889.43408203125, "loss": 7.7143, "rewards/accuracies": 0.5, "rewards/chosen": -116.13448333740234, "rewards/margins": -1.2488548755645752, "rewards/rejected": -114.88563537597656, "step": 30880 }, { "epoch": 1.79, "grad_norm": 98.06092071533203, "learning_rate": 0.00040516273849607184, "logits/chosen": -14.817319869995117, "logits/rejected": -15.22807788848877, "logps/chosen": -2854.49560546875, "logps/rejected": -2731.123779296875, "loss": 9.8188, "rewards/accuracies": 0.5, "rewards/chosen": -206.40402221679688, "rewards/margins": -3.698021650314331, "rewards/rejected": -202.7060089111328, "step": 30890 }, { "epoch": 1.79, "grad_norm": 2.2322804937857654e-08, "learning_rate": 0.00040496923255543946, "logits/chosen": -14.68041706085205, "logits/rejected": -14.647786140441895, "logps/chosen": -2940.963134765625, "logps/rejected": -2815.52099609375, "loss": 10.2471, "rewards/accuracies": 0.5, "rewards/chosen": -178.02169799804688, "rewards/margins": -2.934558153152466, "rewards/rejected": -175.08717346191406, "step": 30900 }, { "epoch": 1.79, "grad_norm": 37.480812072753906, "learning_rate": 0.00040477572661480713, "logits/chosen": -12.967000007629395, "logits/rejected": -12.710638046264648, "logps/chosen": -2923.279541015625, "logps/rejected": -2796.747314453125, "loss": 7.6512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.41693115234375, "rewards/margins": 2.73686146736145, "rewards/rejected": -176.15380859375, "step": 30910 }, { "epoch": 1.79, "grad_norm": 68.53252410888672, "learning_rate": 0.0004045822206741747, "logits/chosen": -15.314798355102539, "logits/rejected": -15.298646926879883, "logps/chosen": -2407.21728515625, "logps/rejected": -2617.44189453125, "loss": 0.8472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.0460205078125, "rewards/margins": 13.664451599121094, "rewards/rejected": -202.71047973632812, "step": 30920 }, { "epoch": 1.79, "grad_norm": 221.3947296142578, "learning_rate": 0.0004043887147335423, "logits/chosen": -13.139180183410645, "logits/rejected": -12.85363483428955, "logps/chosen": -3295.705078125, "logps/rejected": -2670.793701171875, "loss": 13.3973, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.44139099121094, "rewards/margins": -5.069121837615967, "rewards/rejected": -153.3722686767578, "step": 30930 }, { "epoch": 1.79, "grad_norm": 1.0814834240591154e-05, "learning_rate": 0.0004041952087929099, "logits/chosen": -12.546656608581543, "logits/rejected": -12.38755989074707, "logps/chosen": -3143.900634765625, "logps/rejected": -2795.44287109375, "loss": 2.8767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -137.15525817871094, "rewards/margins": 4.1909356117248535, "rewards/rejected": -141.34620666503906, "step": 30940 }, { "epoch": 1.79, "grad_norm": 1.4174713919984372e-12, "learning_rate": 0.00040400170285227754, "logits/chosen": -13.02075481414795, "logits/rejected": -12.499639511108398, "logps/chosen": -3014.8330078125, "logps/rejected": -2790.457763671875, "loss": 18.2818, "rewards/accuracies": 0.5, "rewards/chosen": -171.5936737060547, "rewards/margins": -11.78349494934082, "rewards/rejected": -159.81019592285156, "step": 30950 }, { "epoch": 1.79, "grad_norm": 0.036905061453580856, "learning_rate": 0.0004038081969116452, "logits/chosen": -14.4811429977417, "logits/rejected": -14.780652046203613, "logps/chosen": -2839.288818359375, "logps/rejected": -2786.28662109375, "loss": 3.1559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.75094604492188, "rewards/margins": 4.255339622497559, "rewards/rejected": -172.00628662109375, "step": 30960 }, { "epoch": 1.79, "grad_norm": 67.30625915527344, "learning_rate": 0.00040361469097101283, "logits/chosen": -17.507389068603516, "logits/rejected": -17.42679214477539, "logps/chosen": -2815.927001953125, "logps/rejected": -2458.863037109375, "loss": 13.8394, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.67147827148438, "rewards/margins": 1.1130958795547485, "rewards/rejected": -171.78457641601562, "step": 30970 }, { "epoch": 1.79, "grad_norm": 1.2159286078006204e-10, "learning_rate": 0.00040342118503038045, "logits/chosen": -18.13182258605957, "logits/rejected": -19.059062957763672, "logps/chosen": -2352.049560546875, "logps/rejected": -2325.31005859375, "loss": 18.4048, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.40170288085938, "rewards/margins": -8.481526374816895, "rewards/rejected": -155.92019653320312, "step": 30980 }, { "epoch": 1.79, "grad_norm": 1.0002938181810726e-15, "learning_rate": 0.00040322767908974806, "logits/chosen": -17.935108184814453, "logits/rejected": -17.98470115661621, "logps/chosen": -3044.27001953125, "logps/rejected": -2867.558349609375, "loss": 8.3452, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -196.08935546875, "rewards/margins": 0.7087963223457336, "rewards/rejected": -196.7981414794922, "step": 30990 }, { "epoch": 1.79, "grad_norm": 123.01289367675781, "learning_rate": 0.0004030341731491157, "logits/chosen": -18.31087875366211, "logits/rejected": -18.45221519470215, "logps/chosen": -2431.281494140625, "logps/rejected": -2548.59619140625, "loss": 8.2594, "rewards/accuracies": 0.5, "rewards/chosen": -197.52188110351562, "rewards/margins": 6.928391456604004, "rewards/rejected": -204.45028686523438, "step": 31000 }, { "epoch": 1.79, "grad_norm": 1.2030372619628906, "learning_rate": 0.00040284066720848335, "logits/chosen": -22.190105438232422, "logits/rejected": -23.17633628845215, "logps/chosen": -2577.321533203125, "logps/rejected": -2322.953857421875, "loss": 8.8172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.95016479492188, "rewards/margins": 10.946493148803711, "rewards/rejected": -216.8966522216797, "step": 31010 }, { "epoch": 1.8, "grad_norm": 68.46089172363281, "learning_rate": 0.00040264716126785097, "logits/chosen": -17.059261322021484, "logits/rejected": -16.975313186645508, "logps/chosen": -2369.81396484375, "logps/rejected": -2287.29931640625, "loss": 7.6811, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -158.06216430664062, "rewards/margins": -3.6484718322753906, "rewards/rejected": -154.41366577148438, "step": 31020 }, { "epoch": 1.8, "grad_norm": 1.3907973766326904, "learning_rate": 0.00040245365532721853, "logits/chosen": -17.504396438598633, "logits/rejected": -18.053401947021484, "logps/chosen": -3028.088623046875, "logps/rejected": -2823.8427734375, "loss": 3.7983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -183.0408935546875, "rewards/margins": 15.221331596374512, "rewards/rejected": -198.26220703125, "step": 31030 }, { "epoch": 1.8, "grad_norm": 102.37547302246094, "learning_rate": 0.00040226014938658615, "logits/chosen": -17.431421279907227, "logits/rejected": -18.20370101928711, "logps/chosen": -2986.874267578125, "logps/rejected": -2849.97216796875, "loss": 15.4594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -199.65292358398438, "rewards/margins": -6.180412292480469, "rewards/rejected": -193.47250366210938, "step": 31040 }, { "epoch": 1.8, "grad_norm": 69.46814727783203, "learning_rate": 0.00040206664344595377, "logits/chosen": -14.987058639526367, "logits/rejected": -14.910369873046875, "logps/chosen": -2924.12646484375, "logps/rejected": -2749.269287109375, "loss": 6.6974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.4830322265625, "rewards/margins": 3.771899700164795, "rewards/rejected": -144.25494384765625, "step": 31050 }, { "epoch": 1.8, "grad_norm": 198.7992401123047, "learning_rate": 0.0004018731375053214, "logits/chosen": -14.388920783996582, "logits/rejected": -14.321187019348145, "logps/chosen": -3466.481201171875, "logps/rejected": -2918.09814453125, "loss": 21.5597, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -146.88510131835938, "rewards/margins": -15.22215461730957, "rewards/rejected": -131.66293334960938, "step": 31060 }, { "epoch": 1.8, "grad_norm": 1.584526110320894e-08, "learning_rate": 0.00040167963156468906, "logits/chosen": -15.435856819152832, "logits/rejected": -15.740102767944336, "logps/chosen": -2704.454833984375, "logps/rejected": -2439.580810546875, "loss": 12.9087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -162.3553009033203, "rewards/margins": 0.45431214570999146, "rewards/rejected": -162.8096160888672, "step": 31070 }, { "epoch": 1.8, "grad_norm": 63.37843322753906, "learning_rate": 0.0004014861256240567, "logits/chosen": -13.521766662597656, "logits/rejected": -13.34747314453125, "logps/chosen": -3413.806640625, "logps/rejected": -3395.485595703125, "loss": 4.32, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.0287094116211, "rewards/margins": 8.336592674255371, "rewards/rejected": -132.36532592773438, "step": 31080 }, { "epoch": 1.8, "grad_norm": 53.957794189453125, "learning_rate": 0.0004012926196834243, "logits/chosen": -19.137128829956055, "logits/rejected": -18.567825317382812, "logps/chosen": -2594.725341796875, "logps/rejected": -2306.0791015625, "loss": 3.5521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.6529083251953, "rewards/margins": 5.256440162658691, "rewards/rejected": -194.9093475341797, "step": 31090 }, { "epoch": 1.8, "grad_norm": 0.00012656419130507857, "learning_rate": 0.0004010991137427919, "logits/chosen": -14.326879501342773, "logits/rejected": -14.394384384155273, "logps/chosen": -2834.6767578125, "logps/rejected": -2718.759033203125, "loss": 5.7505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.71183013916016, "rewards/margins": 4.313835144042969, "rewards/rejected": -108.0256576538086, "step": 31100 }, { "epoch": 1.8, "grad_norm": 8.746543740269079e-16, "learning_rate": 0.0004009056078021595, "logits/chosen": -16.50096893310547, "logits/rejected": -16.70749855041504, "logps/chosen": -3089.08984375, "logps/rejected": -2957.744140625, "loss": 0.8303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.27169799804688, "rewards/margins": 15.745416641235352, "rewards/rejected": -212.0171356201172, "step": 31110 }, { "epoch": 1.8, "grad_norm": 114.28863525390625, "learning_rate": 0.0004007121018615272, "logits/chosen": -16.724889755249023, "logits/rejected": -16.935850143432617, "logps/chosen": -2618.29833984375, "logps/rejected": -2704.79345703125, "loss": 3.167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.28787231445312, "rewards/margins": 14.24732494354248, "rewards/rejected": -202.53518676757812, "step": 31120 }, { "epoch": 1.8, "grad_norm": 3.687067985534668, "learning_rate": 0.0004005185959208948, "logits/chosen": -16.44700813293457, "logits/rejected": -16.734283447265625, "logps/chosen": -3016.55517578125, "logps/rejected": -3040.76806640625, "loss": 7.9652, "rewards/accuracies": 0.5, "rewards/chosen": -192.00863647460938, "rewards/margins": 6.903430938720703, "rewards/rejected": -198.91209411621094, "step": 31130 }, { "epoch": 1.8, "grad_norm": 0.22975975275039673, "learning_rate": 0.0004003250899802624, "logits/chosen": -15.966192245483398, "logits/rejected": -15.856823921203613, "logps/chosen": -2567.123046875, "logps/rejected": -2510.222412109375, "loss": 4.8164, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.69607543945312, "rewards/margins": -0.12282256782054901, "rewards/rejected": -182.5732421875, "step": 31140 }, { "epoch": 1.8, "grad_norm": 0.0, "learning_rate": 0.00040013158403963, "logits/chosen": -17.36098289489746, "logits/rejected": -17.21573257446289, "logps/chosen": -2459.864501953125, "logps/rejected": -2370.14306640625, "loss": 6.6566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.10826110839844, "rewards/margins": 6.054991722106934, "rewards/rejected": -148.1632537841797, "step": 31150 }, { "epoch": 1.8, "grad_norm": 88.64459991455078, "learning_rate": 0.0003999380780989976, "logits/chosen": -16.849393844604492, "logits/rejected": -16.585628509521484, "logps/chosen": -2553.055419921875, "logps/rejected": -2142.830810546875, "loss": 14.6501, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -171.59100341796875, "rewards/margins": -9.620214462280273, "rewards/rejected": -161.9707794189453, "step": 31160 }, { "epoch": 1.8, "grad_norm": 0.0005929088802076876, "learning_rate": 0.0003997445721583653, "logits/chosen": -16.17510223388672, "logits/rejected": -16.272518157958984, "logps/chosen": -2815.17529296875, "logps/rejected": -2839.733642578125, "loss": 5.4638, "rewards/accuracies": 0.5, "rewards/chosen": -182.63455200195312, "rewards/margins": -1.7820113897323608, "rewards/rejected": -180.85255432128906, "step": 31170 }, { "epoch": 1.8, "grad_norm": 4.1688230645320346e-10, "learning_rate": 0.0003995510662177329, "logits/chosen": -14.44990062713623, "logits/rejected": -14.477714538574219, "logps/chosen": -3066.42138671875, "logps/rejected": -2662.56982421875, "loss": 8.5862, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -117.1259765625, "rewards/margins": -2.1320478916168213, "rewards/rejected": -114.99393463134766, "step": 31180 }, { "epoch": 1.81, "grad_norm": 0.2586377263069153, "learning_rate": 0.0003993575602771005, "logits/chosen": -14.178644180297852, "logits/rejected": -14.109919548034668, "logps/chosen": -3104.23193359375, "logps/rejected": -2958.48046875, "loss": 4.7919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -129.0902557373047, "rewards/margins": 11.974752426147461, "rewards/rejected": -141.0650177001953, "step": 31190 }, { "epoch": 1.81, "grad_norm": 103.04878234863281, "learning_rate": 0.00039916405433646813, "logits/chosen": -19.04923439025879, "logits/rejected": -18.792171478271484, "logps/chosen": -2566.05224609375, "logps/rejected": -2620.400390625, "loss": 24.914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -214.55117797851562, "rewards/margins": -14.594761848449707, "rewards/rejected": -199.95640563964844, "step": 31200 }, { "epoch": 1.81, "grad_norm": 0.0, "learning_rate": 0.00039897054839583575, "logits/chosen": -14.777178764343262, "logits/rejected": -14.474327087402344, "logps/chosen": -2758.2705078125, "logps/rejected": -2374.712158203125, "loss": 11.5617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.31642150878906, "rewards/margins": 0.5936450958251953, "rewards/rejected": -145.91006469726562, "step": 31210 }, { "epoch": 1.81, "grad_norm": 7.1190547714650165e-06, "learning_rate": 0.0003987770424552034, "logits/chosen": -14.800642013549805, "logits/rejected": -14.456273078918457, "logps/chosen": -3242.324951171875, "logps/rejected": -3133.94482421875, "loss": 4.437, "rewards/accuracies": 0.5, "rewards/chosen": -138.80270385742188, "rewards/margins": 2.4788401126861572, "rewards/rejected": -141.28152465820312, "step": 31220 }, { "epoch": 1.81, "grad_norm": 36.7863883972168, "learning_rate": 0.00039858353651457104, "logits/chosen": -15.706860542297363, "logits/rejected": -15.593363761901855, "logps/chosen": -2682.3359375, "logps/rejected": -2581.4072265625, "loss": 5.4219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.7429962158203, "rewards/margins": -0.19264373183250427, "rewards/rejected": -166.55035400390625, "step": 31230 }, { "epoch": 1.81, "grad_norm": 79.47103118896484, "learning_rate": 0.00039839003057393866, "logits/chosen": -15.8025541305542, "logits/rejected": -15.54216480255127, "logps/chosen": -2614.776611328125, "logps/rejected": -2647.20068359375, "loss": 12.5646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.9346160888672, "rewards/margins": -6.271763801574707, "rewards/rejected": -164.662841796875, "step": 31240 }, { "epoch": 1.81, "grad_norm": 2.733325787929175e-10, "learning_rate": 0.0003981965246333062, "logits/chosen": -17.113718032836914, "logits/rejected": -17.54981231689453, "logps/chosen": -2434.33349609375, "logps/rejected": -2342.958984375, "loss": 1.3918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.8910675048828, "rewards/margins": 5.739875316619873, "rewards/rejected": -210.63095092773438, "step": 31250 }, { "epoch": 1.81, "grad_norm": 5.486431746248854e-06, "learning_rate": 0.00039800301869267384, "logits/chosen": -14.524282455444336, "logits/rejected": -14.677576065063477, "logps/chosen": -2888.642578125, "logps/rejected": -2849.27783203125, "loss": 3.747, "rewards/accuracies": 0.5, "rewards/chosen": -111.1053237915039, "rewards/margins": 1.60244619846344, "rewards/rejected": -112.707763671875, "step": 31260 }, { "epoch": 1.81, "grad_norm": 67.04315185546875, "learning_rate": 0.00039780951275204145, "logits/chosen": -17.86736297607422, "logits/rejected": -17.31888771057129, "logps/chosen": -2954.166748046875, "logps/rejected": -2758.82763671875, "loss": 3.7239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -233.3530731201172, "rewards/margins": 0.3331008851528168, "rewards/rejected": -233.68618774414062, "step": 31270 }, { "epoch": 1.81, "grad_norm": 0.03463875502347946, "learning_rate": 0.0003976160068114091, "logits/chosen": -16.609394073486328, "logits/rejected": -16.45401382446289, "logps/chosen": -2716.93896484375, "logps/rejected": -2382.396240234375, "loss": 19.549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.11354064941406, "rewards/margins": -10.122172355651855, "rewards/rejected": -123.9913558959961, "step": 31280 }, { "epoch": 1.81, "grad_norm": 131.27603149414062, "learning_rate": 0.00039742250087077674, "logits/chosen": -15.91014575958252, "logits/rejected": -15.86406421661377, "logps/chosen": -2818.82666015625, "logps/rejected": -2866.23486328125, "loss": 5.8606, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -187.40049743652344, "rewards/margins": -3.0272953510284424, "rewards/rejected": -184.37319946289062, "step": 31290 }, { "epoch": 1.81, "grad_norm": 123.98382568359375, "learning_rate": 0.00039722899493014436, "logits/chosen": -16.662031173706055, "logits/rejected": -16.716136932373047, "logps/chosen": -2582.19091796875, "logps/rejected": -2563.066162109375, "loss": 3.5394, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.71986389160156, "rewards/margins": 3.5627589225769043, "rewards/rejected": -164.28262329101562, "step": 31300 }, { "epoch": 1.81, "grad_norm": 46.63457489013672, "learning_rate": 0.000397035488989512, "logits/chosen": -16.0484561920166, "logits/rejected": -15.553112983703613, "logps/chosen": -2823.45654296875, "logps/rejected": -2441.32568359375, "loss": 19.0096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -177.36436462402344, "rewards/margins": -12.090660095214844, "rewards/rejected": -165.27371215820312, "step": 31310 }, { "epoch": 1.81, "grad_norm": 0.005186609458178282, "learning_rate": 0.0003968419830488796, "logits/chosen": -16.319984436035156, "logits/rejected": -16.492401123046875, "logps/chosen": -2308.2021484375, "logps/rejected": -2287.506591796875, "loss": 3.3452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.48443603515625, "rewards/margins": 3.232773542404175, "rewards/rejected": -164.7172088623047, "step": 31320 }, { "epoch": 1.81, "grad_norm": 34.69977951049805, "learning_rate": 0.00039664847710824727, "logits/chosen": -14.480422973632812, "logits/rejected": -14.384821891784668, "logps/chosen": -2637.929443359375, "logps/rejected": -2403.853515625, "loss": 2.4067, "rewards/accuracies": 0.5, "rewards/chosen": -151.1294403076172, "rewards/margins": 11.132875442504883, "rewards/rejected": -162.26231384277344, "step": 31330 }, { "epoch": 1.81, "grad_norm": 1.8354554640609422e-06, "learning_rate": 0.0003964549711676149, "logits/chosen": -15.187368392944336, "logits/rejected": -15.060359001159668, "logps/chosen": -2973.386474609375, "logps/rejected": -2791.628173828125, "loss": 1.254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.98036193847656, "rewards/margins": 15.503766059875488, "rewards/rejected": -172.48411560058594, "step": 31340 }, { "epoch": 1.81, "grad_norm": 0.0, "learning_rate": 0.0003962614652269825, "logits/chosen": -18.63626480102539, "logits/rejected": -18.580469131469727, "logps/chosen": -2371.715087890625, "logps/rejected": -2675.08935546875, "loss": 8.5863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.84365844726562, "rewards/margins": 15.652928352355957, "rewards/rejected": -177.49658203125, "step": 31350 }, { "epoch": 1.82, "grad_norm": 0.0, "learning_rate": 0.00039606795928635006, "logits/chosen": -14.742939949035645, "logits/rejected": -14.692463874816895, "logps/chosen": -2465.69091796875, "logps/rejected": -2407.724609375, "loss": 6.5847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.904296875, "rewards/margins": 14.257379531860352, "rewards/rejected": -148.16168212890625, "step": 31360 }, { "epoch": 1.82, "grad_norm": 1.5486826152797507e-17, "learning_rate": 0.0003958744533457177, "logits/chosen": -12.835458755493164, "logits/rejected": -12.893847465515137, "logps/chosen": -3051.11083984375, "logps/rejected": -3189.395263671875, "loss": 2.6416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -90.64574432373047, "rewards/margins": 9.660019874572754, "rewards/rejected": -100.30577087402344, "step": 31370 }, { "epoch": 1.82, "grad_norm": 22.264549255371094, "learning_rate": 0.00039568094740508535, "logits/chosen": -16.665311813354492, "logits/rejected": -16.60592269897461, "logps/chosen": -2627.491455078125, "logps/rejected": -2737.1298828125, "loss": 2.2566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.64625549316406, "rewards/margins": 6.765772342681885, "rewards/rejected": -168.4120330810547, "step": 31380 }, { "epoch": 1.82, "grad_norm": 65.90538787841797, "learning_rate": 0.00039548744146445297, "logits/chosen": -14.33240795135498, "logits/rejected": -15.55017375946045, "logps/chosen": -3271.47119140625, "logps/rejected": -3055.006103515625, "loss": 2.8075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -112.89265441894531, "rewards/margins": 17.30678939819336, "rewards/rejected": -130.19944763183594, "step": 31390 }, { "epoch": 1.82, "grad_norm": 100.44717407226562, "learning_rate": 0.0003952939355238206, "logits/chosen": -14.86755084991455, "logits/rejected": -14.994575500488281, "logps/chosen": -2429.090087890625, "logps/rejected": -2366.603759765625, "loss": 8.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.42727661132812, "rewards/margins": -2.7072372436523438, "rewards/rejected": -163.7200469970703, "step": 31400 }, { "epoch": 1.82, "grad_norm": 108.6537094116211, "learning_rate": 0.0003951004295831882, "logits/chosen": -17.924610137939453, "logits/rejected": -17.816926956176758, "logps/chosen": -2203.55078125, "logps/rejected": -2031.1845703125, "loss": 22.1976, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.51809692382812, "rewards/margins": -16.54070472717285, "rewards/rejected": -149.97740173339844, "step": 31410 }, { "epoch": 1.82, "grad_norm": 7.516884215874597e-05, "learning_rate": 0.0003949069236425558, "logits/chosen": -17.53067398071289, "logits/rejected": -17.394357681274414, "logps/chosen": -2395.427978515625, "logps/rejected": -2342.57080078125, "loss": 3.4545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.75369262695312, "rewards/margins": 8.352383613586426, "rewards/rejected": -168.1060791015625, "step": 31420 }, { "epoch": 1.82, "grad_norm": 30.550146102905273, "learning_rate": 0.0003947134177019235, "logits/chosen": -19.044437408447266, "logits/rejected": -19.404285430908203, "logps/chosen": -2874.5859375, "logps/rejected": -2807.139404296875, "loss": 11.4646, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -174.54824829101562, "rewards/margins": -8.327624320983887, "rewards/rejected": -166.22061157226562, "step": 31430 }, { "epoch": 1.82, "grad_norm": 0.0007947334670461714, "learning_rate": 0.0003945199117612911, "logits/chosen": -20.860637664794922, "logits/rejected": -21.14228057861328, "logps/chosen": -3056.70654296875, "logps/rejected": -2753.491455078125, "loss": 11.6397, "rewards/accuracies": 0.5, "rewards/chosen": -193.46995544433594, "rewards/margins": -7.338271141052246, "rewards/rejected": -186.13168334960938, "step": 31440 }, { "epoch": 1.82, "grad_norm": 474.9881896972656, "learning_rate": 0.0003943264058206587, "logits/chosen": -20.039648056030273, "logits/rejected": -19.305604934692383, "logps/chosen": -2467.81787109375, "logps/rejected": -2256.7587890625, "loss": 11.4466, "rewards/accuracies": 0.5, "rewards/chosen": -114.63233947753906, "rewards/margins": 2.109182357788086, "rewards/rejected": -116.74153137207031, "step": 31450 }, { "epoch": 1.82, "grad_norm": 94.44711303710938, "learning_rate": 0.00039413289988002634, "logits/chosen": -15.755247116088867, "logits/rejected": -16.10623550415039, "logps/chosen": -2529.14404296875, "logps/rejected": -2492.438720703125, "loss": 5.1358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.57411193847656, "rewards/margins": 0.5055004358291626, "rewards/rejected": -122.0796127319336, "step": 31460 }, { "epoch": 1.82, "grad_norm": 120.09944152832031, "learning_rate": 0.0003939393939393939, "logits/chosen": -13.551249504089355, "logits/rejected": -13.29644775390625, "logps/chosen": -3133.35986328125, "logps/rejected": -2936.3388671875, "loss": 2.058, "rewards/accuracies": 0.5, "rewards/chosen": -76.76753997802734, "rewards/margins": 10.914915084838867, "rewards/rejected": -87.68244934082031, "step": 31470 }, { "epoch": 1.82, "grad_norm": 133.99290466308594, "learning_rate": 0.0003937458879987615, "logits/chosen": -15.96593189239502, "logits/rejected": -16.021604537963867, "logps/chosen": -2941.66455078125, "logps/rejected": -2592.439697265625, "loss": 2.7693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.76034545898438, "rewards/margins": 9.636834144592285, "rewards/rejected": -161.3971710205078, "step": 31480 }, { "epoch": 1.82, "grad_norm": 5.1823146468565895e-11, "learning_rate": 0.0003935523820581292, "logits/chosen": -21.05242156982422, "logits/rejected": -21.4512996673584, "logps/chosen": -2584.52978515625, "logps/rejected": -2517.9052734375, "loss": 12.0691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -220.22695922851562, "rewards/margins": 2.188962459564209, "rewards/rejected": -222.4158935546875, "step": 31490 }, { "epoch": 1.82, "grad_norm": 64.97588348388672, "learning_rate": 0.0003933588761174968, "logits/chosen": -14.503721237182617, "logits/rejected": -14.419870376586914, "logps/chosen": -2729.473876953125, "logps/rejected": -2832.828125, "loss": 6.0803, "rewards/accuracies": 0.5, "rewards/chosen": -116.417236328125, "rewards/margins": 3.2528693675994873, "rewards/rejected": -119.67010498046875, "step": 31500 }, { "epoch": 1.82, "grad_norm": 39.45216751098633, "learning_rate": 0.00039316537017686443, "logits/chosen": -16.20937728881836, "logits/rejected": -16.269786834716797, "logps/chosen": -2413.501953125, "logps/rejected": -2594.367919921875, "loss": 13.0449, "rewards/accuracies": 0.5, "rewards/chosen": -151.5423583984375, "rewards/margins": 8.234182357788086, "rewards/rejected": -159.7765655517578, "step": 31510 }, { "epoch": 1.82, "grad_norm": 270.3787536621094, "learning_rate": 0.00039297186423623205, "logits/chosen": -14.441838264465332, "logits/rejected": -14.288324356079102, "logps/chosen": -2770.112548828125, "logps/rejected": -2727.2744140625, "loss": 10.984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.64833068847656, "rewards/margins": -3.3220818042755127, "rewards/rejected": -145.3262481689453, "step": 31520 }, { "epoch": 1.83, "grad_norm": 0.01707916520535946, "learning_rate": 0.00039277835829559966, "logits/chosen": -18.333126068115234, "logits/rejected": -18.250850677490234, "logps/chosen": -2632.826904296875, "logps/rejected": -2537.79052734375, "loss": 2.0165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.63880920410156, "rewards/margins": 4.245433807373047, "rewards/rejected": -171.8842315673828, "step": 31530 }, { "epoch": 1.83, "grad_norm": 0.05708526447415352, "learning_rate": 0.00039258485235496734, "logits/chosen": -17.322208404541016, "logits/rejected": -17.420185089111328, "logps/chosen": -2858.17431640625, "logps/rejected": -2722.849609375, "loss": 10.0834, "rewards/accuracies": 0.5, "rewards/chosen": -177.09268188476562, "rewards/margins": -4.723980903625488, "rewards/rejected": -172.3686981201172, "step": 31540 }, { "epoch": 1.83, "grad_norm": 0.1547490507364273, "learning_rate": 0.00039239134641433495, "logits/chosen": -14.645007133483887, "logits/rejected": -14.762948989868164, "logps/chosen": -2119.670654296875, "logps/rejected": -2285.372802734375, "loss": 16.6723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.80316162109375, "rewards/margins": -14.542508125305176, "rewards/rejected": -117.2606430053711, "step": 31550 }, { "epoch": 1.83, "grad_norm": 2.790063433621981e-07, "learning_rate": 0.00039219784047370257, "logits/chosen": -14.098745346069336, "logits/rejected": -13.82422924041748, "logps/chosen": -3064.764892578125, "logps/rejected": -2718.68505859375, "loss": 3.5442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.60971069335938, "rewards/margins": 5.704411506652832, "rewards/rejected": -158.31414794921875, "step": 31560 }, { "epoch": 1.83, "grad_norm": 72.61040496826172, "learning_rate": 0.0003920043345330702, "logits/chosen": -16.45578956604004, "logits/rejected": -16.57159996032715, "logps/chosen": -3227.6572265625, "logps/rejected": -2416.91845703125, "loss": 6.3498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -180.57879638671875, "rewards/margins": 8.961477279663086, "rewards/rejected": -189.54025268554688, "step": 31570 }, { "epoch": 1.83, "grad_norm": 1.382309405702578e-19, "learning_rate": 0.00039181082859243775, "logits/chosen": -18.607585906982422, "logits/rejected": -19.03685188293457, "logps/chosen": -2848.29638671875, "logps/rejected": -2614.55615234375, "loss": 3.2199, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.44082641601562, "rewards/margins": 10.913818359375, "rewards/rejected": -165.35462951660156, "step": 31580 }, { "epoch": 1.83, "grad_norm": 58.42751693725586, "learning_rate": 0.0003916173226518054, "logits/chosen": -14.354368209838867, "logits/rejected": -14.230751037597656, "logps/chosen": -2913.34912109375, "logps/rejected": -2777.328857421875, "loss": 4.3883, "rewards/accuracies": 0.5, "rewards/chosen": -137.68495178222656, "rewards/margins": 4.789580345153809, "rewards/rejected": -142.47451782226562, "step": 31590 }, { "epoch": 1.83, "grad_norm": 4.642325620807242e-06, "learning_rate": 0.00039142381671117304, "logits/chosen": -17.957304000854492, "logits/rejected": -17.743656158447266, "logps/chosen": -2600.584716796875, "logps/rejected": -2634.065673828125, "loss": 7.0318, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -196.94729614257812, "rewards/margins": -2.7178311347961426, "rewards/rejected": -194.22946166992188, "step": 31600 }, { "epoch": 1.83, "grad_norm": 0.07857540249824524, "learning_rate": 0.00039123031077054066, "logits/chosen": -17.74770736694336, "logits/rejected": -17.678442001342773, "logps/chosen": -2920.512939453125, "logps/rejected": -2986.86962890625, "loss": 3.7343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.822021484375, "rewards/margins": 7.192719459533691, "rewards/rejected": -220.0147705078125, "step": 31610 }, { "epoch": 1.83, "grad_norm": 38.85293960571289, "learning_rate": 0.00039103680482990827, "logits/chosen": -18.40892791748047, "logits/rejected": -18.731945037841797, "logps/chosen": -2554.476318359375, "logps/rejected": -2016.604736328125, "loss": 21.6749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.98757934570312, "rewards/margins": -7.723989963531494, "rewards/rejected": -177.26358032226562, "step": 31620 }, { "epoch": 1.83, "grad_norm": 46.11820983886719, "learning_rate": 0.0003908432988892759, "logits/chosen": -16.53096580505371, "logits/rejected": -16.642667770385742, "logps/chosen": -2855.076904296875, "logps/rejected": -2689.103271484375, "loss": 4.2214, "rewards/accuracies": 0.5, "rewards/chosen": -112.24180603027344, "rewards/margins": 6.479429721832275, "rewards/rejected": -118.72123718261719, "step": 31630 }, { "epoch": 1.83, "grad_norm": 108.6362533569336, "learning_rate": 0.00039064979294864356, "logits/chosen": -16.97770118713379, "logits/rejected": -17.479686737060547, "logps/chosen": -2883.29052734375, "logps/rejected": -2869.205810546875, "loss": 6.6278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.66720581054688, "rewards/margins": 3.220046281814575, "rewards/rejected": -181.88723754882812, "step": 31640 }, { "epoch": 1.83, "grad_norm": 1.4929785265849205e-06, "learning_rate": 0.0003904562870080112, "logits/chosen": -17.848560333251953, "logits/rejected": -17.662385940551758, "logps/chosen": -2481.1611328125, "logps/rejected": -2695.531982421875, "loss": 1.0649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.19937133789062, "rewards/margins": 25.906753540039062, "rewards/rejected": -172.1061248779297, "step": 31650 }, { "epoch": 1.83, "grad_norm": 0.0, "learning_rate": 0.0003902627810673788, "logits/chosen": -18.152057647705078, "logits/rejected": -17.43804931640625, "logps/chosen": -2880.797119140625, "logps/rejected": -2355.60498046875, "loss": 24.1975, "rewards/accuracies": 0.5, "rewards/chosen": -138.86634826660156, "rewards/margins": -9.709550857543945, "rewards/rejected": -129.15679931640625, "step": 31660 }, { "epoch": 1.83, "grad_norm": 141.14462280273438, "learning_rate": 0.0003900692751267464, "logits/chosen": -18.03384780883789, "logits/rejected": -17.842845916748047, "logps/chosen": -2621.70556640625, "logps/rejected": -2526.61767578125, "loss": 7.6916, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -142.83998107910156, "rewards/margins": -4.019312858581543, "rewards/rejected": -138.82066345214844, "step": 31670 }, { "epoch": 1.83, "grad_norm": 9.614197731018066, "learning_rate": 0.00038987576918611403, "logits/chosen": -17.89614486694336, "logits/rejected": -17.937320709228516, "logps/chosen": -2763.41015625, "logps/rejected": -2282.964111328125, "loss": 1.9515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -141.94149780273438, "rewards/margins": 6.255977153778076, "rewards/rejected": -148.19747924804688, "step": 31680 }, { "epoch": 1.83, "grad_norm": 94.62449645996094, "learning_rate": 0.00038968226324548165, "logits/chosen": -20.334171295166016, "logits/rejected": -20.384113311767578, "logps/chosen": -2398.23828125, "logps/rejected": -2472.65673828125, "loss": 1.2504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -208.1097412109375, "rewards/margins": 6.507532596588135, "rewards/rejected": -214.6172637939453, "step": 31690 }, { "epoch": 1.83, "grad_norm": 0.00039240263868123293, "learning_rate": 0.00038948875730484926, "logits/chosen": -14.051546096801758, "logits/rejected": -13.793298721313477, "logps/chosen": -2728.33544921875, "logps/rejected": -2795.772705078125, "loss": 4.1189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.02577209472656, "rewards/margins": 6.5540666580200195, "rewards/rejected": -109.57981872558594, "step": 31700 }, { "epoch": 1.84, "grad_norm": 178.4314727783203, "learning_rate": 0.0003892952513642169, "logits/chosen": -18.960376739501953, "logits/rejected": -18.694746017456055, "logps/chosen": -2929.94384765625, "logps/rejected": -2545.63623046875, "loss": 9.7054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -188.8591766357422, "rewards/margins": -0.8667755126953125, "rewards/rejected": -187.99240112304688, "step": 31710 }, { "epoch": 1.84, "grad_norm": 0.012998099438846111, "learning_rate": 0.0003891017454235845, "logits/chosen": -18.138423919677734, "logits/rejected": -18.680936813354492, "logps/chosen": -2612.814453125, "logps/rejected": -2801.12939453125, "loss": 11.4675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.89736938476562, "rewards/margins": -0.7921112179756165, "rewards/rejected": -170.1052703857422, "step": 31720 }, { "epoch": 1.84, "grad_norm": 115.11833190917969, "learning_rate": 0.0003889082394829521, "logits/chosen": -21.975927352905273, "logits/rejected": -21.79226303100586, "logps/chosen": -2548.932373046875, "logps/rejected": -2231.977783203125, "loss": 23.2157, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -221.24868774414062, "rewards/margins": -20.693206787109375, "rewards/rejected": -200.5554656982422, "step": 31730 }, { "epoch": 1.84, "grad_norm": 74.16426849365234, "learning_rate": 0.00038871473354231973, "logits/chosen": -20.621828079223633, "logits/rejected": -20.986698150634766, "logps/chosen": -2504.430908203125, "logps/rejected": -2501.93359375, "loss": 20.4159, "rewards/accuracies": 0.5, "rewards/chosen": -203.63671875, "rewards/margins": -15.809593200683594, "rewards/rejected": -187.82711791992188, "step": 31740 }, { "epoch": 1.84, "grad_norm": 0.0, "learning_rate": 0.0003885212276016874, "logits/chosen": -16.95230484008789, "logits/rejected": -16.75661849975586, "logps/chosen": -2794.278564453125, "logps/rejected": -2404.66015625, "loss": 0.886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.60062408447266, "rewards/margins": 15.629000663757324, "rewards/rejected": -141.22962951660156, "step": 31750 }, { "epoch": 1.84, "grad_norm": 73.54195404052734, "learning_rate": 0.000388327721661055, "logits/chosen": -17.993139266967773, "logits/rejected": -17.877687454223633, "logps/chosen": -2542.362060546875, "logps/rejected": -2436.142578125, "loss": 11.6699, "rewards/accuracies": 0.5, "rewards/chosen": -157.24398803710938, "rewards/margins": -6.049246311187744, "rewards/rejected": -151.19471740722656, "step": 31760 }, { "epoch": 1.84, "grad_norm": 2.6208698749542236, "learning_rate": 0.00038813421572042264, "logits/chosen": -14.952020645141602, "logits/rejected": -14.984804153442383, "logps/chosen": -3054.326416015625, "logps/rejected": -3029.487060546875, "loss": 6.1785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.54759216308594, "rewards/margins": 0.5114504098892212, "rewards/rejected": -155.05905151367188, "step": 31770 }, { "epoch": 1.84, "grad_norm": 1.3888093651066255e-19, "learning_rate": 0.00038794070977979026, "logits/chosen": -14.653802871704102, "logits/rejected": -14.493173599243164, "logps/chosen": -2888.36328125, "logps/rejected": -2562.1689453125, "loss": 8.2133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -102.22761535644531, "rewards/margins": 6.260080337524414, "rewards/rejected": -108.48768615722656, "step": 31780 }, { "epoch": 1.84, "grad_norm": 1.3434182619675994e-06, "learning_rate": 0.0003877472038391579, "logits/chosen": -16.330196380615234, "logits/rejected": -15.997465133666992, "logps/chosen": -2415.47216796875, "logps/rejected": -2417.37451171875, "loss": 1.2913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.55526733398438, "rewards/margins": 14.498451232910156, "rewards/rejected": -187.0537109375, "step": 31790 }, { "epoch": 1.84, "grad_norm": 157.7950439453125, "learning_rate": 0.0003875536978985255, "logits/chosen": -13.558298110961914, "logits/rejected": -12.833338737487793, "logps/chosen": -3041.88037109375, "logps/rejected": -2852.25244140625, "loss": 10.2829, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -127.18904113769531, "rewards/margins": -3.508941650390625, "rewards/rejected": -123.68009185791016, "step": 31800 }, { "epoch": 1.84, "grad_norm": 1.850509491734681e-15, "learning_rate": 0.0003873601919578931, "logits/chosen": -16.747478485107422, "logits/rejected": -16.619876861572266, "logps/chosen": -2900.46728515625, "logps/rejected": -2749.151123046875, "loss": 1.1357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -151.77102661132812, "rewards/margins": 15.234428405761719, "rewards/rejected": -167.0054473876953, "step": 31810 }, { "epoch": 1.84, "grad_norm": 30.410438537597656, "learning_rate": 0.0003871666860172607, "logits/chosen": -15.12329387664795, "logits/rejected": -15.242057800292969, "logps/chosen": -2250.3779296875, "logps/rejected": -2617.5419921875, "loss": 9.2781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -167.52734375, "rewards/margins": -5.697525978088379, "rewards/rejected": -161.829833984375, "step": 31820 }, { "epoch": 1.84, "grad_norm": 4.113471716027384e-11, "learning_rate": 0.00038697318007662834, "logits/chosen": -13.887290954589844, "logits/rejected": -14.126029968261719, "logps/chosen": -2609.78466796875, "logps/rejected": -2607.912841796875, "loss": 5.7677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -117.43955993652344, "rewards/margins": 3.167282819747925, "rewards/rejected": -120.60684967041016, "step": 31830 }, { "epoch": 1.84, "grad_norm": 11.010801315307617, "learning_rate": 0.00038677967413599596, "logits/chosen": -14.670123100280762, "logits/rejected": -15.438308715820312, "logps/chosen": -2782.522216796875, "logps/rejected": -2707.260986328125, "loss": 6.7961, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -161.43402099609375, "rewards/margins": 5.3158721923828125, "rewards/rejected": -166.74990844726562, "step": 31840 }, { "epoch": 1.84, "grad_norm": 1.553827067013458e-09, "learning_rate": 0.00038658616819536363, "logits/chosen": -15.850442886352539, "logits/rejected": -15.808004379272461, "logps/chosen": -2677.48779296875, "logps/rejected": -2410.181396484375, "loss": 2.1041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.53732299804688, "rewards/margins": 6.2448015213012695, "rewards/rejected": -166.78211975097656, "step": 31850 }, { "epoch": 1.84, "grad_norm": 2.884628571165476e-13, "learning_rate": 0.00038639266225473125, "logits/chosen": -14.914545059204102, "logits/rejected": -15.236987113952637, "logps/chosen": -3010.196044921875, "logps/rejected": -2642.41796875, "loss": 9.5567, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -159.17630004882812, "rewards/margins": -0.44321250915527344, "rewards/rejected": -158.73306274414062, "step": 31860 }, { "epoch": 1.84, "grad_norm": 143.49041748046875, "learning_rate": 0.00038619915631409887, "logits/chosen": -16.13609504699707, "logits/rejected": -16.30010414123535, "logps/chosen": -2672.087890625, "logps/rejected": -2371.948486328125, "loss": 4.1407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.97525024414062, "rewards/margins": 7.711872100830078, "rewards/rejected": -197.6871337890625, "step": 31870 }, { "epoch": 1.85, "grad_norm": 8.831369473607034e-14, "learning_rate": 0.0003860056503734665, "logits/chosen": -15.24852180480957, "logits/rejected": -14.625322341918945, "logps/chosen": -2875.18603515625, "logps/rejected": -2719.672607421875, "loss": 6.7108, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -180.04115295410156, "rewards/margins": 10.385272979736328, "rewards/rejected": -190.4264373779297, "step": 31880 }, { "epoch": 1.85, "grad_norm": 18.707462310791016, "learning_rate": 0.0003858121444328341, "logits/chosen": -17.534202575683594, "logits/rejected": -16.779541015625, "logps/chosen": -2885.138671875, "logps/rejected": -2771.66259765625, "loss": 1.1768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -174.71743774414062, "rewards/margins": 3.3587429523468018, "rewards/rejected": -178.076171875, "step": 31890 }, { "epoch": 1.85, "grad_norm": 53.45650100708008, "learning_rate": 0.00038561863849220177, "logits/chosen": -16.842443466186523, "logits/rejected": -16.702024459838867, "logps/chosen": -2725.9931640625, "logps/rejected": -2631.86767578125, "loss": 3.5571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.86660766601562, "rewards/margins": 16.689294815063477, "rewards/rejected": -182.555908203125, "step": 31900 }, { "epoch": 1.85, "grad_norm": 1.9350433349609375, "learning_rate": 0.00038542513255156933, "logits/chosen": -15.448732376098633, "logits/rejected": -14.34271240234375, "logps/chosen": -2988.53662109375, "logps/rejected": -2295.0595703125, "loss": 11.9648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -124.6841812133789, "rewards/margins": 3.5342178344726562, "rewards/rejected": -128.21841430664062, "step": 31910 }, { "epoch": 1.85, "grad_norm": 308.57958984375, "learning_rate": 0.00038523162661093695, "logits/chosen": -17.602399826049805, "logits/rejected": -17.46283531188965, "logps/chosen": -2771.509765625, "logps/rejected": -2497.15185546875, "loss": 14.1614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -177.27700805664062, "rewards/margins": -5.499289035797119, "rewards/rejected": -171.77772521972656, "step": 31920 }, { "epoch": 1.85, "grad_norm": 5.079760398984945e-07, "learning_rate": 0.00038503812067030457, "logits/chosen": -16.254093170166016, "logits/rejected": -16.297380447387695, "logps/chosen": -3048.639404296875, "logps/rejected": -2851.928466796875, "loss": 0.2722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.03486633300781, "rewards/margins": 18.453121185302734, "rewards/rejected": -137.48797607421875, "step": 31930 }, { "epoch": 1.85, "grad_norm": 4.428882149909441e-09, "learning_rate": 0.0003848446147296722, "logits/chosen": -17.641414642333984, "logits/rejected": -17.188037872314453, "logps/chosen": -2764.800048828125, "logps/rejected": -2764.143310546875, "loss": 6.8013, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -236.75537109375, "rewards/margins": 2.059124708175659, "rewards/rejected": -238.8144989013672, "step": 31940 }, { "epoch": 1.85, "grad_norm": 80.74352264404297, "learning_rate": 0.0003846511087890398, "logits/chosen": -18.59549331665039, "logits/rejected": -20.209033966064453, "logps/chosen": -2684.601318359375, "logps/rejected": -2442.028564453125, "loss": 8.0689, "rewards/accuracies": 0.5, "rewards/chosen": -165.08348083496094, "rewards/margins": -1.0033092498779297, "rewards/rejected": -164.08018493652344, "step": 31950 }, { "epoch": 1.85, "grad_norm": 137.257568359375, "learning_rate": 0.0003844576028484075, "logits/chosen": -16.461013793945312, "logits/rejected": -15.9320650100708, "logps/chosen": -2747.17138671875, "logps/rejected": -2638.734375, "loss": 8.6962, "rewards/accuracies": 0.5, "rewards/chosen": -166.35671997070312, "rewards/margins": -0.8365761041641235, "rewards/rejected": -165.5201416015625, "step": 31960 }, { "epoch": 1.85, "grad_norm": 70.82767486572266, "learning_rate": 0.0003842640969077751, "logits/chosen": -16.621051788330078, "logits/rejected": -16.575010299682617, "logps/chosen": -2425.160888671875, "logps/rejected": -2482.129150390625, "loss": 2.1209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -166.98257446289062, "rewards/margins": 18.484060287475586, "rewards/rejected": -185.46664428710938, "step": 31970 }, { "epoch": 1.85, "grad_norm": 62.074432373046875, "learning_rate": 0.0003840705909671427, "logits/chosen": -20.03301429748535, "logits/rejected": -20.313533782958984, "logps/chosen": -2490.670654296875, "logps/rejected": -2386.84326171875, "loss": 8.995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.9356231689453, "rewards/margins": -2.6304473876953125, "rewards/rejected": -215.30517578125, "step": 31980 }, { "epoch": 1.85, "grad_norm": 89.16320037841797, "learning_rate": 0.0003838770850265103, "logits/chosen": -18.87446403503418, "logits/rejected": -18.09283447265625, "logps/chosen": -2187.09765625, "logps/rejected": -2230.35888671875, "loss": 16.9504, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.1907958984375, "rewards/margins": 2.138888120651245, "rewards/rejected": -160.32968139648438, "step": 31990 }, { "epoch": 1.85, "grad_norm": 66.01690673828125, "learning_rate": 0.00038368357908587794, "logits/chosen": -16.10749626159668, "logits/rejected": -15.605761528015137, "logps/chosen": -2603.306884765625, "logps/rejected": -2615.0869140625, "loss": 4.1682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.8662109375, "rewards/margins": 7.545478820800781, "rewards/rejected": -135.41171264648438, "step": 32000 }, { "epoch": 1.85, "grad_norm": 38.119049072265625, "learning_rate": 0.0003834900731452456, "logits/chosen": -18.351205825805664, "logits/rejected": -18.159610748291016, "logps/chosen": -2911.11279296875, "logps/rejected": -2864.860107421875, "loss": 5.8636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -152.80091857910156, "rewards/margins": 10.175211906433105, "rewards/rejected": -162.97613525390625, "step": 32010 }, { "epoch": 1.85, "grad_norm": 1.8580840485035313e-19, "learning_rate": 0.0003832965672046132, "logits/chosen": -17.966678619384766, "logits/rejected": -19.105987548828125, "logps/chosen": -2583.57080078125, "logps/rejected": -2541.86474609375, "loss": 20.2251, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -186.00485229492188, "rewards/margins": -8.28175163269043, "rewards/rejected": -177.72312927246094, "step": 32020 }, { "epoch": 1.85, "grad_norm": 6.26402473449707, "learning_rate": 0.0003831030612639808, "logits/chosen": -16.720470428466797, "logits/rejected": -17.005325317382812, "logps/chosen": -2970.68701171875, "logps/rejected": -2602.20654296875, "loss": 3.2277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.42727661132812, "rewards/margins": 8.18597412109375, "rewards/rejected": -150.61325073242188, "step": 32030 }, { "epoch": 1.85, "grad_norm": 1.1680484357512455e-09, "learning_rate": 0.0003829095553233484, "logits/chosen": -17.126455307006836, "logits/rejected": -16.80112075805664, "logps/chosen": -2745.9189453125, "logps/rejected": -2664.184326171875, "loss": 5.6404, "rewards/accuracies": 0.5, "rewards/chosen": -170.6118927001953, "rewards/margins": 2.9768898487091064, "rewards/rejected": -173.58877563476562, "step": 32040 }, { "epoch": 1.86, "grad_norm": 4.638475825657906e-09, "learning_rate": 0.00038271604938271603, "logits/chosen": -17.685619354248047, "logits/rejected": -17.602323532104492, "logps/chosen": -2663.97998046875, "logps/rejected": -2419.01220703125, "loss": 15.9501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.9380645751953, "rewards/margins": -2.517831325531006, "rewards/rejected": -164.42022705078125, "step": 32050 }, { "epoch": 1.86, "grad_norm": 128.44253540039062, "learning_rate": 0.0003825225434420837, "logits/chosen": -18.608722686767578, "logits/rejected": -19.376222610473633, "logps/chosen": -2879.235595703125, "logps/rejected": -2675.56005859375, "loss": 16.5482, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -182.17027282714844, "rewards/margins": -11.962559700012207, "rewards/rejected": -170.2077178955078, "step": 32060 }, { "epoch": 1.86, "grad_norm": 1.674932698030302e-21, "learning_rate": 0.0003823290375014513, "logits/chosen": -18.662778854370117, "logits/rejected": -18.0062313079834, "logps/chosen": -3171.636474609375, "logps/rejected": -2922.78271484375, "loss": 9.2557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -232.23171997070312, "rewards/margins": 1.0586563348770142, "rewards/rejected": -233.29037475585938, "step": 32070 }, { "epoch": 1.86, "grad_norm": 0.06637870520353317, "learning_rate": 0.00038213553156081893, "logits/chosen": -16.255390167236328, "logits/rejected": -17.035184860229492, "logps/chosen": -3031.685302734375, "logps/rejected": -2947.95458984375, "loss": 1.4348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.2415313720703, "rewards/margins": 7.970010280609131, "rewards/rejected": -168.2115478515625, "step": 32080 }, { "epoch": 1.86, "grad_norm": 2.8489291707956e-06, "learning_rate": 0.00038194202562018655, "logits/chosen": -18.827600479125977, "logits/rejected": -18.591724395751953, "logps/chosen": -2859.118896484375, "logps/rejected": -2937.62939453125, "loss": 2.5348, "rewards/accuracies": 0.5, "rewards/chosen": -150.93185424804688, "rewards/margins": 6.524823188781738, "rewards/rejected": -157.45668029785156, "step": 32090 }, { "epoch": 1.86, "grad_norm": 77.46934509277344, "learning_rate": 0.00038174851967955417, "logits/chosen": -15.826502799987793, "logits/rejected": -15.464834213256836, "logps/chosen": -2916.557861328125, "logps/rejected": -2499.930419921875, "loss": 6.8279, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -182.297119140625, "rewards/margins": 8.368096351623535, "rewards/rejected": -190.6652069091797, "step": 32100 }, { "epoch": 1.86, "grad_norm": 0.0, "learning_rate": 0.00038155501373892184, "logits/chosen": -15.347981452941895, "logits/rejected": -14.995630264282227, "logps/chosen": -2536.744384765625, "logps/rejected": -1920.4925537109375, "loss": 14.7383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -101.3334732055664, "rewards/margins": 9.367688179016113, "rewards/rejected": -110.701171875, "step": 32110 }, { "epoch": 1.86, "grad_norm": 0.00036220980109646916, "learning_rate": 0.00038136150779828946, "logits/chosen": -15.15025520324707, "logits/rejected": -15.730682373046875, "logps/chosen": -3022.44384765625, "logps/rejected": -2638.2333984375, "loss": 4.1732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.87362670898438, "rewards/margins": 11.957618713378906, "rewards/rejected": -155.8312530517578, "step": 32120 }, { "epoch": 1.86, "grad_norm": 90.42298889160156, "learning_rate": 0.000381168001857657, "logits/chosen": -15.344096183776855, "logits/rejected": -15.441637992858887, "logps/chosen": -2467.94091796875, "logps/rejected": -2286.092529296875, "loss": 1.1453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.01072692871094, "rewards/margins": 14.314857482910156, "rewards/rejected": -186.32559204101562, "step": 32130 }, { "epoch": 1.86, "grad_norm": 22.119213104248047, "learning_rate": 0.00038097449591702464, "logits/chosen": -17.789525985717773, "logits/rejected": -17.712926864624023, "logps/chosen": -2813.03369140625, "logps/rejected": -2707.89013671875, "loss": 2.3491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.1686248779297, "rewards/margins": 10.945055961608887, "rewards/rejected": -218.11367797851562, "step": 32140 }, { "epoch": 1.86, "grad_norm": 9.525044788460946e-07, "learning_rate": 0.00038078098997639225, "logits/chosen": -14.3070707321167, "logits/rejected": -13.820857048034668, "logps/chosen": -2236.634765625, "logps/rejected": -2368.17333984375, "loss": 5.5217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -98.22500610351562, "rewards/margins": 5.65418815612793, "rewards/rejected": -103.87919616699219, "step": 32150 }, { "epoch": 1.86, "grad_norm": 5.089316466051882e-16, "learning_rate": 0.00038058748403575987, "logits/chosen": -13.971647262573242, "logits/rejected": -13.83916187286377, "logps/chosen": -2559.98828125, "logps/rejected": -2390.81494140625, "loss": 3.2803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.17051696777344, "rewards/margins": 9.757759094238281, "rewards/rejected": -172.9282684326172, "step": 32160 }, { "epoch": 1.86, "grad_norm": 72.90176391601562, "learning_rate": 0.00038039397809512754, "logits/chosen": -12.058627128601074, "logits/rejected": -11.658223152160645, "logps/chosen": -2689.674560546875, "logps/rejected": -2601.43603515625, "loss": 5.2131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -60.67113494873047, "rewards/margins": 7.238592624664307, "rewards/rejected": -67.90973663330078, "step": 32170 }, { "epoch": 1.86, "grad_norm": 47.96951675415039, "learning_rate": 0.00038020047215449516, "logits/chosen": -15.128445625305176, "logits/rejected": -15.183977127075195, "logps/chosen": -3153.83056640625, "logps/rejected": -3102.247802734375, "loss": 9.6843, "rewards/accuracies": 0.5, "rewards/chosen": -200.11660766601562, "rewards/margins": -3.0118629932403564, "rewards/rejected": -197.10472106933594, "step": 32180 }, { "epoch": 1.86, "grad_norm": 91.54042053222656, "learning_rate": 0.0003800069662138628, "logits/chosen": -13.997645378112793, "logits/rejected": -13.582906723022461, "logps/chosen": -3013.06982421875, "logps/rejected": -3032.622802734375, "loss": 3.1348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -120.0965576171875, "rewards/margins": 1.90842604637146, "rewards/rejected": -122.0049819946289, "step": 32190 }, { "epoch": 1.86, "grad_norm": 0.4188946485519409, "learning_rate": 0.0003798134602732304, "logits/chosen": -17.7796573638916, "logits/rejected": -17.91379165649414, "logps/chosen": -2885.22314453125, "logps/rejected": -2671.154296875, "loss": 2.116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.23934936523438, "rewards/margins": 5.863409519195557, "rewards/rejected": -206.102783203125, "step": 32200 }, { "epoch": 1.86, "grad_norm": 6.86294887586314e-09, "learning_rate": 0.000379619954332598, "logits/chosen": -16.92047691345215, "logits/rejected": -16.36307716369629, "logps/chosen": -2708.448486328125, "logps/rejected": -2676.41845703125, "loss": 4.7939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -186.5532989501953, "rewards/margins": 3.047208070755005, "rewards/rejected": -189.6005096435547, "step": 32210 }, { "epoch": 1.87, "grad_norm": 0.0006557157030329108, "learning_rate": 0.0003794264483919657, "logits/chosen": -14.816999435424805, "logits/rejected": -14.367715835571289, "logps/chosen": -2699.388916015625, "logps/rejected": -2861.033935546875, "loss": 1.4941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -150.836181640625, "rewards/margins": 20.614442825317383, "rewards/rejected": -171.4506072998047, "step": 32220 }, { "epoch": 1.87, "grad_norm": 99.17916870117188, "learning_rate": 0.0003792329424513333, "logits/chosen": -17.779937744140625, "logits/rejected": -17.346769332885742, "logps/chosen": -2152.75244140625, "logps/rejected": -2254.013427734375, "loss": 21.4077, "rewards/accuracies": 0.5, "rewards/chosen": -177.3564453125, "rewards/margins": 5.03481912612915, "rewards/rejected": -182.3912353515625, "step": 32230 }, { "epoch": 1.87, "grad_norm": 109.4664535522461, "learning_rate": 0.00037903943651070086, "logits/chosen": -14.516253471374512, "logits/rejected": -15.076807022094727, "logps/chosen": -2957.169921875, "logps/rejected": -2758.025634765625, "loss": 15.8806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -164.19058227539062, "rewards/margins": -9.342107772827148, "rewards/rejected": -154.84848022460938, "step": 32240 }, { "epoch": 1.87, "grad_norm": 2.2202608585357666, "learning_rate": 0.0003788459305700685, "logits/chosen": -15.728601455688477, "logits/rejected": -15.223260879516602, "logps/chosen": -2663.381591796875, "logps/rejected": -2659.03759765625, "loss": 1.5508, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -125.36033630371094, "rewards/margins": 9.687491416931152, "rewards/rejected": -135.04783630371094, "step": 32250 }, { "epoch": 1.87, "grad_norm": 28.037805557250977, "learning_rate": 0.0003786524246294361, "logits/chosen": -13.005290031433105, "logits/rejected": -13.10535717010498, "logps/chosen": -3014.490234375, "logps/rejected": -2834.75537109375, "loss": 5.376, "rewards/accuracies": 0.5, "rewards/chosen": -104.99378967285156, "rewards/margins": 4.8495774269104, "rewards/rejected": -109.84336853027344, "step": 32260 }, { "epoch": 1.87, "grad_norm": 6.430092054513295e-14, "learning_rate": 0.00037845891868880377, "logits/chosen": -13.790109634399414, "logits/rejected": -13.689648628234863, "logps/chosen": -2970.2119140625, "logps/rejected": -2924.37451171875, "loss": 6.3702, "rewards/accuracies": 0.5, "rewards/chosen": -141.8578338623047, "rewards/margins": 4.104060173034668, "rewards/rejected": -145.96188354492188, "step": 32270 }, { "epoch": 1.87, "grad_norm": 24.248544692993164, "learning_rate": 0.0003782654127481714, "logits/chosen": -12.487526893615723, "logits/rejected": -12.182621955871582, "logps/chosen": -3222.501220703125, "logps/rejected": -2952.836669921875, "loss": 9.5871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -104.69325256347656, "rewards/margins": -1.8818098306655884, "rewards/rejected": -102.8114242553711, "step": 32280 }, { "epoch": 1.87, "grad_norm": 4.569322300085332e-06, "learning_rate": 0.000378071906807539, "logits/chosen": -12.74087142944336, "logits/rejected": -12.355169296264648, "logps/chosen": -2578.39453125, "logps/rejected": -2559.417724609375, "loss": 2.9144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -95.52236938476562, "rewards/margins": 14.985026359558105, "rewards/rejected": -110.50740051269531, "step": 32290 }, { "epoch": 1.87, "grad_norm": 45.1838264465332, "learning_rate": 0.0003778784008669066, "logits/chosen": -13.444482803344727, "logits/rejected": -13.28441047668457, "logps/chosen": -2726.21728515625, "logps/rejected": -2770.59326171875, "loss": 4.9706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.62722778320312, "rewards/margins": 13.1404390335083, "rewards/rejected": -143.76768493652344, "step": 32300 }, { "epoch": 1.87, "grad_norm": 43.20091247558594, "learning_rate": 0.00037768489492627424, "logits/chosen": -17.94447135925293, "logits/rejected": -17.636648178100586, "logps/chosen": -2562.7705078125, "logps/rejected": -2546.3046875, "loss": 4.3125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -235.9164276123047, "rewards/margins": -0.8328613042831421, "rewards/rejected": -235.0835723876953, "step": 32310 }, { "epoch": 1.87, "grad_norm": 96.59170532226562, "learning_rate": 0.0003774913889856419, "logits/chosen": -14.078794479370117, "logits/rejected": -13.53014850616455, "logps/chosen": -2890.58740234375, "logps/rejected": -2882.141357421875, "loss": 2.4885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -208.71298217773438, "rewards/margins": 12.69869613647461, "rewards/rejected": -221.4116973876953, "step": 32320 }, { "epoch": 1.87, "grad_norm": 8.339541464350407e-18, "learning_rate": 0.0003772978830450095, "logits/chosen": -14.795793533325195, "logits/rejected": -14.917877197265625, "logps/chosen": -2919.811279296875, "logps/rejected": -2812.29052734375, "loss": 7.3288, "rewards/accuracies": 0.5, "rewards/chosen": -179.51339721679688, "rewards/margins": 5.641371726989746, "rewards/rejected": -185.15478515625, "step": 32330 }, { "epoch": 1.87, "grad_norm": 9.148231328026668e-08, "learning_rate": 0.0003771043771043771, "logits/chosen": -12.334084510803223, "logits/rejected": -11.992825508117676, "logps/chosen": -3041.54345703125, "logps/rejected": -2264.823974609375, "loss": 2.2771, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -88.37641143798828, "rewards/margins": 32.27955627441406, "rewards/rejected": -120.65596771240234, "step": 32340 }, { "epoch": 1.87, "grad_norm": 102.20218658447266, "learning_rate": 0.0003769108711637447, "logits/chosen": -16.41856575012207, "logits/rejected": -16.79281997680664, "logps/chosen": -2513.243408203125, "logps/rejected": -2375.693359375, "loss": 11.9552, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -222.2848358154297, "rewards/margins": -8.758627891540527, "rewards/rejected": -213.5261993408203, "step": 32350 }, { "epoch": 1.87, "grad_norm": 0.25409021973609924, "learning_rate": 0.0003767173652231123, "logits/chosen": -12.491747856140137, "logits/rejected": -12.406578063964844, "logps/chosen": -2837.017822265625, "logps/rejected": -2848.358642578125, "loss": 1.7174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.8808822631836, "rewards/margins": 8.84046745300293, "rewards/rejected": -127.7213363647461, "step": 32360 }, { "epoch": 1.87, "grad_norm": 2.345873554077116e-06, "learning_rate": 0.00037652385928247994, "logits/chosen": -12.824483871459961, "logits/rejected": -12.672323226928711, "logps/chosen": -2799.356201171875, "logps/rejected": -2818.246337890625, "loss": 3.729, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -113.25433349609375, "rewards/margins": -0.17992344498634338, "rewards/rejected": -113.07441711425781, "step": 32370 }, { "epoch": 1.87, "grad_norm": 61.98116683959961, "learning_rate": 0.0003763303533418476, "logits/chosen": -12.218160629272461, "logits/rejected": -11.163629531860352, "logps/chosen": -3090.98291015625, "logps/rejected": -2796.754638671875, "loss": 5.4389, "rewards/accuracies": 0.5, "rewards/chosen": -159.23362731933594, "rewards/margins": 1.6347240209579468, "rewards/rejected": -160.86834716796875, "step": 32380 }, { "epoch": 1.87, "grad_norm": 0.0002527947071939707, "learning_rate": 0.00037613684740121523, "logits/chosen": -10.616438865661621, "logits/rejected": -10.42082405090332, "logps/chosen": -2833.468994140625, "logps/rejected": -2677.2099609375, "loss": 4.5126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -85.86682891845703, "rewards/margins": 10.864656448364258, "rewards/rejected": -96.73149108886719, "step": 32390 }, { "epoch": 1.88, "grad_norm": 0.0008054719655774534, "learning_rate": 0.00037594334146058285, "logits/chosen": -12.815617561340332, "logits/rejected": -13.214042663574219, "logps/chosen": -2772.303466796875, "logps/rejected": -2733.422119140625, "loss": 7.1907, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -169.283203125, "rewards/margins": -3.6289806365966797, "rewards/rejected": -165.65423583984375, "step": 32400 }, { "epoch": 1.88, "grad_norm": 8.561609199375653e-09, "learning_rate": 0.00037574983551995046, "logits/chosen": -9.560081481933594, "logits/rejected": -9.722249031066895, "logps/chosen": -3105.94091796875, "logps/rejected": -2866.97705078125, "loss": 1.1672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -106.5106430053711, "rewards/margins": 10.953763008117676, "rewards/rejected": -117.46440124511719, "step": 32410 }, { "epoch": 1.88, "grad_norm": 3.4088247957697604e-07, "learning_rate": 0.0003755563295793181, "logits/chosen": -10.603678703308105, "logits/rejected": -10.39234733581543, "logps/chosen": -2669.03759765625, "logps/rejected": -2493.22607421875, "loss": 1.487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.8798370361328, "rewards/margins": 12.102277755737305, "rewards/rejected": -166.9821014404297, "step": 32420 }, { "epoch": 1.88, "grad_norm": 20.911556243896484, "learning_rate": 0.00037536282363868575, "logits/chosen": -9.244368553161621, "logits/rejected": -9.096868515014648, "logps/chosen": -3086.8359375, "logps/rejected": -3188.17724609375, "loss": 3.7002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.59890747070312, "rewards/margins": 6.7449469566345215, "rewards/rejected": -142.3438720703125, "step": 32430 }, { "epoch": 1.88, "grad_norm": 56.354827880859375, "learning_rate": 0.00037516931769805337, "logits/chosen": -11.349504470825195, "logits/rejected": -11.68166446685791, "logps/chosen": -2813.66845703125, "logps/rejected": -3283.77880859375, "loss": 9.1062, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -192.63250732421875, "rewards/margins": -8.531842231750488, "rewards/rejected": -184.10067749023438, "step": 32440 }, { "epoch": 1.88, "grad_norm": 0.23474542796611786, "learning_rate": 0.00037497581175742093, "logits/chosen": -10.33344841003418, "logits/rejected": -10.274945259094238, "logps/chosen": -2739.376708984375, "logps/rejected": -2071.342529296875, "loss": 10.266, "rewards/accuracies": 0.5, "rewards/chosen": -173.64793395996094, "rewards/margins": -7.2242584228515625, "rewards/rejected": -166.42367553710938, "step": 32450 }, { "epoch": 1.88, "grad_norm": 2.667538642883301, "learning_rate": 0.00037478230581678855, "logits/chosen": -12.331344604492188, "logits/rejected": -12.059892654418945, "logps/chosen": -2766.100341796875, "logps/rejected": -2647.844970703125, "loss": 14.1086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.32640075683594, "rewards/margins": -8.115461349487305, "rewards/rejected": -158.21095275878906, "step": 32460 }, { "epoch": 1.88, "grad_norm": 157.9792022705078, "learning_rate": 0.00037458879987615617, "logits/chosen": -11.356314659118652, "logits/rejected": -11.549667358398438, "logps/chosen": -2811.330078125, "logps/rejected": -2824.25439453125, "loss": 2.2897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.3084259033203, "rewards/margins": 4.905174255371094, "rewards/rejected": -159.21359252929688, "step": 32470 }, { "epoch": 1.88, "grad_norm": 201.63824462890625, "learning_rate": 0.00037439529393552384, "logits/chosen": -12.776172637939453, "logits/rejected": -12.11775016784668, "logps/chosen": -2918.22509765625, "logps/rejected": -2606.951904296875, "loss": 20.833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.83303833007812, "rewards/margins": -6.271035671234131, "rewards/rejected": -139.56198120117188, "step": 32480 }, { "epoch": 1.88, "grad_norm": 0.0, "learning_rate": 0.00037420178799489146, "logits/chosen": -12.609371185302734, "logits/rejected": -12.794210433959961, "logps/chosen": -2403.325439453125, "logps/rejected": -2501.767578125, "loss": 2.9508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.60911560058594, "rewards/margins": 14.392573356628418, "rewards/rejected": -185.00167846679688, "step": 32490 }, { "epoch": 1.88, "grad_norm": 19.7012996673584, "learning_rate": 0.0003740082820542591, "logits/chosen": -13.468780517578125, "logits/rejected": -13.148872375488281, "logps/chosen": -2733.67041015625, "logps/rejected": -2370.248291015625, "loss": 24.7225, "rewards/accuracies": 0.5, "rewards/chosen": -166.06277465820312, "rewards/margins": -20.838897705078125, "rewards/rejected": -145.22390747070312, "step": 32500 }, { "epoch": 1.88, "grad_norm": 2.8587757938680625e-08, "learning_rate": 0.0003738147761136267, "logits/chosen": -13.537304878234863, "logits/rejected": -13.157365798950195, "logps/chosen": -2875.29638671875, "logps/rejected": -2642.74267578125, "loss": 18.3243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.3645782470703, "rewards/margins": -9.54877758026123, "rewards/rejected": -166.8157958984375, "step": 32510 }, { "epoch": 1.88, "grad_norm": 7.57906436920166, "learning_rate": 0.0003736212701729943, "logits/chosen": -14.45959186553955, "logits/rejected": -14.014585494995117, "logps/chosen": -2646.154296875, "logps/rejected": -2631.92919921875, "loss": 7.753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.8839874267578, "rewards/margins": -2.2795004844665527, "rewards/rejected": -167.6044921875, "step": 32520 }, { "epoch": 1.88, "grad_norm": 1.0043657994174282e-07, "learning_rate": 0.000373427764232362, "logits/chosen": -14.87016773223877, "logits/rejected": -14.661079406738281, "logps/chosen": -2456.26416015625, "logps/rejected": -2545.431884765625, "loss": 0.774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.906982421875, "rewards/margins": 12.76820182800293, "rewards/rejected": -186.6751708984375, "step": 32530 }, { "epoch": 1.88, "grad_norm": 0.0005066645098850131, "learning_rate": 0.0003732342582917296, "logits/chosen": -14.013795852661133, "logits/rejected": -13.873077392578125, "logps/chosen": -2174.525390625, "logps/rejected": -2071.91015625, "loss": 5.4728, "rewards/accuracies": 0.5, "rewards/chosen": -114.44879150390625, "rewards/margins": 1.4937961101531982, "rewards/rejected": -115.94258880615234, "step": 32540 }, { "epoch": 1.88, "grad_norm": 61.05585861206055, "learning_rate": 0.0003730407523510972, "logits/chosen": -16.869884490966797, "logits/rejected": -16.34073829650879, "logps/chosen": -2590.446533203125, "logps/rejected": -2372.47021484375, "loss": 1.0855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.3264923095703, "rewards/margins": 17.43914794921875, "rewards/rejected": -206.765625, "step": 32550 }, { "epoch": 1.88, "grad_norm": 7.27604160427641e-16, "learning_rate": 0.0003728472464104648, "logits/chosen": -11.606401443481445, "logits/rejected": -11.588691711425781, "logps/chosen": -2698.906494140625, "logps/rejected": -2454.02294921875, "loss": 5.5406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -86.05619049072266, "rewards/margins": 4.843178749084473, "rewards/rejected": -90.89936828613281, "step": 32560 }, { "epoch": 1.89, "grad_norm": 9.940270780080361e-18, "learning_rate": 0.0003726537404698324, "logits/chosen": -14.661084175109863, "logits/rejected": -14.710192680358887, "logps/chosen": -2711.6640625, "logps/rejected": -2331.497314453125, "loss": 17.3882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.44668579101562, "rewards/margins": -8.668609619140625, "rewards/rejected": -128.77810668945312, "step": 32570 }, { "epoch": 1.89, "grad_norm": 77.30607604980469, "learning_rate": 0.00037246023452920006, "logits/chosen": -13.825920104980469, "logits/rejected": -14.059326171875, "logps/chosen": -3047.037841796875, "logps/rejected": -2846.95751953125, "loss": 25.4582, "rewards/accuracies": 0.5, "rewards/chosen": -139.21080017089844, "rewards/margins": -11.486961364746094, "rewards/rejected": -127.72383880615234, "step": 32580 }, { "epoch": 1.89, "grad_norm": 1.5397246195050784e-12, "learning_rate": 0.0003722667285885677, "logits/chosen": -14.638772964477539, "logits/rejected": -14.303179740905762, "logps/chosen": -2559.252685546875, "logps/rejected": -2353.79296875, "loss": 2.2033, "rewards/accuracies": 0.5, "rewards/chosen": -154.58169555664062, "rewards/margins": 4.170807361602783, "rewards/rejected": -158.7524871826172, "step": 32590 }, { "epoch": 1.89, "grad_norm": 3.341563683534332e-07, "learning_rate": 0.0003720732226479353, "logits/chosen": -14.72924518585205, "logits/rejected": -14.909149169921875, "logps/chosen": -2892.462890625, "logps/rejected": -2521.5859375, "loss": 3.4179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -72.27635192871094, "rewards/margins": 12.36497688293457, "rewards/rejected": -84.64131927490234, "step": 32600 }, { "epoch": 1.89, "grad_norm": 69.09735107421875, "learning_rate": 0.0003718797167073029, "logits/chosen": -16.528596878051758, "logits/rejected": -16.577186584472656, "logps/chosen": -2542.71923828125, "logps/rejected": -2472.974365234375, "loss": 2.2671, "rewards/accuracies": 0.5, "rewards/chosen": -186.42999267578125, "rewards/margins": 3.1430931091308594, "rewards/rejected": -189.57308959960938, "step": 32610 }, { "epoch": 1.89, "grad_norm": 270.9193420410156, "learning_rate": 0.00037168621076667053, "logits/chosen": -17.506200790405273, "logits/rejected": -17.06772232055664, "logps/chosen": -2658.0302734375, "logps/rejected": -2734.73291015625, "loss": 18.3026, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.1167755126953, "rewards/margins": -13.766054153442383, "rewards/rejected": -177.3507080078125, "step": 32620 }, { "epoch": 1.89, "grad_norm": 2.980931282043457, "learning_rate": 0.00037149270482603815, "logits/chosen": -18.749250411987305, "logits/rejected": -18.893484115600586, "logps/chosen": -2722.197998046875, "logps/rejected": -2661.685302734375, "loss": 1.8885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.09506225585938, "rewards/margins": 20.9746150970459, "rewards/rejected": -193.06967163085938, "step": 32630 }, { "epoch": 1.89, "grad_norm": 3.8501226902008057, "learning_rate": 0.0003712991988854058, "logits/chosen": -14.05189037322998, "logits/rejected": -14.057519912719727, "logps/chosen": -2853.078369140625, "logps/rejected": -2842.4794921875, "loss": 1.8511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.6959991455078, "rewards/margins": 3.6539714336395264, "rewards/rejected": -141.3499755859375, "step": 32640 }, { "epoch": 1.89, "grad_norm": 66.34224700927734, "learning_rate": 0.00037110569294477344, "logits/chosen": -13.662898063659668, "logits/rejected": -13.723655700683594, "logps/chosen": -3080.712646484375, "logps/rejected": -2833.82177734375, "loss": 3.7111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -173.78758239746094, "rewards/margins": 8.831438064575195, "rewards/rejected": -182.61903381347656, "step": 32650 }, { "epoch": 1.89, "grad_norm": 70.29911041259766, "learning_rate": 0.00037091218700414106, "logits/chosen": -14.429855346679688, "logits/rejected": -14.200174331665039, "logps/chosen": -2668.497802734375, "logps/rejected": -2658.65283203125, "loss": 3.6145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.95455932617188, "rewards/margins": 19.582292556762695, "rewards/rejected": -188.53683471679688, "step": 32660 }, { "epoch": 1.89, "grad_norm": 3.842011210508645e-06, "learning_rate": 0.0003707186810635086, "logits/chosen": -16.145523071289062, "logits/rejected": -15.941499710083008, "logps/chosen": -2667.41748046875, "logps/rejected": -2324.31689453125, "loss": 8.9311, "rewards/accuracies": 0.5, "rewards/chosen": -190.30735778808594, "rewards/margins": 11.20069694519043, "rewards/rejected": -201.508056640625, "step": 32670 }, { "epoch": 1.89, "grad_norm": 56.87923049926758, "learning_rate": 0.00037052517512287624, "logits/chosen": -12.660280227661133, "logits/rejected": -12.707429885864258, "logps/chosen": -3065.80029296875, "logps/rejected": -2729.530517578125, "loss": 4.2373, "rewards/accuracies": 0.5, "rewards/chosen": -162.65293884277344, "rewards/margins": 0.14176884293556213, "rewards/rejected": -162.7947235107422, "step": 32680 }, { "epoch": 1.89, "grad_norm": 142.2414093017578, "learning_rate": 0.0003703316691822439, "logits/chosen": -11.343961715698242, "logits/rejected": -11.312207221984863, "logps/chosen": -3026.343994140625, "logps/rejected": -2860.3427734375, "loss": 1.172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -95.18463897705078, "rewards/margins": 17.395587921142578, "rewards/rejected": -112.5802230834961, "step": 32690 }, { "epoch": 1.89, "grad_norm": 0.10623923689126968, "learning_rate": 0.0003701381632416115, "logits/chosen": -11.946527481079102, "logits/rejected": -11.941606521606445, "logps/chosen": -2993.14453125, "logps/rejected": -2948.633056640625, "loss": 0.7212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.95034790039062, "rewards/margins": 7.3680830001831055, "rewards/rejected": -170.3184356689453, "step": 32700 }, { "epoch": 1.89, "grad_norm": 4.472021406345169e-15, "learning_rate": 0.00036994465730097914, "logits/chosen": -14.928202629089355, "logits/rejected": -15.152549743652344, "logps/chosen": -2656.89306640625, "logps/rejected": -2610.95751953125, "loss": 1.2348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -199.5487060546875, "rewards/margins": 17.551366806030273, "rewards/rejected": -217.1000518798828, "step": 32710 }, { "epoch": 1.89, "grad_norm": 71.42363739013672, "learning_rate": 0.00036975115136034676, "logits/chosen": -11.15931510925293, "logits/rejected": -11.006860733032227, "logps/chosen": -2747.042236328125, "logps/rejected": -2863.78271484375, "loss": 7.9529, "rewards/accuracies": 0.5, "rewards/chosen": -167.4406280517578, "rewards/margins": -1.9654762744903564, "rewards/rejected": -165.47515869140625, "step": 32720 }, { "epoch": 1.89, "grad_norm": 1.2363856466413026e-11, "learning_rate": 0.0003695576454197144, "logits/chosen": -10.394250869750977, "logits/rejected": -10.593047142028809, "logps/chosen": -2765.315185546875, "logps/rejected": -2561.19384765625, "loss": 2.1027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -158.0552215576172, "rewards/margins": 8.655954360961914, "rewards/rejected": -166.711181640625, "step": 32730 }, { "epoch": 1.9, "grad_norm": 90.97293853759766, "learning_rate": 0.00036936413947908205, "logits/chosen": -10.583406448364258, "logits/rejected": -10.52259635925293, "logps/chosen": -3053.7109375, "logps/rejected": -2864.00146484375, "loss": 16.4349, "rewards/accuracies": 0.5, "rewards/chosen": -127.787841796875, "rewards/margins": 3.7972946166992188, "rewards/rejected": -131.58514404296875, "step": 32740 }, { "epoch": 1.9, "grad_norm": 11.259980201721191, "learning_rate": 0.00036917063353844967, "logits/chosen": -12.096183776855469, "logits/rejected": -11.561213493347168, "logps/chosen": -2965.492431640625, "logps/rejected": -2265.81005859375, "loss": 12.9125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.3697738647461, "rewards/margins": 3.0970053672790527, "rewards/rejected": -122.46675872802734, "step": 32750 }, { "epoch": 1.9, "grad_norm": 107.9267807006836, "learning_rate": 0.0003689771275978173, "logits/chosen": -15.31177043914795, "logits/rejected": -15.365564346313477, "logps/chosen": -2896.935546875, "logps/rejected": -2729.72216796875, "loss": 20.9559, "rewards/accuracies": 0.5, "rewards/chosen": -185.9174041748047, "rewards/margins": -14.246313095092773, "rewards/rejected": -171.67111206054688, "step": 32760 }, { "epoch": 1.9, "grad_norm": 0.7351678609848022, "learning_rate": 0.0003687836216571849, "logits/chosen": -12.06524658203125, "logits/rejected": -12.137557983398438, "logps/chosen": -2796.483154296875, "logps/rejected": -2657.210693359375, "loss": 3.3072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -128.995849609375, "rewards/margins": 2.4567744731903076, "rewards/rejected": -131.45262145996094, "step": 32770 }, { "epoch": 1.9, "grad_norm": 70.11473846435547, "learning_rate": 0.00036859011571655246, "logits/chosen": -16.389598846435547, "logits/rejected": -16.386608123779297, "logps/chosen": -2820.30517578125, "logps/rejected": -2899.786865234375, "loss": 2.5573, "rewards/accuracies": 0.5, "rewards/chosen": -256.7909851074219, "rewards/margins": 6.739076137542725, "rewards/rejected": -263.53009033203125, "step": 32780 }, { "epoch": 1.9, "grad_norm": 6.681644916534424, "learning_rate": 0.00036839660977592013, "logits/chosen": -16.193918228149414, "logits/rejected": -16.675397872924805, "logps/chosen": -2877.259765625, "logps/rejected": -2830.732421875, "loss": 2.5888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.24583435058594, "rewards/margins": 1.6532466411590576, "rewards/rejected": -182.8990936279297, "step": 32790 }, { "epoch": 1.9, "grad_norm": 0.23455150425434113, "learning_rate": 0.00036820310383528775, "logits/chosen": -15.229443550109863, "logits/rejected": -15.10930347442627, "logps/chosen": -2773.089111328125, "logps/rejected": -2540.51416015625, "loss": 0.7702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -137.69419860839844, "rewards/margins": 8.772655487060547, "rewards/rejected": -146.46685791015625, "step": 32800 }, { "epoch": 1.9, "grad_norm": 6.934147210680154e-17, "learning_rate": 0.00036800959789465537, "logits/chosen": -14.786323547363281, "logits/rejected": -14.607403755187988, "logps/chosen": -2752.427734375, "logps/rejected": -2159.95458984375, "loss": 1.7392, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -108.17622375488281, "rewards/margins": 9.388882637023926, "rewards/rejected": -117.56510925292969, "step": 32810 }, { "epoch": 1.9, "grad_norm": 5.6517383200116456e-05, "learning_rate": 0.000367816091954023, "logits/chosen": -16.28848648071289, "logits/rejected": -16.46076011657715, "logps/chosen": -2529.27099609375, "logps/rejected": -2734.432861328125, "loss": 1.8442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -178.13746643066406, "rewards/margins": 32.41087341308594, "rewards/rejected": -210.54830932617188, "step": 32820 }, { "epoch": 1.9, "grad_norm": 1.314487187068736e-19, "learning_rate": 0.0003676225860133906, "logits/chosen": -14.783950805664062, "logits/rejected": -14.939615249633789, "logps/chosen": -2954.7451171875, "logps/rejected": -2670.1923828125, "loss": 13.4165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -106.3662109375, "rewards/margins": 2.4761836528778076, "rewards/rejected": -108.84239196777344, "step": 32830 }, { "epoch": 1.9, "grad_norm": 0.12706021964550018, "learning_rate": 0.0003674290800727582, "logits/chosen": -14.913871765136719, "logits/rejected": -14.915441513061523, "logps/chosen": -2220.07568359375, "logps/rejected": -2477.849609375, "loss": 2.7302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.24507904052734, "rewards/margins": 24.769969940185547, "rewards/rejected": -144.01504516601562, "step": 32840 }, { "epoch": 1.9, "grad_norm": 0.0018144587520509958, "learning_rate": 0.0003672355741321259, "logits/chosen": -16.98845100402832, "logits/rejected": -17.529069900512695, "logps/chosen": -2355.189697265625, "logps/rejected": -2527.64990234375, "loss": 1.3441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -138.9951934814453, "rewards/margins": 10.030009269714355, "rewards/rejected": -149.0251922607422, "step": 32850 }, { "epoch": 1.9, "grad_norm": 46.275699615478516, "learning_rate": 0.0003670420681914935, "logits/chosen": -12.273508071899414, "logits/rejected": -12.26984691619873, "logps/chosen": -3111.8037109375, "logps/rejected": -2778.166259765625, "loss": 6.126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -63.141136169433594, "rewards/margins": 2.6527369022369385, "rewards/rejected": -65.79386901855469, "step": 32860 }, { "epoch": 1.9, "grad_norm": 0.15355603396892548, "learning_rate": 0.0003668485622508611, "logits/chosen": -11.114779472351074, "logits/rejected": -10.941149711608887, "logps/chosen": -2693.47412109375, "logps/rejected": -3161.53564453125, "loss": 3.5798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -127.53265380859375, "rewards/margins": 0.884168267250061, "rewards/rejected": -128.41683959960938, "step": 32870 }, { "epoch": 1.9, "grad_norm": 4.248568996612079e-10, "learning_rate": 0.00036665505631022874, "logits/chosen": -12.342679023742676, "logits/rejected": -12.353769302368164, "logps/chosen": -3045.58544921875, "logps/rejected": -3100.621826171875, "loss": 5.8886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -193.6728057861328, "rewards/margins": 7.392282009124756, "rewards/rejected": -201.06507873535156, "step": 32880 }, { "epoch": 1.9, "grad_norm": 340.0784912109375, "learning_rate": 0.0003664615503695963, "logits/chosen": -11.530698776245117, "logits/rejected": -11.375688552856445, "logps/chosen": -2812.58056640625, "logps/rejected": -2616.3798828125, "loss": 5.9776, "rewards/accuracies": 0.5, "rewards/chosen": -166.12181091308594, "rewards/margins": 12.589434623718262, "rewards/rejected": -178.71124267578125, "step": 32890 }, { "epoch": 1.9, "grad_norm": 239.54725646972656, "learning_rate": 0.000366268044428964, "logits/chosen": -10.710001945495605, "logits/rejected": -10.421630859375, "logps/chosen": -2723.778076171875, "logps/rejected": -1696.1207275390625, "loss": 23.2463, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -74.03273010253906, "rewards/margins": -1.3891258239746094, "rewards/rejected": -72.64360046386719, "step": 32900 }, { "epoch": 1.9, "grad_norm": 4.4205666682728406e-08, "learning_rate": 0.0003660745384883316, "logits/chosen": -15.612103462219238, "logits/rejected": -15.58942985534668, "logps/chosen": -2504.081787109375, "logps/rejected": -2167.950439453125, "loss": 6.9675, "rewards/accuracies": 0.5, "rewards/chosen": -176.58792114257812, "rewards/margins": 0.42644768953323364, "rewards/rejected": -177.0143585205078, "step": 32910 }, { "epoch": 1.91, "grad_norm": 1.9052088465892902e-12, "learning_rate": 0.0003658810325476992, "logits/chosen": -14.996403694152832, "logits/rejected": -15.155476570129395, "logps/chosen": -2873.32470703125, "logps/rejected": -2057.52001953125, "loss": 14.5192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.45008850097656, "rewards/margins": 1.1631195545196533, "rewards/rejected": -125.61319732666016, "step": 32920 }, { "epoch": 1.91, "grad_norm": 4.300123625233798e-12, "learning_rate": 0.00036568752660706683, "logits/chosen": -12.982312202453613, "logits/rejected": -13.035191535949707, "logps/chosen": -2803.1513671875, "logps/rejected": -2550.00634765625, "loss": 13.9101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -94.28227233886719, "rewards/margins": -1.375862717628479, "rewards/rejected": -92.9063949584961, "step": 32930 }, { "epoch": 1.91, "grad_norm": 81.4319839477539, "learning_rate": 0.00036549402066643445, "logits/chosen": -15.698585510253906, "logits/rejected": -15.392919540405273, "logps/chosen": -2484.3662109375, "logps/rejected": -2230.46533203125, "loss": 7.2453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.92794799804688, "rewards/margins": 4.064084529876709, "rewards/rejected": -148.99203491210938, "step": 32940 }, { "epoch": 1.91, "grad_norm": 38.54352569580078, "learning_rate": 0.0003653005147258021, "logits/chosen": -14.944897651672363, "logits/rejected": -14.886584281921387, "logps/chosen": -2745.302734375, "logps/rejected": -2467.630859375, "loss": 5.5001, "rewards/accuracies": 0.5, "rewards/chosen": -159.88327026367188, "rewards/margins": 7.620385646820068, "rewards/rejected": -167.50363159179688, "step": 32950 }, { "epoch": 1.91, "grad_norm": 4.568506942601796e-18, "learning_rate": 0.00036510700878516973, "logits/chosen": -12.188558578491211, "logits/rejected": -12.329065322875977, "logps/chosen": -2888.43359375, "logps/rejected": -3091.505859375, "loss": 4.4154, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -95.80271911621094, "rewards/margins": 2.820380687713623, "rewards/rejected": -98.62310028076172, "step": 32960 }, { "epoch": 1.91, "grad_norm": 58.72743225097656, "learning_rate": 0.00036491350284453735, "logits/chosen": -15.741567611694336, "logits/rejected": -15.825108528137207, "logps/chosen": -2221.410888671875, "logps/rejected": -2236.299560546875, "loss": 4.4759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -191.71266174316406, "rewards/margins": -0.3077735900878906, "rewards/rejected": -191.40487670898438, "step": 32970 }, { "epoch": 1.91, "grad_norm": 88.79193878173828, "learning_rate": 0.00036471999690390497, "logits/chosen": -13.30627155303955, "logits/rejected": -12.873634338378906, "logps/chosen": -2980.546630859375, "logps/rejected": -2796.11328125, "loss": 10.1185, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -164.9385986328125, "rewards/margins": -7.1412858963012695, "rewards/rejected": -157.7973175048828, "step": 32980 }, { "epoch": 1.91, "grad_norm": 34.93214416503906, "learning_rate": 0.0003645264909632726, "logits/chosen": -16.83815574645996, "logits/rejected": -16.9248046875, "logps/chosen": -2846.196533203125, "logps/rejected": -2577.08740234375, "loss": 3.4182, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -171.34255981445312, "rewards/margins": 3.7156271934509277, "rewards/rejected": -175.05816650390625, "step": 32990 }, { "epoch": 1.91, "grad_norm": 0.5401834845542908, "learning_rate": 0.0003643329850226402, "logits/chosen": -15.587593078613281, "logits/rejected": -15.974527359008789, "logps/chosen": -2512.895751953125, "logps/rejected": -2356.245361328125, "loss": 10.2547, "rewards/accuracies": 0.5, "rewards/chosen": -213.10202026367188, "rewards/margins": -0.708953857421875, "rewards/rejected": -212.39306640625, "step": 33000 }, { "epoch": 1.91, "grad_norm": 0.014073451980948448, "learning_rate": 0.0003641394790820078, "logits/chosen": -16.105302810668945, "logits/rejected": -16.446203231811523, "logps/chosen": -2601.747802734375, "logps/rejected": -2503.040771484375, "loss": 1.7048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -191.72396850585938, "rewards/margins": 10.396105766296387, "rewards/rejected": -202.1200714111328, "step": 33010 }, { "epoch": 1.91, "grad_norm": 0.0, "learning_rate": 0.00036394597314137544, "logits/chosen": -14.704673767089844, "logits/rejected": -14.456443786621094, "logps/chosen": -2807.78515625, "logps/rejected": -2480.125244140625, "loss": 6.1113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -192.60899353027344, "rewards/margins": 14.01233959197998, "rewards/rejected": -206.621337890625, "step": 33020 }, { "epoch": 1.91, "grad_norm": 103.34972381591797, "learning_rate": 0.00036375246720074305, "logits/chosen": -15.080102920532227, "logits/rejected": -15.672223091125488, "logps/chosen": -2485.03955078125, "logps/rejected": -2783.880126953125, "loss": 7.9901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.19212341308594, "rewards/margins": -3.000225782394409, "rewards/rejected": -181.19190979003906, "step": 33030 }, { "epoch": 1.91, "grad_norm": 7.003080368041992, "learning_rate": 0.00036355896126011067, "logits/chosen": -11.12080192565918, "logits/rejected": -11.134357452392578, "logps/chosen": -3008.49951171875, "logps/rejected": -2985.65234375, "loss": 12.0732, "rewards/accuracies": 0.5, "rewards/chosen": -127.5865478515625, "rewards/margins": -8.895756721496582, "rewards/rejected": -118.6907958984375, "step": 33040 }, { "epoch": 1.91, "grad_norm": 0.0, "learning_rate": 0.0003633654553194783, "logits/chosen": -11.500611305236816, "logits/rejected": -11.323145866394043, "logps/chosen": -3128.317138671875, "logps/rejected": -2776.23046875, "loss": 1.4468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -100.4018325805664, "rewards/margins": 14.798563003540039, "rewards/rejected": -115.20039367675781, "step": 33050 }, { "epoch": 1.91, "grad_norm": 53.054962158203125, "learning_rate": 0.00036317194937884596, "logits/chosen": -11.688454627990723, "logits/rejected": -11.687860488891602, "logps/chosen": -3103.063232421875, "logps/rejected": -2793.48828125, "loss": 4.4544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -114.1260986328125, "rewards/margins": 4.91497278213501, "rewards/rejected": -119.04106140136719, "step": 33060 }, { "epoch": 1.91, "grad_norm": 2.495619624504123e-17, "learning_rate": 0.0003629784434382136, "logits/chosen": -17.109020233154297, "logits/rejected": -17.344188690185547, "logps/chosen": -2583.529296875, "logps/rejected": -2028.5416259765625, "loss": 10.2399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.2823944091797, "rewards/margins": 7.49856424331665, "rewards/rejected": -162.7809600830078, "step": 33070 }, { "epoch": 1.91, "grad_norm": 8.408874418819323e-05, "learning_rate": 0.0003627849374975812, "logits/chosen": -14.4064302444458, "logits/rejected": -14.150016784667969, "logps/chosen": -2724.010498046875, "logps/rejected": -2594.873291015625, "loss": 1.5684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -139.86575317382812, "rewards/margins": 7.780706882476807, "rewards/rejected": -147.64645385742188, "step": 33080 }, { "epoch": 1.92, "grad_norm": 36.859901428222656, "learning_rate": 0.0003625914315569488, "logits/chosen": -14.055482864379883, "logits/rejected": -14.22301197052002, "logps/chosen": -2816.2763671875, "logps/rejected": -2444.83349609375, "loss": 3.3592, "rewards/accuracies": 0.5, "rewards/chosen": -142.27578735351562, "rewards/margins": 2.6614322662353516, "rewards/rejected": -144.9372100830078, "step": 33090 }, { "epoch": 1.92, "grad_norm": 13.007956504821777, "learning_rate": 0.00036239792561631643, "logits/chosen": -15.488713264465332, "logits/rejected": -15.10273265838623, "logps/chosen": -2594.57421875, "logps/rejected": -2743.24951171875, "loss": 8.7394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -209.0780792236328, "rewards/margins": -2.111494779586792, "rewards/rejected": -206.96658325195312, "step": 33100 }, { "epoch": 1.92, "grad_norm": 35.778926849365234, "learning_rate": 0.00036220441967568405, "logits/chosen": -12.588663101196289, "logits/rejected": -12.598260879516602, "logps/chosen": -3011.12255859375, "logps/rejected": -2856.433349609375, "loss": 3.5984, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.12306213378906, "rewards/margins": 0.966788649559021, "rewards/rejected": -124.08985900878906, "step": 33110 }, { "epoch": 1.92, "grad_norm": 384.89947509765625, "learning_rate": 0.00036201091373505166, "logits/chosen": -14.173627853393555, "logits/rejected": -13.933988571166992, "logps/chosen": -2314.2197265625, "logps/rejected": -2643.725341796875, "loss": 3.8256, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -111.20677185058594, "rewards/margins": 4.999583721160889, "rewards/rejected": -116.20634460449219, "step": 33120 }, { "epoch": 1.92, "grad_norm": 65.3374252319336, "learning_rate": 0.0003618174077944193, "logits/chosen": -12.168471336364746, "logits/rejected": -12.251229286193848, "logps/chosen": -3051.771484375, "logps/rejected": -2658.791015625, "loss": 3.7124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -142.275634765625, "rewards/margins": 10.743715286254883, "rewards/rejected": -153.0193634033203, "step": 33130 }, { "epoch": 1.92, "grad_norm": 75.72505950927734, "learning_rate": 0.0003616239018537869, "logits/chosen": -12.898843765258789, "logits/rejected": -12.692785263061523, "logps/chosen": -2928.14404296875, "logps/rejected": -2783.489013671875, "loss": 1.278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.61952209472656, "rewards/margins": 12.797388076782227, "rewards/rejected": -169.41690063476562, "step": 33140 }, { "epoch": 1.92, "grad_norm": 88.75336456298828, "learning_rate": 0.0003614303959131545, "logits/chosen": -15.603513717651367, "logits/rejected": -15.817705154418945, "logps/chosen": -2778.24267578125, "logps/rejected": -2803.17041015625, "loss": 12.5016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -220.4438934326172, "rewards/margins": -10.492773056030273, "rewards/rejected": -209.9511260986328, "step": 33150 }, { "epoch": 1.92, "grad_norm": 145.09580993652344, "learning_rate": 0.0003612368899725222, "logits/chosen": -13.359106063842773, "logits/rejected": -13.355151176452637, "logps/chosen": -3561.43359375, "logps/rejected": -3165.70849609375, "loss": 15.0506, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -212.87646484375, "rewards/margins": -12.153215408325195, "rewards/rejected": -200.72323608398438, "step": 33160 }, { "epoch": 1.92, "grad_norm": 151.65708923339844, "learning_rate": 0.0003610433840318898, "logits/chosen": -12.182385444641113, "logits/rejected": -11.744009017944336, "logps/chosen": -2738.484619140625, "logps/rejected": -2742.130126953125, "loss": 8.3387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.0309295654297, "rewards/margins": 5.761612892150879, "rewards/rejected": -200.7925262451172, "step": 33170 }, { "epoch": 1.92, "grad_norm": 62.28668975830078, "learning_rate": 0.0003608498780912574, "logits/chosen": -13.323046684265137, "logits/rejected": -13.166796684265137, "logps/chosen": -2625.364990234375, "logps/rejected": -2355.30029296875, "loss": 5.1439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.7052459716797, "rewards/margins": -0.687686562538147, "rewards/rejected": -200.01754760742188, "step": 33180 }, { "epoch": 1.92, "grad_norm": 60.67380142211914, "learning_rate": 0.00036065637215062504, "logits/chosen": -11.213029861450195, "logits/rejected": -10.768284797668457, "logps/chosen": -3150.278564453125, "logps/rejected": -3083.13671875, "loss": 6.8021, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.53514099121094, "rewards/margins": -2.3687500953674316, "rewards/rejected": -162.16639709472656, "step": 33190 }, { "epoch": 1.92, "grad_norm": 10.63302993774414, "learning_rate": 0.00036046286620999266, "logits/chosen": -13.27996826171875, "logits/rejected": -13.115735054016113, "logps/chosen": -2976.53466796875, "logps/rejected": -2877.86474609375, "loss": 2.3809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.97125244140625, "rewards/margins": 3.862868547439575, "rewards/rejected": -195.83413696289062, "step": 33200 }, { "epoch": 1.92, "grad_norm": 0.2679760158061981, "learning_rate": 0.0003602693602693603, "logits/chosen": -13.972308158874512, "logits/rejected": -14.062410354614258, "logps/chosen": -3107.74853515625, "logps/rejected": -2944.97216796875, "loss": 1.4689, "rewards/accuracies": 0.5, "rewards/chosen": -129.5245361328125, "rewards/margins": 2.238831043243408, "rewards/rejected": -131.7633819580078, "step": 33210 }, { "epoch": 1.92, "grad_norm": 119.18061828613281, "learning_rate": 0.0003600758543287279, "logits/chosen": -14.107522964477539, "logits/rejected": -14.492838859558105, "logps/chosen": -2912.75537109375, "logps/rejected": -2944.853271484375, "loss": 8.5301, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -139.2840118408203, "rewards/margins": -6.509657382965088, "rewards/rejected": -132.7743682861328, "step": 33220 }, { "epoch": 1.92, "grad_norm": 1.470685010405981e-14, "learning_rate": 0.0003598823483880955, "logits/chosen": -14.35297966003418, "logits/rejected": -14.567814826965332, "logps/chosen": -2845.38330078125, "logps/rejected": -2915.164306640625, "loss": 6.9771, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -85.69853210449219, "rewards/margins": 1.7899894714355469, "rewards/rejected": -87.48851776123047, "step": 33230 }, { "epoch": 1.92, "grad_norm": 71.2671890258789, "learning_rate": 0.0003596888424474631, "logits/chosen": -16.28070831298828, "logits/rejected": -17.164060592651367, "logps/chosen": -2882.541015625, "logps/rejected": -2494.878173828125, "loss": 1.9809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -147.13290405273438, "rewards/margins": 9.980430603027344, "rewards/rejected": -157.1133575439453, "step": 33240 }, { "epoch": 1.92, "grad_norm": 243.31871032714844, "learning_rate": 0.00035949533650683074, "logits/chosen": -12.842615127563477, "logits/rejected": -12.804814338684082, "logps/chosen": -3212.449951171875, "logps/rejected": -2528.91064453125, "loss": 21.7888, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -142.68743896484375, "rewards/margins": -12.429664611816406, "rewards/rejected": -130.25778198242188, "step": 33250 }, { "epoch": 1.93, "grad_norm": 56.258506774902344, "learning_rate": 0.00035930183056619836, "logits/chosen": -14.515437126159668, "logits/rejected": -14.363665580749512, "logps/chosen": -2821.98388671875, "logps/rejected": -2682.80908203125, "loss": 2.8206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.3864288330078, "rewards/margins": 2.6436431407928467, "rewards/rejected": -182.0300750732422, "step": 33260 }, { "epoch": 1.93, "grad_norm": 6.811058028688421e-07, "learning_rate": 0.00035910832462556603, "logits/chosen": -16.730571746826172, "logits/rejected": -17.523773193359375, "logps/chosen": -2542.06640625, "logps/rejected": -2544.600830078125, "loss": 16.3163, "rewards/accuracies": 0.5, "rewards/chosen": -180.7716827392578, "rewards/margins": 8.374267578125, "rewards/rejected": -189.14596557617188, "step": 33270 }, { "epoch": 1.93, "grad_norm": 4.499344254003869e-16, "learning_rate": 0.00035891481868493365, "logits/chosen": -12.793550491333008, "logits/rejected": -12.974847793579102, "logps/chosen": -2987.638916015625, "logps/rejected": -2453.433349609375, "loss": 5.5188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.36116790771484, "rewards/margins": 5.392288684844971, "rewards/rejected": -122.75346374511719, "step": 33280 }, { "epoch": 1.93, "grad_norm": 8.525531768798828, "learning_rate": 0.00035872131274430126, "logits/chosen": -17.250286102294922, "logits/rejected": -17.33346176147461, "logps/chosen": -2784.677978515625, "logps/rejected": -2693.17236328125, "loss": 7.3961, "rewards/accuracies": 0.5, "rewards/chosen": -206.5211181640625, "rewards/margins": -0.6881629824638367, "rewards/rejected": -205.8329620361328, "step": 33290 }, { "epoch": 1.93, "grad_norm": 0.2295273393392563, "learning_rate": 0.0003585278068036689, "logits/chosen": -14.570671081542969, "logits/rejected": -14.70909595489502, "logps/chosen": -2810.24365234375, "logps/rejected": -2817.904296875, "loss": 2.0991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.37173461914062, "rewards/margins": 8.228435516357422, "rewards/rejected": -180.60015869140625, "step": 33300 }, { "epoch": 1.93, "grad_norm": 0.03872811794281006, "learning_rate": 0.0003583343008630365, "logits/chosen": -14.761526107788086, "logits/rejected": -15.49455451965332, "logps/chosen": -2672.612548828125, "logps/rejected": -2657.934326171875, "loss": 2.1188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -160.70901489257812, "rewards/margins": 8.6351900100708, "rewards/rejected": -169.3441925048828, "step": 33310 }, { "epoch": 1.93, "grad_norm": 16.418344497680664, "learning_rate": 0.00035814079492240417, "logits/chosen": -13.561355590820312, "logits/rejected": -14.6376371383667, "logps/chosen": -2955.48486328125, "logps/rejected": -2624.03173828125, "loss": 19.4539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.2710723876953, "rewards/margins": -15.658106803894043, "rewards/rejected": -136.6129608154297, "step": 33320 }, { "epoch": 1.93, "grad_norm": 100.40364074707031, "learning_rate": 0.00035794728898177173, "logits/chosen": -16.18465805053711, "logits/rejected": -15.901286125183105, "logps/chosen": -2869.215576171875, "logps/rejected": -2640.4267578125, "loss": 3.7933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.7211151123047, "rewards/margins": 9.507835388183594, "rewards/rejected": -148.22897338867188, "step": 33330 }, { "epoch": 1.93, "grad_norm": 97.97025299072266, "learning_rate": 0.00035775378304113935, "logits/chosen": -17.546974182128906, "logits/rejected": -17.85531234741211, "logps/chosen": -2936.866455078125, "logps/rejected": -2834.416015625, "loss": 4.5526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.8833770751953, "rewards/margins": 7.904959201812744, "rewards/rejected": -153.788330078125, "step": 33340 }, { "epoch": 1.93, "grad_norm": 128.7353515625, "learning_rate": 0.00035756027710050697, "logits/chosen": -19.13035774230957, "logits/rejected": -19.24237632751465, "logps/chosen": -2722.67529296875, "logps/rejected": -2694.69140625, "loss": 4.9762, "rewards/accuracies": 0.5, "rewards/chosen": -211.8491973876953, "rewards/margins": 1.4871076345443726, "rewards/rejected": -213.3363037109375, "step": 33350 }, { "epoch": 1.93, "grad_norm": 154.04872131347656, "learning_rate": 0.0003573667711598746, "logits/chosen": -17.65776252746582, "logits/rejected": -17.939836502075195, "logps/chosen": -2897.70361328125, "logps/rejected": -2972.928466796875, "loss": 4.3508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -223.17800903320312, "rewards/margins": 4.701677322387695, "rewards/rejected": -227.87966918945312, "step": 33360 }, { "epoch": 1.93, "grad_norm": 56.03358840942383, "learning_rate": 0.00035717326521924226, "logits/chosen": -15.206682205200195, "logits/rejected": -15.701436042785645, "logps/chosen": -2988.107666015625, "logps/rejected": -2955.928955078125, "loss": 13.3718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -196.87930297851562, "rewards/margins": 3.9461452960968018, "rewards/rejected": -200.82545471191406, "step": 33370 }, { "epoch": 1.93, "grad_norm": 1.1019294261932373, "learning_rate": 0.0003569797592786099, "logits/chosen": -14.125917434692383, "logits/rejected": -14.37205982208252, "logps/chosen": -2899.91259765625, "logps/rejected": -2312.39990234375, "loss": 2.5915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.7030029296875, "rewards/margins": 27.60881996154785, "rewards/rejected": -171.31179809570312, "step": 33380 }, { "epoch": 1.93, "grad_norm": 0.4393358826637268, "learning_rate": 0.0003567862533379775, "logits/chosen": -18.392337799072266, "logits/rejected": -18.30194091796875, "logps/chosen": -2423.05908203125, "logps/rejected": -2591.005126953125, "loss": 5.6187, "rewards/accuracies": 0.5, "rewards/chosen": -219.72171020507812, "rewards/margins": 11.970357894897461, "rewards/rejected": -231.6920623779297, "step": 33390 }, { "epoch": 1.93, "grad_norm": 81.65206146240234, "learning_rate": 0.0003565927473973451, "logits/chosen": -16.156919479370117, "logits/rejected": -16.55573272705078, "logps/chosen": -2276.30029296875, "logps/rejected": -2366.68359375, "loss": 6.6084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -195.21800231933594, "rewards/margins": 10.01282024383545, "rewards/rejected": -205.2308349609375, "step": 33400 }, { "epoch": 1.93, "grad_norm": 0.0024060248397290707, "learning_rate": 0.0003563992414567127, "logits/chosen": -13.112617492675781, "logits/rejected": -12.668203353881836, "logps/chosen": -2987.80517578125, "logps/rejected": -3020.880126953125, "loss": 4.5062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -184.90283203125, "rewards/margins": 8.304215431213379, "rewards/rejected": -193.20704650878906, "step": 33410 }, { "epoch": 1.93, "grad_norm": 1.4859169709623946e-15, "learning_rate": 0.0003562057355160804, "logits/chosen": -11.671462059020996, "logits/rejected": -12.200151443481445, "logps/chosen": -2846.8193359375, "logps/rejected": -2973.503662109375, "loss": 7.4045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.3582534790039, "rewards/margins": 1.0633894205093384, "rewards/rejected": -120.421630859375, "step": 33420 }, { "epoch": 1.94, "grad_norm": 0.009698858484625816, "learning_rate": 0.000356012229575448, "logits/chosen": -12.36717414855957, "logits/rejected": -12.648462295532227, "logps/chosen": -3033.935546875, "logps/rejected": -2792.291015625, "loss": 0.8351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -101.93256378173828, "rewards/margins": 29.63812828063965, "rewards/rejected": -131.5706787109375, "step": 33430 }, { "epoch": 1.94, "grad_norm": 0.45328518748283386, "learning_rate": 0.0003558187236348156, "logits/chosen": -12.532801628112793, "logits/rejected": -12.32994270324707, "logps/chosen": -2874.148193359375, "logps/rejected": -2829.010498046875, "loss": 0.984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -170.11233520507812, "rewards/margins": 10.296233177185059, "rewards/rejected": -180.40858459472656, "step": 33440 }, { "epoch": 1.94, "grad_norm": 4.606774597210039e-12, "learning_rate": 0.0003556252176941832, "logits/chosen": -12.482523918151855, "logits/rejected": -12.545430183410645, "logps/chosen": -2538.32666015625, "logps/rejected": -2233.43701171875, "loss": 8.0884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.30230712890625, "rewards/margins": -0.5345954895019531, "rewards/rejected": -148.76771545410156, "step": 33450 }, { "epoch": 1.94, "grad_norm": 3.170989295794846e-12, "learning_rate": 0.0003554317117535508, "logits/chosen": -13.44017219543457, "logits/rejected": -13.621905326843262, "logps/chosen": -2869.00732421875, "logps/rejected": -2640.26513671875, "loss": 1.7323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.03759765625, "rewards/margins": 12.397208213806152, "rewards/rejected": -183.434814453125, "step": 33460 }, { "epoch": 1.94, "grad_norm": 49.78055953979492, "learning_rate": 0.0003552382058129185, "logits/chosen": -16.69210433959961, "logits/rejected": -16.65290069580078, "logps/chosen": -2416.85205078125, "logps/rejected": -2426.20263671875, "loss": 3.4605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.4149627685547, "rewards/margins": 9.964750289916992, "rewards/rejected": -217.3797149658203, "step": 33470 }, { "epoch": 1.94, "grad_norm": 3.4107718871467796e-09, "learning_rate": 0.0003550446998722861, "logits/chosen": -13.807965278625488, "logits/rejected": -14.030293464660645, "logps/chosen": -2857.104248046875, "logps/rejected": -2654.208251953125, "loss": 12.8758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -136.31576538085938, "rewards/margins": -0.48656922578811646, "rewards/rejected": -135.82919311523438, "step": 33480 }, { "epoch": 1.94, "grad_norm": 131.14627075195312, "learning_rate": 0.0003548511939316537, "logits/chosen": -15.484764099121094, "logits/rejected": -16.126056671142578, "logps/chosen": -2488.9990234375, "logps/rejected": -2135.16357421875, "loss": 31.3621, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -215.8691864013672, "rewards/margins": -24.19882583618164, "rewards/rejected": -191.67037963867188, "step": 33490 }, { "epoch": 1.94, "grad_norm": 25.677125930786133, "learning_rate": 0.00035465768799102133, "logits/chosen": -14.464807510375977, "logits/rejected": -14.271062850952148, "logps/chosen": -2682.968994140625, "logps/rejected": -2530.58935546875, "loss": 7.5814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.8043670654297, "rewards/margins": 11.670499801635742, "rewards/rejected": -200.47486877441406, "step": 33500 }, { "epoch": 1.94, "grad_norm": 0.2818399667739868, "learning_rate": 0.00035446418205038895, "logits/chosen": -17.372745513916016, "logits/rejected": -17.583444595336914, "logps/chosen": -2460.45458984375, "logps/rejected": -2195.46484375, "loss": 5.8536, "rewards/accuracies": 0.5, "rewards/chosen": -185.73141479492188, "rewards/margins": 6.280082702636719, "rewards/rejected": -192.01150512695312, "step": 33510 }, { "epoch": 1.94, "grad_norm": 0.014284282922744751, "learning_rate": 0.00035427067610975657, "logits/chosen": -12.562090873718262, "logits/rejected": -12.557473182678223, "logps/chosen": -2901.3994140625, "logps/rejected": -2624.734619140625, "loss": 4.7917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.95993041992188, "rewards/margins": 2.9479877948760986, "rewards/rejected": -144.90792846679688, "step": 33520 }, { "epoch": 1.94, "grad_norm": 72.9389419555664, "learning_rate": 0.00035407717016912424, "logits/chosen": -16.87823486328125, "logits/rejected": -17.275815963745117, "logps/chosen": -2778.994873046875, "logps/rejected": -2748.285400390625, "loss": 4.9843, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -184.35690307617188, "rewards/margins": -1.8792235851287842, "rewards/rejected": -182.47767639160156, "step": 33530 }, { "epoch": 1.94, "grad_norm": 145.57830810546875, "learning_rate": 0.00035388366422849186, "logits/chosen": -15.406231880187988, "logits/rejected": -15.478471755981445, "logps/chosen": -3088.3349609375, "logps/rejected": -2957.29345703125, "loss": 2.8766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -236.02792358398438, "rewards/margins": 2.3565216064453125, "rewards/rejected": -238.3844451904297, "step": 33540 }, { "epoch": 1.94, "grad_norm": 70.72599029541016, "learning_rate": 0.0003536901582878594, "logits/chosen": -17.508487701416016, "logits/rejected": -18.514102935791016, "logps/chosen": -2750.39501953125, "logps/rejected": -2475.753173828125, "loss": 22.7144, "rewards/accuracies": 0.5, "rewards/chosen": -249.9422149658203, "rewards/margins": -17.799999237060547, "rewards/rejected": -232.14224243164062, "step": 33550 }, { "epoch": 1.94, "grad_norm": 85.42112731933594, "learning_rate": 0.00035349665234722704, "logits/chosen": -12.998922348022461, "logits/rejected": -12.837573051452637, "logps/chosen": -3050.65869140625, "logps/rejected": -2601.0595703125, "loss": 5.1733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.20701599121094, "rewards/margins": 4.472933769226074, "rewards/rejected": -156.67996215820312, "step": 33560 }, { "epoch": 1.94, "grad_norm": 1.953804007470694e-09, "learning_rate": 0.00035330314640659465, "logits/chosen": -14.251063346862793, "logits/rejected": -14.379838943481445, "logps/chosen": -2253.380859375, "logps/rejected": -2621.755126953125, "loss": 0.6006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.6916046142578, "rewards/margins": 5.779781341552734, "rewards/rejected": -171.47140502929688, "step": 33570 }, { "epoch": 1.94, "grad_norm": 0.5567411780357361, "learning_rate": 0.0003531096404659623, "logits/chosen": -12.463205337524414, "logits/rejected": -12.700712203979492, "logps/chosen": -3083.62060546875, "logps/rejected": -2962.57666015625, "loss": 10.7502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.3700714111328, "rewards/margins": -3.897871494293213, "rewards/rejected": -156.47219848632812, "step": 33580 }, { "epoch": 1.94, "grad_norm": 1.3756299018859863, "learning_rate": 0.00035291613452532994, "logits/chosen": -13.737322807312012, "logits/rejected": -13.511373519897461, "logps/chosen": -2640.998046875, "logps/rejected": -2349.405029296875, "loss": 1.038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -153.2560577392578, "rewards/margins": 13.153244018554688, "rewards/rejected": -166.40931701660156, "step": 33590 }, { "epoch": 1.94, "grad_norm": 2.790347064518528e-09, "learning_rate": 0.00035272262858469756, "logits/chosen": -12.720815658569336, "logits/rejected": -12.500101089477539, "logps/chosen": -2441.20947265625, "logps/rejected": -2421.055908203125, "loss": 4.271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.13753509521484, "rewards/margins": 1.8292335271835327, "rewards/rejected": -120.96678161621094, "step": 33600 }, { "epoch": 1.95, "grad_norm": 77.56456756591797, "learning_rate": 0.0003525291226440652, "logits/chosen": -14.728752136230469, "logits/rejected": -14.791082382202148, "logps/chosen": -2577.832275390625, "logps/rejected": -2514.736083984375, "loss": 4.5147, "rewards/accuracies": 0.5, "rewards/chosen": -148.947998046875, "rewards/margins": 3.472979784011841, "rewards/rejected": -152.4209747314453, "step": 33610 }, { "epoch": 1.95, "grad_norm": 4.5308419238665465e-09, "learning_rate": 0.0003523356167034328, "logits/chosen": -15.401372909545898, "logits/rejected": -15.386327743530273, "logps/chosen": -2747.33837890625, "logps/rejected": -2286.80224609375, "loss": 28.8101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.07925415039062, "rewards/margins": -11.4866943359375, "rewards/rejected": -123.5925521850586, "step": 33620 }, { "epoch": 1.95, "grad_norm": 14.786615371704102, "learning_rate": 0.00035214211076280047, "logits/chosen": -15.82696533203125, "logits/rejected": -16.07387924194336, "logps/chosen": -2533.94482421875, "logps/rejected": -2236.6669921875, "loss": 1.7713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -146.34365844726562, "rewards/margins": 10.41151237487793, "rewards/rejected": -156.7551727294922, "step": 33630 }, { "epoch": 1.95, "grad_norm": 1.2501475549209218e-12, "learning_rate": 0.0003519486048221681, "logits/chosen": -13.02641487121582, "logits/rejected": -13.025125503540039, "logps/chosen": -2869.121826171875, "logps/rejected": -3045.154541015625, "loss": 1.4459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -114.14102935791016, "rewards/margins": 10.696154594421387, "rewards/rejected": -124.8371810913086, "step": 33640 }, { "epoch": 1.95, "grad_norm": 40.74016571044922, "learning_rate": 0.0003517550988815357, "logits/chosen": -15.330822944641113, "logits/rejected": -15.503190994262695, "logps/chosen": -2556.678955078125, "logps/rejected": -2798.528564453125, "loss": 3.2557, "rewards/accuracies": 0.5, "rewards/chosen": -146.45506286621094, "rewards/margins": 3.9989120960235596, "rewards/rejected": -150.45394897460938, "step": 33650 }, { "epoch": 1.95, "grad_norm": 2.0679363998965528e-08, "learning_rate": 0.00035156159294090326, "logits/chosen": -15.329790115356445, "logits/rejected": -15.656356811523438, "logps/chosen": -2970.070556640625, "logps/rejected": -3026.32861328125, "loss": 2.2167, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -153.95932006835938, "rewards/margins": 9.764322280883789, "rewards/rejected": -163.7236328125, "step": 33660 }, { "epoch": 1.95, "grad_norm": 0.044721171259880066, "learning_rate": 0.0003513680870002709, "logits/chosen": -14.590746879577637, "logits/rejected": -14.620097160339355, "logps/chosen": -2861.555419921875, "logps/rejected": -2668.737060546875, "loss": 13.522, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.11412048339844, "rewards/margins": -7.425594329833984, "rewards/rejected": -170.6885223388672, "step": 33670 }, { "epoch": 1.95, "grad_norm": 39.368228912353516, "learning_rate": 0.00035117458105963855, "logits/chosen": -15.25239086151123, "logits/rejected": -15.19091796875, "logps/chosen": -2952.32666015625, "logps/rejected": -2814.923583984375, "loss": 0.2317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -179.1584930419922, "rewards/margins": 11.439730644226074, "rewards/rejected": -190.59823608398438, "step": 33680 }, { "epoch": 1.95, "grad_norm": 157.46282958984375, "learning_rate": 0.00035098107511900617, "logits/chosen": -13.86767292022705, "logits/rejected": -13.815325736999512, "logps/chosen": -2969.97412109375, "logps/rejected": -2672.342041015625, "loss": 4.8688, "rewards/accuracies": 0.5, "rewards/chosen": -150.47740173339844, "rewards/margins": 0.5574789047241211, "rewards/rejected": -151.03488159179688, "step": 33690 }, { "epoch": 1.95, "grad_norm": 0.09056732803583145, "learning_rate": 0.0003507875691783738, "logits/chosen": -14.210569381713867, "logits/rejected": -14.21086597442627, "logps/chosen": -2949.235595703125, "logps/rejected": -2896.364501953125, "loss": 3.3525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.9655303955078, "rewards/margins": 4.228161811828613, "rewards/rejected": -190.19369506835938, "step": 33700 }, { "epoch": 1.95, "grad_norm": 0.0, "learning_rate": 0.0003505940632377414, "logits/chosen": -16.512468338012695, "logits/rejected": -16.642385482788086, "logps/chosen": -2787.235595703125, "logps/rejected": -2504.083984375, "loss": 1.3039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.0501708984375, "rewards/margins": 15.943923950195312, "rewards/rejected": -182.99407958984375, "step": 33710 }, { "epoch": 1.95, "grad_norm": 86.32247924804688, "learning_rate": 0.000350400557297109, "logits/chosen": -18.047048568725586, "logits/rejected": -18.19799041748047, "logps/chosen": -2687.75048828125, "logps/rejected": -2459.77392578125, "loss": 25.5914, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -230.70596313476562, "rewards/margins": -10.779626846313477, "rewards/rejected": -219.9263458251953, "step": 33720 }, { "epoch": 1.95, "grad_norm": 2.844974994659424, "learning_rate": 0.00035020705135647664, "logits/chosen": -17.595388412475586, "logits/rejected": -17.890424728393555, "logps/chosen": -2785.32666015625, "logps/rejected": -2619.37255859375, "loss": 10.0708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -210.65194702148438, "rewards/margins": -7.190930366516113, "rewards/rejected": -203.4610137939453, "step": 33730 }, { "epoch": 1.95, "grad_norm": 0.012425940483808517, "learning_rate": 0.0003500135454158443, "logits/chosen": -16.768810272216797, "logits/rejected": -16.791584014892578, "logps/chosen": -2769.338134765625, "logps/rejected": -2824.6484375, "loss": 3.6931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.89659118652344, "rewards/margins": 5.2492241859436035, "rewards/rejected": -157.14581298828125, "step": 33740 }, { "epoch": 1.95, "grad_norm": 1.8256314433529042e-05, "learning_rate": 0.0003498200394752119, "logits/chosen": -17.862289428710938, "logits/rejected": -18.352888107299805, "logps/chosen": -2801.0703125, "logps/rejected": -2842.703857421875, "loss": 4.458, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -216.06137084960938, "rewards/margins": 2.471467971801758, "rewards/rejected": -218.5328369140625, "step": 33750 }, { "epoch": 1.95, "grad_norm": 0.046634458005428314, "learning_rate": 0.00034962653353457954, "logits/chosen": -13.534564018249512, "logits/rejected": -13.454996109008789, "logps/chosen": -2925.025146484375, "logps/rejected": -2470.84521484375, "loss": 9.0227, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.81996154785156, "rewards/margins": -2.7358345985412598, "rewards/rejected": -167.08412170410156, "step": 33760 }, { "epoch": 1.95, "grad_norm": 102.04067993164062, "learning_rate": 0.0003494330275939471, "logits/chosen": -17.449283599853516, "logits/rejected": -17.828739166259766, "logps/chosen": -2277.385498046875, "logps/rejected": -1971.935302734375, "loss": 42.7837, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -204.06617736816406, "rewards/margins": -24.949777603149414, "rewards/rejected": -179.11636352539062, "step": 33770 }, { "epoch": 1.96, "grad_norm": 1.9221620559692383, "learning_rate": 0.0003492395216533147, "logits/chosen": -14.755688667297363, "logits/rejected": -14.723240852355957, "logps/chosen": -2639.697998046875, "logps/rejected": -2665.60107421875, "loss": 1.0722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.19692993164062, "rewards/margins": 5.905581474304199, "rewards/rejected": -196.10250854492188, "step": 33780 }, { "epoch": 1.96, "grad_norm": 2.358038460673062e-12, "learning_rate": 0.0003490460157126824, "logits/chosen": -15.784541130065918, "logits/rejected": -16.411380767822266, "logps/chosen": -2639.02099609375, "logps/rejected": -2727.89892578125, "loss": 0.0982, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -157.77175903320312, "rewards/margins": 25.908123016357422, "rewards/rejected": -183.67990112304688, "step": 33790 }, { "epoch": 1.96, "grad_norm": 0.1324913650751114, "learning_rate": 0.00034885250977205, "logits/chosen": -15.654687881469727, "logits/rejected": -15.5001220703125, "logps/chosen": -2637.08984375, "logps/rejected": -2334.698486328125, "loss": 4.3825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.44166564941406, "rewards/margins": 4.6443352699279785, "rewards/rejected": -194.0859832763672, "step": 33800 }, { "epoch": 1.96, "grad_norm": 38.00853729248047, "learning_rate": 0.00034865900383141763, "logits/chosen": -12.044549942016602, "logits/rejected": -12.105559349060059, "logps/chosen": -2967.12841796875, "logps/rejected": -3011.84326171875, "loss": 7.5501, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -155.69879150390625, "rewards/margins": -2.9670016765594482, "rewards/rejected": -152.73179626464844, "step": 33810 }, { "epoch": 1.96, "grad_norm": 34.083892822265625, "learning_rate": 0.00034846549789078525, "logits/chosen": -14.173617362976074, "logits/rejected": -14.204683303833008, "logps/chosen": -3021.623779296875, "logps/rejected": -2851.011962890625, "loss": 2.5994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -171.93173217773438, "rewards/margins": 3.361560821533203, "rewards/rejected": -175.29330444335938, "step": 33820 }, { "epoch": 1.96, "grad_norm": 81.89554595947266, "learning_rate": 0.00034827199195015286, "logits/chosen": -12.941505432128906, "logits/rejected": -12.918403625488281, "logps/chosen": -2604.955322265625, "logps/rejected": -2654.97802734375, "loss": 4.7522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -107.56138610839844, "rewards/margins": 10.091190338134766, "rewards/rejected": -117.652587890625, "step": 33830 }, { "epoch": 1.96, "grad_norm": 97.32707214355469, "learning_rate": 0.00034807848600952053, "logits/chosen": -12.905158996582031, "logits/rejected": -12.955729484558105, "logps/chosen": -2738.743408203125, "logps/rejected": -2795.930419921875, "loss": 5.3999, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -137.5668487548828, "rewards/margins": -1.3652527332305908, "rewards/rejected": -136.2015838623047, "step": 33840 }, { "epoch": 1.96, "grad_norm": 304.7499084472656, "learning_rate": 0.00034788498006888815, "logits/chosen": -12.728426933288574, "logits/rejected": -12.772135734558105, "logps/chosen": -2950.695068359375, "logps/rejected": -2667.66650390625, "loss": 11.4079, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -139.4513702392578, "rewards/margins": -8.87728214263916, "rewards/rejected": -130.5740966796875, "step": 33850 }, { "epoch": 1.96, "grad_norm": 0.037075649946928024, "learning_rate": 0.00034769147412825577, "logits/chosen": -12.882284164428711, "logits/rejected": -12.958636283874512, "logps/chosen": -3054.593017578125, "logps/rejected": -2870.864501953125, "loss": 4.9923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -139.48806762695312, "rewards/margins": 2.297149896621704, "rewards/rejected": -141.7852020263672, "step": 33860 }, { "epoch": 1.96, "grad_norm": 71.20753479003906, "learning_rate": 0.00034749796818762333, "logits/chosen": -12.891998291015625, "logits/rejected": -12.76057243347168, "logps/chosen": -2696.4287109375, "logps/rejected": -2737.522216796875, "loss": 7.8025, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.44564819335938, "rewards/margins": -3.3051650524139404, "rewards/rejected": -154.14048767089844, "step": 33870 }, { "epoch": 1.96, "grad_norm": 1.7482887360031896e-09, "learning_rate": 0.00034730446224699095, "logits/chosen": -14.072837829589844, "logits/rejected": -13.832903861999512, "logps/chosen": -2658.72314453125, "logps/rejected": -2753.59765625, "loss": 6.2825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.29034423828125, "rewards/margins": 3.1268439292907715, "rewards/rejected": -158.41717529296875, "step": 33880 }, { "epoch": 1.96, "grad_norm": 85.03876495361328, "learning_rate": 0.0003471109563063586, "logits/chosen": -14.562045097351074, "logits/rejected": -14.35303020477295, "logps/chosen": -2968.82421875, "logps/rejected": -2759.84375, "loss": 3.5879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.58853149414062, "rewards/margins": 2.97354793548584, "rewards/rejected": -152.5620880126953, "step": 33890 }, { "epoch": 1.96, "grad_norm": 40.540889739990234, "learning_rate": 0.00034691745036572624, "logits/chosen": -15.403387069702148, "logits/rejected": -15.39483642578125, "logps/chosen": -2562.0595703125, "logps/rejected": -2271.831787109375, "loss": 3.6701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -158.12042236328125, "rewards/margins": 7.571218013763428, "rewards/rejected": -165.69163513183594, "step": 33900 }, { "epoch": 1.96, "grad_norm": 8.859293387786238e-08, "learning_rate": 0.00034672394442509386, "logits/chosen": -14.209566116333008, "logits/rejected": -13.854217529296875, "logps/chosen": -2971.012939453125, "logps/rejected": -2833.150390625, "loss": 1.5519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -194.9180908203125, "rewards/margins": 19.070926666259766, "rewards/rejected": -213.98904418945312, "step": 33910 }, { "epoch": 1.96, "grad_norm": 9.465060234069824, "learning_rate": 0.00034653043848446147, "logits/chosen": -16.062236785888672, "logits/rejected": -16.041345596313477, "logps/chosen": -2923.88037109375, "logps/rejected": -2876.36083984375, "loss": 7.9785, "rewards/accuracies": 0.5, "rewards/chosen": -177.4940643310547, "rewards/margins": 2.9492812156677246, "rewards/rejected": -180.44334411621094, "step": 33920 }, { "epoch": 1.96, "grad_norm": 55.72945785522461, "learning_rate": 0.0003463369325438291, "logits/chosen": -14.281949996948242, "logits/rejected": -14.063230514526367, "logps/chosen": -2926.2763671875, "logps/rejected": -2512.868408203125, "loss": 2.3234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.3375244140625, "rewards/margins": 5.328685760498047, "rewards/rejected": -155.66622924804688, "step": 33930 }, { "epoch": 1.96, "grad_norm": 1.965019919225597e-06, "learning_rate": 0.0003461434266031967, "logits/chosen": -12.93365478515625, "logits/rejected": -12.759504318237305, "logps/chosen": -2918.4140625, "logps/rejected": -2747.585205078125, "loss": 1.8983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.32344818115234, "rewards/margins": 10.953598022460938, "rewards/rejected": -126.27705383300781, "step": 33940 }, { "epoch": 1.97, "grad_norm": 67.26925659179688, "learning_rate": 0.0003459499206625644, "logits/chosen": -17.79122543334961, "logits/rejected": -18.068138122558594, "logps/chosen": -2728.21142578125, "logps/rejected": -2351.572265625, "loss": 18.9263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -203.5167694091797, "rewards/margins": -11.779211044311523, "rewards/rejected": -191.73757934570312, "step": 33950 }, { "epoch": 1.97, "grad_norm": 7.509632996516302e-05, "learning_rate": 0.000345756414721932, "logits/chosen": -14.666572570800781, "logits/rejected": -14.557330131530762, "logps/chosen": -2746.51611328125, "logps/rejected": -2878.191162109375, "loss": 4.1671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.73477172851562, "rewards/margins": 14.943977355957031, "rewards/rejected": -188.67877197265625, "step": 33960 }, { "epoch": 1.97, "grad_norm": 196.9208984375, "learning_rate": 0.0003455629087812996, "logits/chosen": -12.802289009094238, "logits/rejected": -12.53045654296875, "logps/chosen": -2866.29736328125, "logps/rejected": -2656.271240234375, "loss": 19.6084, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -114.36873626708984, "rewards/margins": -17.016193389892578, "rewards/rejected": -97.35255432128906, "step": 33970 }, { "epoch": 1.97, "grad_norm": 91.59890747070312, "learning_rate": 0.0003453694028406672, "logits/chosen": -13.86462116241455, "logits/rejected": -13.96154499053955, "logps/chosen": -2604.853515625, "logps/rejected": -2558.46337890625, "loss": 8.0217, "rewards/accuracies": 0.5, "rewards/chosen": -170.71212768554688, "rewards/margins": 4.497642517089844, "rewards/rejected": -175.2097625732422, "step": 33980 }, { "epoch": 1.97, "grad_norm": 0.27165690064430237, "learning_rate": 0.0003451758969000348, "logits/chosen": -11.292790412902832, "logits/rejected": -11.125560760498047, "logps/chosen": -3128.17822265625, "logps/rejected": -2778.512939453125, "loss": 11.4754, "rewards/accuracies": 0.5, "rewards/chosen": -173.5369110107422, "rewards/margins": -7.6969313621521, "rewards/rejected": -165.83998107910156, "step": 33990 }, { "epoch": 1.97, "grad_norm": 63.142242431640625, "learning_rate": 0.00034498239095940246, "logits/chosen": -11.66909122467041, "logits/rejected": -11.568243026733398, "logps/chosen": -2913.569580078125, "logps/rejected": -3060.762939453125, "loss": 7.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.97532653808594, "rewards/margins": -2.7417564392089844, "rewards/rejected": -115.23359680175781, "step": 34000 }, { "epoch": 1.97, "grad_norm": 104.1058120727539, "learning_rate": 0.0003447888850187701, "logits/chosen": -14.507410049438477, "logits/rejected": -14.657049179077148, "logps/chosen": -2873.103515625, "logps/rejected": -2908.36865234375, "loss": 9.922, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -161.31082153320312, "rewards/margins": -8.73222541809082, "rewards/rejected": -152.57858276367188, "step": 34010 }, { "epoch": 1.97, "grad_norm": 0.0005685833166353405, "learning_rate": 0.0003445953790781377, "logits/chosen": -13.553167343139648, "logits/rejected": -13.641698837280273, "logps/chosen": -2842.28466796875, "logps/rejected": -2871.41064453125, "loss": 6.0597, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -188.343505859375, "rewards/margins": -2.280627727508545, "rewards/rejected": -186.0628662109375, "step": 34020 }, { "epoch": 1.97, "grad_norm": 2.9113218147358566e-07, "learning_rate": 0.0003444018731375053, "logits/chosen": -16.395000457763672, "logits/rejected": -15.851089477539062, "logps/chosen": -2861.155517578125, "logps/rejected": -2347.086181640625, "loss": 26.05, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -188.99270629882812, "rewards/margins": -14.441981315612793, "rewards/rejected": -174.5507354736328, "step": 34030 }, { "epoch": 1.97, "grad_norm": 53.11606216430664, "learning_rate": 0.00034420836719687293, "logits/chosen": -16.897274017333984, "logits/rejected": -17.66657066345215, "logps/chosen": -2876.047119140625, "logps/rejected": -2932.424072265625, "loss": 4.6117, "rewards/accuracies": 0.5, "rewards/chosen": -182.83761596679688, "rewards/margins": 9.53795337677002, "rewards/rejected": -192.37554931640625, "step": 34040 }, { "epoch": 1.97, "grad_norm": 47.42193603515625, "learning_rate": 0.0003440148612562406, "logits/chosen": -14.133196830749512, "logits/rejected": -13.74993896484375, "logps/chosen": -2785.93017578125, "logps/rejected": -2581.897216796875, "loss": 2.7363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.52999877929688, "rewards/margins": 3.7066383361816406, "rewards/rejected": -173.23663330078125, "step": 34050 }, { "epoch": 1.97, "grad_norm": 4.375195503234863, "learning_rate": 0.0003438213553156082, "logits/chosen": -16.402324676513672, "logits/rejected": -16.3658390045166, "logps/chosen": -2698.15771484375, "logps/rejected": -2533.306640625, "loss": 2.162, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.71585083007812, "rewards/margins": 5.898003578186035, "rewards/rejected": -204.6138458251953, "step": 34060 }, { "epoch": 1.97, "grad_norm": 74.5574722290039, "learning_rate": 0.00034362784937497584, "logits/chosen": -12.820764541625977, "logits/rejected": -12.680283546447754, "logps/chosen": -3033.27734375, "logps/rejected": -2847.973388671875, "loss": 3.25, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.45762634277344, "rewards/margins": 3.807246446609497, "rewards/rejected": -159.26487731933594, "step": 34070 }, { "epoch": 1.97, "grad_norm": 0.0056379917077720165, "learning_rate": 0.00034343434343434346, "logits/chosen": -13.970855712890625, "logits/rejected": -14.045166015625, "logps/chosen": -3095.786865234375, "logps/rejected": -2705.52392578125, "loss": 6.7707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.6357421875, "rewards/margins": 1.8165512084960938, "rewards/rejected": -167.45230102539062, "step": 34080 }, { "epoch": 1.97, "grad_norm": 3.5608824645438214e-14, "learning_rate": 0.000343240837493711, "logits/chosen": -12.129995346069336, "logits/rejected": -11.918807983398438, "logps/chosen": -2960.751953125, "logps/rejected": -2759.902587890625, "loss": 6.5706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.94367980957031, "rewards/margins": 20.81163787841797, "rewards/rejected": -131.7553253173828, "step": 34090 }, { "epoch": 1.97, "grad_norm": 0.4644448161125183, "learning_rate": 0.0003430473315530787, "logits/chosen": -12.57774543762207, "logits/rejected": -12.616697311401367, "logps/chosen": -2633.750244140625, "logps/rejected": -2516.89111328125, "loss": 18.3367, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -166.2967529296875, "rewards/margins": -12.744183540344238, "rewards/rejected": -153.55258178710938, "step": 34100 }, { "epoch": 1.97, "grad_norm": 3.2711710929870605, "learning_rate": 0.0003428538256124463, "logits/chosen": -12.01909065246582, "logits/rejected": -11.820611000061035, "logps/chosen": -2609.279296875, "logps/rejected": -2586.8828125, "loss": 5.8807, "rewards/accuracies": 0.5, "rewards/chosen": -137.13937377929688, "rewards/margins": -1.1159393787384033, "rewards/rejected": -136.02342224121094, "step": 34110 }, { "epoch": 1.97, "grad_norm": 0.0002311370481038466, "learning_rate": 0.0003426603196718139, "logits/chosen": -12.07996940612793, "logits/rejected": -11.783236503601074, "logps/chosen": -2843.77685546875, "logps/rejected": -2583.425537109375, "loss": 18.3214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -172.52955627441406, "rewards/margins": -2.467698574066162, "rewards/rejected": -170.06185913085938, "step": 34120 }, { "epoch": 1.98, "grad_norm": 3.2415449879863445e-08, "learning_rate": 0.00034246681373118154, "logits/chosen": -15.01526927947998, "logits/rejected": -14.229240417480469, "logps/chosen": -2622.067626953125, "logps/rejected": -2380.206787109375, "loss": 22.5301, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -196.3181915283203, "rewards/margins": -15.830169677734375, "rewards/rejected": -180.48800659179688, "step": 34130 }, { "epoch": 1.98, "grad_norm": 1.4878931153816666e-07, "learning_rate": 0.00034227330779054916, "logits/chosen": -12.01899528503418, "logits/rejected": -12.155637741088867, "logps/chosen": -3236.399169921875, "logps/rejected": -2612.05810546875, "loss": 19.6844, "rewards/accuracies": 0.5, "rewards/chosen": -125.63529205322266, "rewards/margins": -12.446771621704102, "rewards/rejected": -113.18852233886719, "step": 34140 }, { "epoch": 1.98, "grad_norm": 2.6230377443425823e-06, "learning_rate": 0.0003420798018499168, "logits/chosen": -15.202595710754395, "logits/rejected": -15.567831039428711, "logps/chosen": -2816.4140625, "logps/rejected": -2657.037841796875, "loss": 14.8513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -139.98056030273438, "rewards/margins": -3.5417563915252686, "rewards/rejected": -136.4387969970703, "step": 34150 }, { "epoch": 1.98, "grad_norm": 2.6962804794311523, "learning_rate": 0.00034188629590928445, "logits/chosen": -14.44422435760498, "logits/rejected": -14.507100105285645, "logps/chosen": -2585.503662109375, "logps/rejected": -2552.564208984375, "loss": 0.3056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -105.6598892211914, "rewards/margins": 6.010392189025879, "rewards/rejected": -111.6702651977539, "step": 34160 }, { "epoch": 1.98, "grad_norm": 0.0026631327345967293, "learning_rate": 0.00034169278996865206, "logits/chosen": -15.339276313781738, "logits/rejected": -15.243972778320312, "logps/chosen": -2860.88916015625, "logps/rejected": -2629.184814453125, "loss": 9.2706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.9718017578125, "rewards/margins": 1.5750900506973267, "rewards/rejected": -186.54689025878906, "step": 34170 }, { "epoch": 1.98, "grad_norm": 24.2646427154541, "learning_rate": 0.0003414992840280197, "logits/chosen": -13.344517707824707, "logits/rejected": -13.080012321472168, "logps/chosen": -2745.64306640625, "logps/rejected": -2043.5191650390625, "loss": 2.3349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -87.14079284667969, "rewards/margins": 18.87725830078125, "rewards/rejected": -106.01805114746094, "step": 34180 }, { "epoch": 1.98, "grad_norm": 33.29774475097656, "learning_rate": 0.0003413057780873873, "logits/chosen": -14.532186508178711, "logits/rejected": -14.14531421661377, "logps/chosen": -2746.2470703125, "logps/rejected": -2559.993896484375, "loss": 3.8562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -129.6246795654297, "rewards/margins": 7.580744743347168, "rewards/rejected": -137.20541381835938, "step": 34190 }, { "epoch": 1.98, "grad_norm": 0.0, "learning_rate": 0.00034111227214675486, "logits/chosen": -17.29819107055664, "logits/rejected": -17.182296752929688, "logps/chosen": -2621.198486328125, "logps/rejected": -2583.56005859375, "loss": 5.0717, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -161.90365600585938, "rewards/margins": 9.855375289916992, "rewards/rejected": -171.75900268554688, "step": 34200 }, { "epoch": 1.98, "grad_norm": 0.032583266496658325, "learning_rate": 0.00034091876620612253, "logits/chosen": -15.738397598266602, "logits/rejected": -16.105243682861328, "logps/chosen": -2186.38623046875, "logps/rejected": -2391.52001953125, "loss": 10.4955, "rewards/accuracies": 0.5, "rewards/chosen": -184.36904907226562, "rewards/margins": -3.0274200439453125, "rewards/rejected": -181.34164428710938, "step": 34210 }, { "epoch": 1.98, "grad_norm": 80.83981323242188, "learning_rate": 0.00034072526026549015, "logits/chosen": -14.06396770477295, "logits/rejected": -14.19385051727295, "logps/chosen": -2550.388916015625, "logps/rejected": -2334.451171875, "loss": 2.6139, "rewards/accuracies": 0.5, "rewards/chosen": -159.94383239746094, "rewards/margins": 12.372522354125977, "rewards/rejected": -172.31637573242188, "step": 34220 }, { "epoch": 1.98, "grad_norm": 9.389842681173377e-09, "learning_rate": 0.00034053175432485777, "logits/chosen": -13.922419548034668, "logits/rejected": -13.871681213378906, "logps/chosen": -2953.937744140625, "logps/rejected": -2741.509033203125, "loss": 6.7884, "rewards/accuracies": 0.5, "rewards/chosen": -190.08995056152344, "rewards/margins": 0.366189569234848, "rewards/rejected": -190.45616149902344, "step": 34230 }, { "epoch": 1.98, "grad_norm": 0.0, "learning_rate": 0.0003403382483842254, "logits/chosen": -13.219393730163574, "logits/rejected": -12.981335639953613, "logps/chosen": -2739.98046875, "logps/rejected": -2678.698486328125, "loss": 10.7175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -165.76046752929688, "rewards/margins": -1.421038031578064, "rewards/rejected": -164.33944702148438, "step": 34240 }, { "epoch": 1.98, "grad_norm": 77.84085845947266, "learning_rate": 0.000340144742443593, "logits/chosen": -14.054362297058105, "logits/rejected": -13.682167053222656, "logps/chosen": -2517.505859375, "logps/rejected": -2602.457763671875, "loss": 1.9642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.42816162109375, "rewards/margins": 11.611227989196777, "rewards/rejected": -193.03939819335938, "step": 34250 }, { "epoch": 1.98, "grad_norm": 50.91569519042969, "learning_rate": 0.0003399512365029607, "logits/chosen": -11.483469009399414, "logits/rejected": -11.599844932556152, "logps/chosen": -2724.613037109375, "logps/rejected": -2433.05615234375, "loss": 15.5534, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -80.44911193847656, "rewards/margins": 0.28681182861328125, "rewards/rejected": -80.73591613769531, "step": 34260 }, { "epoch": 1.98, "grad_norm": 5.201169187785126e-06, "learning_rate": 0.0003397577305623283, "logits/chosen": -13.053853988647461, "logits/rejected": -12.792499542236328, "logps/chosen": -2553.939208984375, "logps/rejected": -2558.07666015625, "loss": 2.8827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -178.12533569335938, "rewards/margins": 13.838851928710938, "rewards/rejected": -191.9641571044922, "step": 34270 }, { "epoch": 1.98, "grad_norm": 3.7477877867786447e-07, "learning_rate": 0.0003395642246216959, "logits/chosen": -13.470858573913574, "logits/rejected": -13.180425643920898, "logps/chosen": -2818.897216796875, "logps/rejected": -2673.3759765625, "loss": 0.6187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.43869018554688, "rewards/margins": 5.358273506164551, "rewards/rejected": -185.79698181152344, "step": 34280 }, { "epoch": 1.98, "grad_norm": 0.0, "learning_rate": 0.0003393707186810635, "logits/chosen": -15.103068351745605, "logits/rejected": -14.82903003692627, "logps/chosen": -2733.56787109375, "logps/rejected": -2514.62353515625, "loss": 6.814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.04156494140625, "rewards/margins": 24.57972526550293, "rewards/rejected": -191.62130737304688, "step": 34290 }, { "epoch": 1.99, "grad_norm": 0.01740843430161476, "learning_rate": 0.00033917721274043114, "logits/chosen": -13.232884407043457, "logits/rejected": -13.02204418182373, "logps/chosen": -2414.5126953125, "logps/rejected": -2437.775390625, "loss": 4.999, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -181.59701538085938, "rewards/margins": -2.6949431896209717, "rewards/rejected": -178.90206909179688, "step": 34300 }, { "epoch": 1.99, "grad_norm": 2.659296741147621e-11, "learning_rate": 0.00033898370679979876, "logits/chosen": -12.994616508483887, "logits/rejected": -13.18712043762207, "logps/chosen": -2722.377685546875, "logps/rejected": -2617.65478515625, "loss": 7.8398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.33317565917969, "rewards/margins": 3.3934807777404785, "rewards/rejected": -124.7266616821289, "step": 34310 }, { "epoch": 1.99, "grad_norm": 2.5089543669309933e-06, "learning_rate": 0.0003387902008591664, "logits/chosen": -14.020364761352539, "logits/rejected": -13.960615158081055, "logps/chosen": -2872.850830078125, "logps/rejected": -2723.31884765625, "loss": 1.4431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.46035766601562, "rewards/margins": 9.149063110351562, "rewards/rejected": -153.6094207763672, "step": 34320 }, { "epoch": 1.99, "grad_norm": 5.010852288656054e-10, "learning_rate": 0.000338596694918534, "logits/chosen": -14.672994613647461, "logits/rejected": -14.272125244140625, "logps/chosen": -2718.71630859375, "logps/rejected": -2842.543701171875, "loss": 1.9554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.0587615966797, "rewards/margins": 22.676406860351562, "rewards/rejected": -220.7351531982422, "step": 34330 }, { "epoch": 1.99, "grad_norm": 118.16680908203125, "learning_rate": 0.0003384031889779016, "logits/chosen": -15.606819152832031, "logits/rejected": -15.7393217086792, "logps/chosen": -2568.2255859375, "logps/rejected": -2348.66455078125, "loss": 6.2321, "rewards/accuracies": 0.5, "rewards/chosen": -187.78414916992188, "rewards/margins": -0.011417388916015625, "rewards/rejected": -187.77273559570312, "step": 34340 }, { "epoch": 1.99, "grad_norm": 0.8183819651603699, "learning_rate": 0.00033820968303726923, "logits/chosen": -17.08419418334961, "logits/rejected": -16.80422592163086, "logps/chosen": -2567.4375, "logps/rejected": -2571.104736328125, "loss": 3.5697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -230.45004272460938, "rewards/margins": 5.232826232910156, "rewards/rejected": -235.682861328125, "step": 34350 }, { "epoch": 1.99, "grad_norm": 0.0012438305420801044, "learning_rate": 0.0003380161770966369, "logits/chosen": -9.887472152709961, "logits/rejected": -9.753719329833984, "logps/chosen": -3201.71533203125, "logps/rejected": -3148.277587890625, "loss": 6.8951, "rewards/accuracies": 0.5, "rewards/chosen": -67.7349624633789, "rewards/margins": 0.6392198801040649, "rewards/rejected": -68.37417602539062, "step": 34360 }, { "epoch": 1.99, "grad_norm": 0.014287157915532589, "learning_rate": 0.0003378226711560045, "logits/chosen": -11.532142639160156, "logits/rejected": -11.864975929260254, "logps/chosen": -2278.23291015625, "logps/rejected": -2419.022216796875, "loss": 7.1295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.4947509765625, "rewards/margins": -3.3827736377716064, "rewards/rejected": -139.11196899414062, "step": 34370 }, { "epoch": 1.99, "grad_norm": 0.04455531761050224, "learning_rate": 0.00033762916521537213, "logits/chosen": -9.399016380310059, "logits/rejected": -9.14314079284668, "logps/chosen": -3133.6328125, "logps/rejected": -3127.88525390625, "loss": 3.3656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -101.02684020996094, "rewards/margins": 3.1210060119628906, "rewards/rejected": -104.1478500366211, "step": 34380 }, { "epoch": 1.99, "grad_norm": 83.94425964355469, "learning_rate": 0.00033743565927473975, "logits/chosen": -12.699797630310059, "logits/rejected": -12.6386137008667, "logps/chosen": -2907.670166015625, "logps/rejected": -2966.717529296875, "loss": 4.3711, "rewards/accuracies": 0.5, "rewards/chosen": -131.08029174804688, "rewards/margins": 1.0226398706436157, "rewards/rejected": -132.10293579101562, "step": 34390 }, { "epoch": 1.99, "grad_norm": 1.823034608605667e-06, "learning_rate": 0.00033724215333410737, "logits/chosen": -12.602534294128418, "logits/rejected": -12.055883407592773, "logps/chosen": -2799.934814453125, "logps/rejected": -2598.287353515625, "loss": 4.2114, "rewards/accuracies": 0.5, "rewards/chosen": -170.357666015625, "rewards/margins": 5.385685920715332, "rewards/rejected": -175.7433624267578, "step": 34400 }, { "epoch": 1.99, "grad_norm": 1.8114113807678223, "learning_rate": 0.000337048647393475, "logits/chosen": -11.673114776611328, "logits/rejected": -11.825563430786133, "logps/chosen": -3099.1455078125, "logps/rejected": -2704.15283203125, "loss": 5.227, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.42294311523438, "rewards/margins": 2.6559484004974365, "rewards/rejected": -169.07888793945312, "step": 34410 }, { "epoch": 1.99, "grad_norm": 132.97227478027344, "learning_rate": 0.0003368551414528426, "logits/chosen": -12.923395156860352, "logits/rejected": -12.553521156311035, "logps/chosen": -2980.44873046875, "logps/rejected": -2770.89306640625, "loss": 6.4779, "rewards/accuracies": 0.5, "rewards/chosen": -123.4417724609375, "rewards/margins": -2.0683753490448, "rewards/rejected": -121.37339782714844, "step": 34420 }, { "epoch": 1.99, "grad_norm": 5.8950732295670605e-08, "learning_rate": 0.0003366616355122102, "logits/chosen": -15.774665832519531, "logits/rejected": -15.554571151733398, "logps/chosen": -2587.32177734375, "logps/rejected": -2318.00341796875, "loss": 2.2359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.30885314941406, "rewards/margins": 8.982734680175781, "rewards/rejected": -199.29159545898438, "step": 34430 }, { "epoch": 1.99, "grad_norm": 0.0010857456363737583, "learning_rate": 0.00033646812957157784, "logits/chosen": -14.222882270812988, "logits/rejected": -14.158203125, "logps/chosen": -2315.55859375, "logps/rejected": -2499.973388671875, "loss": 4.2104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -133.6881103515625, "rewards/margins": 5.376225471496582, "rewards/rejected": -139.06434631347656, "step": 34440 }, { "epoch": 1.99, "grad_norm": 1.42013015930198e-11, "learning_rate": 0.00033627462363094545, "logits/chosen": -13.989456176757812, "logits/rejected": -13.779284477233887, "logps/chosen": -2653.791259765625, "logps/rejected": -2463.55859375, "loss": 1.5047, "rewards/accuracies": 0.5, "rewards/chosen": -161.62449645996094, "rewards/margins": 9.874590873718262, "rewards/rejected": -171.49908447265625, "step": 34450 }, { "epoch": 1.99, "grad_norm": 0.010406571440398693, "learning_rate": 0.00033608111769031307, "logits/chosen": -14.844507217407227, "logits/rejected": -14.477457046508789, "logps/chosen": -2391.52783203125, "logps/rejected": -2496.002685546875, "loss": 3.0062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.04991149902344, "rewards/margins": 6.5298004150390625, "rewards/rejected": -196.5797119140625, "step": 34460 }, { "epoch": 2.0, "grad_norm": 2.114567548838764e-11, "learning_rate": 0.00033588761174968074, "logits/chosen": -12.282217025756836, "logits/rejected": -12.540390968322754, "logps/chosen": -3196.916748046875, "logps/rejected": -2896.98583984375, "loss": 3.3168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.3281707763672, "rewards/margins": 4.647462368011475, "rewards/rejected": -211.9756622314453, "step": 34470 }, { "epoch": 2.0, "grad_norm": 0.026724757626652718, "learning_rate": 0.00033569410580904836, "logits/chosen": -14.172019958496094, "logits/rejected": -14.034228324890137, "logps/chosen": -2843.350341796875, "logps/rejected": -2633.260009765625, "loss": 5.3564, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -224.58975219726562, "rewards/margins": -1.9838908910751343, "rewards/rejected": -222.6058349609375, "step": 34480 }, { "epoch": 2.0, "grad_norm": 31.56658363342285, "learning_rate": 0.000335500599868416, "logits/chosen": -12.675592422485352, "logits/rejected": -12.401052474975586, "logps/chosen": -3147.413818359375, "logps/rejected": -2862.296630859375, "loss": 1.7667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.90625, "rewards/margins": 7.868828773498535, "rewards/rejected": -171.77508544921875, "step": 34490 }, { "epoch": 2.0, "grad_norm": 104.2686767578125, "learning_rate": 0.0003353070939277836, "logits/chosen": -13.686668395996094, "logits/rejected": -13.518475532531738, "logps/chosen": -2478.361572265625, "logps/rejected": -2222.242919921875, "loss": 25.4173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -200.088134765625, "rewards/margins": -17.291950225830078, "rewards/rejected": -182.79617309570312, "step": 34500 }, { "epoch": 2.0, "grad_norm": 2.9019809558855636e-12, "learning_rate": 0.0003351135879871512, "logits/chosen": -12.808778762817383, "logits/rejected": -12.687556266784668, "logps/chosen": -2968.32275390625, "logps/rejected": -2497.39208984375, "loss": 1.1933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.4923553466797, "rewards/margins": 10.205497741699219, "rewards/rejected": -162.69784545898438, "step": 34510 }, { "epoch": 2.0, "grad_norm": 40.3929443359375, "learning_rate": 0.0003349200820465189, "logits/chosen": -12.294920921325684, "logits/rejected": -12.295226097106934, "logps/chosen": -2789.8359375, "logps/rejected": -2864.811279296875, "loss": 10.334, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -107.3267822265625, "rewards/margins": -2.2938601970672607, "rewards/rejected": -105.0329360961914, "step": 34520 }, { "epoch": 2.0, "grad_norm": 2.330857462595759e-08, "learning_rate": 0.00033472657610588645, "logits/chosen": -12.053472518920898, "logits/rejected": -12.201089859008789, "logps/chosen": -2948.370849609375, "logps/rejected": -2559.693115234375, "loss": 30.8554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -186.70025634765625, "rewards/margins": -24.95772361755371, "rewards/rejected": -161.74253845214844, "step": 34530 }, { "epoch": 2.0, "grad_norm": 53.354896545410156, "learning_rate": 0.00033453307016525406, "logits/chosen": -11.971240997314453, "logits/rejected": -11.900647163391113, "logps/chosen": -2906.14111328125, "logps/rejected": -2596.93115234375, "loss": 22.3773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -178.15371704101562, "rewards/margins": -14.731386184692383, "rewards/rejected": -163.42234802246094, "step": 34540 }, { "epoch": 2.0, "grad_norm": 63.722496032714844, "learning_rate": 0.0003343395642246217, "logits/chosen": -14.456823348999023, "logits/rejected": -14.432149887084961, "logps/chosen": -2065.69384765625, "logps/rejected": -2014.971435546875, "loss": 5.1035, "rewards/accuracies": 0.5, "rewards/chosen": -177.1097869873047, "rewards/margins": 1.2416728734970093, "rewards/rejected": -178.35147094726562, "step": 34550 }, { "epoch": 2.0, "grad_norm": 0.5408036112785339, "learning_rate": 0.0003341460582839893, "logits/chosen": -14.898588180541992, "logits/rejected": -14.120012283325195, "logps/chosen": -2740.069091796875, "logps/rejected": -2962.916748046875, "loss": 1.6863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.54165649414062, "rewards/margins": 5.645431995391846, "rewards/rejected": -202.18710327148438, "step": 34560 }, { "epoch": 2.0, "grad_norm": 0.9615605473518372, "learning_rate": 0.00033395255234335697, "logits/chosen": -13.078897476196289, "logits/rejected": -12.920247077941895, "logps/chosen": -2762.32861328125, "logps/rejected": -2925.222900390625, "loss": 2.5098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.08956909179688, "rewards/margins": 13.319448471069336, "rewards/rejected": -183.40902709960938, "step": 34570 }, { "epoch": 2.0, "grad_norm": 48.048641204833984, "learning_rate": 0.0003337590464027246, "logits/chosen": -12.196235656738281, "logits/rejected": -12.161789894104004, "logps/chosen": -3033.078857421875, "logps/rejected": -2971.09423828125, "loss": 4.7405, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.65377807617188, "rewards/margins": -0.7180797457695007, "rewards/rejected": -156.93569946289062, "step": 34580 }, { "epoch": 2.0, "grad_norm": 12.821822166442871, "learning_rate": 0.0003335655404620922, "logits/chosen": -12.71359634399414, "logits/rejected": -12.55705738067627, "logps/chosen": -2778.783935546875, "logps/rejected": -2738.323486328125, "loss": 4.2718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.9844207763672, "rewards/margins": 4.57125997543335, "rewards/rejected": -202.55567932128906, "step": 34590 }, { "epoch": 2.0, "grad_norm": 1.9690394401550293, "learning_rate": 0.0003333720345214598, "logits/chosen": -15.784289360046387, "logits/rejected": -15.696525573730469, "logps/chosen": -2615.573974609375, "logps/rejected": -2733.78173828125, "loss": 0.2079, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.5784912109375, "rewards/margins": 9.399877548217773, "rewards/rejected": -176.97837829589844, "step": 34600 }, { "epoch": 2.0, "grad_norm": 0.49142512679100037, "learning_rate": 0.00033317852858082744, "logits/chosen": -13.814300537109375, "logits/rejected": -13.750164985656738, "logps/chosen": -2721.72509765625, "logps/rejected": -2982.548095703125, "loss": 4.4598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -157.11253356933594, "rewards/margins": 4.609157562255859, "rewards/rejected": -161.72171020507812, "step": 34610 }, { "epoch": 2.0, "grad_norm": 0.20414341986179352, "learning_rate": 0.00033298502264019505, "logits/chosen": -11.956453323364258, "logits/rejected": -11.979862213134766, "logps/chosen": -3098.1884765625, "logps/rejected": -2769.928955078125, "loss": 0.437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -125.40611267089844, "rewards/margins": 6.11127233505249, "rewards/rejected": -131.5173797607422, "step": 34620 }, { "epoch": 2.0, "grad_norm": 0.006324069108814001, "learning_rate": 0.0003327915166995627, "logits/chosen": -15.488986015319824, "logits/rejected": -15.477185249328613, "logps/chosen": -2899.884765625, "logps/rejected": -2711.30712890625, "loss": 7.7692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.38925170898438, "rewards/margins": -1.9192594289779663, "rewards/rejected": -187.4700164794922, "step": 34630 }, { "epoch": 2.01, "grad_norm": 56.94783020019531, "learning_rate": 0.0003325980107589303, "logits/chosen": -13.138163566589355, "logits/rejected": -12.807726860046387, "logps/chosen": -2831.2294921875, "logps/rejected": -2930.502685546875, "loss": 3.3525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.48587036132812, "rewards/margins": 4.705397605895996, "rewards/rejected": -153.19125366210938, "step": 34640 }, { "epoch": 2.01, "grad_norm": 34.052635192871094, "learning_rate": 0.0003324045048182979, "logits/chosen": -17.001399993896484, "logits/rejected": -16.60581398010254, "logps/chosen": -3119.2724609375, "logps/rejected": -2835.65478515625, "loss": 3.8025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.70162963867188, "rewards/margins": 3.7755191326141357, "rewards/rejected": -206.47714233398438, "step": 34650 }, { "epoch": 2.01, "grad_norm": 60.151851654052734, "learning_rate": 0.0003322109988776655, "logits/chosen": -13.921525955200195, "logits/rejected": -13.729803085327148, "logps/chosen": -3011.0576171875, "logps/rejected": -2952.42236328125, "loss": 5.6311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.7008056640625, "rewards/margins": 2.020615816116333, "rewards/rejected": -186.72140502929688, "step": 34660 }, { "epoch": 2.01, "grad_norm": 0.12227636575698853, "learning_rate": 0.00033201749293703314, "logits/chosen": -11.754727363586426, "logits/rejected": -11.805755615234375, "logps/chosen": -3135.951416015625, "logps/rejected": -2511.107421875, "loss": 3.4965, "rewards/accuracies": 0.5, "rewards/chosen": -101.0291976928711, "rewards/margins": 5.140071868896484, "rewards/rejected": -106.1692886352539, "step": 34670 }, { "epoch": 2.01, "grad_norm": 287.8588562011719, "learning_rate": 0.0003318239869964008, "logits/chosen": -14.353401184082031, "logits/rejected": -14.40904426574707, "logps/chosen": -2758.44775390625, "logps/rejected": -2198.230224609375, "loss": 2.1046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -140.84780883789062, "rewards/margins": 13.400596618652344, "rewards/rejected": -154.24842834472656, "step": 34680 }, { "epoch": 2.01, "grad_norm": 0.00030479428824037313, "learning_rate": 0.00033163048105576843, "logits/chosen": -13.558321952819824, "logits/rejected": -13.368871688842773, "logps/chosen": -3008.817626953125, "logps/rejected": -2676.266357421875, "loss": 2.1849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -155.19871520996094, "rewards/margins": 9.74985122680664, "rewards/rejected": -164.94857788085938, "step": 34690 }, { "epoch": 2.01, "grad_norm": 40.17913818359375, "learning_rate": 0.00033143697511513605, "logits/chosen": -15.414471626281738, "logits/rejected": -15.256248474121094, "logps/chosen": -2653.88818359375, "logps/rejected": -2741.836181640625, "loss": 1.1934, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.4383316040039, "rewards/margins": 20.51833152770996, "rewards/rejected": -136.9566650390625, "step": 34700 }, { "epoch": 2.01, "grad_norm": 172.4883575439453, "learning_rate": 0.00033124346917450366, "logits/chosen": -11.839027404785156, "logits/rejected": -12.057339668273926, "logps/chosen": -3024.9521484375, "logps/rejected": -2994.83642578125, "loss": 7.2662, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -104.26277923583984, "rewards/margins": -0.1946302354335785, "rewards/rejected": -104.06815338134766, "step": 34710 }, { "epoch": 2.01, "grad_norm": 434.5732727050781, "learning_rate": 0.0003310499632338713, "logits/chosen": -14.70384407043457, "logits/rejected": -15.09705638885498, "logps/chosen": -2671.728271484375, "logps/rejected": -2953.739013671875, "loss": 9.3289, "rewards/accuracies": 0.5, "rewards/chosen": -173.93408203125, "rewards/margins": -3.2658638954162598, "rewards/rejected": -170.668212890625, "step": 34720 }, { "epoch": 2.01, "grad_norm": 0.01512078009545803, "learning_rate": 0.00033085645729323895, "logits/chosen": -15.31355094909668, "logits/rejected": -15.093902587890625, "logps/chosen": -3130.20068359375, "logps/rejected": -3060.071044921875, "loss": 9.6887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -275.2959899902344, "rewards/margins": -3.3226516246795654, "rewards/rejected": -271.97332763671875, "step": 34730 }, { "epoch": 2.01, "grad_norm": 0.0016159720253199339, "learning_rate": 0.00033066295135260657, "logits/chosen": -13.425865173339844, "logits/rejected": -12.68917179107666, "logps/chosen": -2963.48974609375, "logps/rejected": -2950.2490234375, "loss": 0.9533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -151.48428344726562, "rewards/margins": 16.273523330688477, "rewards/rejected": -167.75782775878906, "step": 34740 }, { "epoch": 2.01, "grad_norm": 1.7978893041004085e-08, "learning_rate": 0.00033046944541197413, "logits/chosen": -13.26745319366455, "logits/rejected": -13.201263427734375, "logps/chosen": -2809.79833984375, "logps/rejected": -2497.65283203125, "loss": 3.4552, "rewards/accuracies": 0.5, "rewards/chosen": -176.162109375, "rewards/margins": 5.5676774978637695, "rewards/rejected": -181.72979736328125, "step": 34750 }, { "epoch": 2.01, "grad_norm": 3.7822752574356855e-07, "learning_rate": 0.00033027593947134175, "logits/chosen": -13.6627779006958, "logits/rejected": -14.205636978149414, "logps/chosen": -3238.15673828125, "logps/rejected": -3382.567626953125, "loss": 1.2299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -152.33892822265625, "rewards/margins": 10.013222694396973, "rewards/rejected": -162.35214233398438, "step": 34760 }, { "epoch": 2.01, "grad_norm": 82.7120590209961, "learning_rate": 0.00033008243353070937, "logits/chosen": -15.023431777954102, "logits/rejected": -15.142745971679688, "logps/chosen": -2946.966796875, "logps/rejected": -2699.640380859375, "loss": 4.592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.7467498779297, "rewards/margins": 1.1478283405303955, "rewards/rejected": -158.8945770263672, "step": 34770 }, { "epoch": 2.01, "grad_norm": 598.294189453125, "learning_rate": 0.00032988892759007704, "logits/chosen": -12.998319625854492, "logits/rejected": -12.778421401977539, "logps/chosen": -3151.76025390625, "logps/rejected": -2995.21728515625, "loss": 6.7753, "rewards/accuracies": 0.5, "rewards/chosen": -141.95811462402344, "rewards/margins": 2.360328197479248, "rewards/rejected": -144.31845092773438, "step": 34780 }, { "epoch": 2.01, "grad_norm": 60.70674514770508, "learning_rate": 0.00032969542164944466, "logits/chosen": -14.237693786621094, "logits/rejected": -14.112037658691406, "logps/chosen": -3059.316162109375, "logps/rejected": -2528.85791015625, "loss": 4.5324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -200.11953735351562, "rewards/margins": 8.719161987304688, "rewards/rejected": -208.83871459960938, "step": 34790 }, { "epoch": 2.01, "grad_norm": 0.5361911058425903, "learning_rate": 0.00032950191570881227, "logits/chosen": -14.238566398620605, "logits/rejected": -14.19025707244873, "logps/chosen": -2881.7138671875, "logps/rejected": -2641.353759765625, "loss": 4.7162, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -211.56494140625, "rewards/margins": 1.675360918045044, "rewards/rejected": -213.2403106689453, "step": 34800 }, { "epoch": 2.01, "grad_norm": 2.1827538909395313e-22, "learning_rate": 0.0003293084097681799, "logits/chosen": -14.008036613464355, "logits/rejected": -14.071202278137207, "logps/chosen": -2217.284423828125, "logps/rejected": -2088.80224609375, "loss": 12.025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -140.69473266601562, "rewards/margins": 4.021664619445801, "rewards/rejected": -144.71640014648438, "step": 34810 }, { "epoch": 2.02, "grad_norm": 7.941699520230827e-10, "learning_rate": 0.0003291149038275475, "logits/chosen": -11.533720016479492, "logits/rejected": -11.496259689331055, "logps/chosen": -3252.502685546875, "logps/rejected": -2568.30615234375, "loss": 8.4986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -88.56497192382812, "rewards/margins": 2.7739009857177734, "rewards/rejected": -91.3388671875, "step": 34820 }, { "epoch": 2.02, "grad_norm": 112.1622543334961, "learning_rate": 0.0003289213978869151, "logits/chosen": -14.785802841186523, "logits/rejected": -14.874603271484375, "logps/chosen": -2335.71533203125, "logps/rejected": -2369.876953125, "loss": 6.2446, "rewards/accuracies": 0.5, "rewards/chosen": -165.6378936767578, "rewards/margins": -2.0653634071350098, "rewards/rejected": -163.57254028320312, "step": 34830 }, { "epoch": 2.02, "grad_norm": 0.09453068673610687, "learning_rate": 0.0003287278919462828, "logits/chosen": -13.478521347045898, "logits/rejected": -13.175271987915039, "logps/chosen": -2817.106689453125, "logps/rejected": -2545.148193359375, "loss": 15.3719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -139.7706298828125, "rewards/margins": -7.205206871032715, "rewards/rejected": -132.56541442871094, "step": 34840 }, { "epoch": 2.02, "grad_norm": 46.68729782104492, "learning_rate": 0.0003285343860056504, "logits/chosen": -14.535562515258789, "logits/rejected": -14.60645866394043, "logps/chosen": -2707.170654296875, "logps/rejected": -2473.29345703125, "loss": 1.4864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.68582153320312, "rewards/margins": 10.170797348022461, "rewards/rejected": -140.85662841796875, "step": 34850 }, { "epoch": 2.02, "grad_norm": 0.0003900830924976617, "learning_rate": 0.000328340880065018, "logits/chosen": -17.017467498779297, "logits/rejected": -18.141008377075195, "logps/chosen": -2806.360595703125, "logps/rejected": -2692.60009765625, "loss": 7.1764, "rewards/accuracies": 0.5, "rewards/chosen": -186.37457275390625, "rewards/margins": -1.582584023475647, "rewards/rejected": -184.79200744628906, "step": 34860 }, { "epoch": 2.02, "grad_norm": 3.066380023956299, "learning_rate": 0.0003281473741243856, "logits/chosen": -14.542745590209961, "logits/rejected": -15.034469604492188, "logps/chosen": -2855.61181640625, "logps/rejected": -2842.49755859375, "loss": 2.3438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.17161560058594, "rewards/margins": 11.448091506958008, "rewards/rejected": -199.61972045898438, "step": 34870 }, { "epoch": 2.02, "grad_norm": 54.60124588012695, "learning_rate": 0.0003279538681837532, "logits/chosen": -16.678184509277344, "logits/rejected": -16.74837303161621, "logps/chosen": -2880.32470703125, "logps/rejected": -2950.090576171875, "loss": 2.0513, "rewards/accuracies": 0.5, "rewards/chosen": -198.23782348632812, "rewards/margins": 3.0236098766326904, "rewards/rejected": -201.26141357421875, "step": 34880 }, { "epoch": 2.02, "grad_norm": 70.39887237548828, "learning_rate": 0.0003277603622431209, "logits/chosen": -17.85176658630371, "logits/rejected": -17.466588973999023, "logps/chosen": -2676.762451171875, "logps/rejected": -2624.565673828125, "loss": 1.9545, "rewards/accuracies": 0.5, "rewards/chosen": -199.86062622070312, "rewards/margins": 6.106079578399658, "rewards/rejected": -205.9667205810547, "step": 34890 }, { "epoch": 2.02, "grad_norm": 4.85561979512994e-10, "learning_rate": 0.0003275668563024885, "logits/chosen": -12.495260238647461, "logits/rejected": -12.152372360229492, "logps/chosen": -2581.6416015625, "logps/rejected": -2364.9150390625, "loss": 5.5428, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -154.48196411132812, "rewards/margins": 4.743752956390381, "rewards/rejected": -159.2257080078125, "step": 34900 }, { "epoch": 2.02, "grad_norm": 42.254390716552734, "learning_rate": 0.0003273733503618561, "logits/chosen": -14.686330795288086, "logits/rejected": -14.434158325195312, "logps/chosen": -2860.595947265625, "logps/rejected": -2842.807861328125, "loss": 5.3163, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -208.7045135498047, "rewards/margins": 4.351274013519287, "rewards/rejected": -213.0557861328125, "step": 34910 }, { "epoch": 2.02, "grad_norm": 1.625182558245218e-19, "learning_rate": 0.00032717984442122373, "logits/chosen": -15.400949478149414, "logits/rejected": -14.926277160644531, "logps/chosen": -2919.940185546875, "logps/rejected": -2665.04736328125, "loss": 1.3202, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -143.7906036376953, "rewards/margins": 12.602499008178711, "rewards/rejected": -156.39309692382812, "step": 34920 }, { "epoch": 2.02, "grad_norm": 40.56005096435547, "learning_rate": 0.00032698633848059135, "logits/chosen": -13.428915023803711, "logits/rejected": -13.209871292114258, "logps/chosen": -2871.25439453125, "logps/rejected": -2190.048095703125, "loss": 2.0457, "rewards/accuracies": 0.5, "rewards/chosen": -138.53915405273438, "rewards/margins": 10.818563461303711, "rewards/rejected": -149.35772705078125, "step": 34930 }, { "epoch": 2.02, "grad_norm": 6.461219163611531e-05, "learning_rate": 0.000326792832539959, "logits/chosen": -17.292016983032227, "logits/rejected": -16.987506866455078, "logps/chosen": -3064.72021484375, "logps/rejected": -3040.24072265625, "loss": 0.8222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.2674102783203, "rewards/margins": 12.84278392791748, "rewards/rejected": -217.1101837158203, "step": 34940 }, { "epoch": 2.02, "grad_norm": 120.09697723388672, "learning_rate": 0.00032659932659932664, "logits/chosen": -15.525209426879883, "logits/rejected": -15.784111022949219, "logps/chosen": -2719.5029296875, "logps/rejected": -2416.411865234375, "loss": 18.5947, "rewards/accuracies": 0.5, "rewards/chosen": -138.47470092773438, "rewards/margins": -10.896936416625977, "rewards/rejected": -127.5777587890625, "step": 34950 }, { "epoch": 2.02, "grad_norm": 8.76336018183245e-14, "learning_rate": 0.00032640582065869426, "logits/chosen": -16.0714168548584, "logits/rejected": -15.762079238891602, "logps/chosen": -2819.65283203125, "logps/rejected": -2145.152587890625, "loss": 1.5704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.87533569335938, "rewards/margins": 15.716379165649414, "rewards/rejected": -169.5917205810547, "step": 34960 }, { "epoch": 2.02, "grad_norm": 1.5103847545105964e-05, "learning_rate": 0.0003262123147180618, "logits/chosen": -16.914609909057617, "logits/rejected": -16.153696060180664, "logps/chosen": -2716.63671875, "logps/rejected": -2747.198486328125, "loss": 0.2958, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -191.37408447265625, "rewards/margins": 14.54023551940918, "rewards/rejected": -205.914306640625, "step": 34970 }, { "epoch": 2.02, "grad_norm": 0.012894890271127224, "learning_rate": 0.00032601880877742944, "logits/chosen": -15.540771484375, "logits/rejected": -15.309514045715332, "logps/chosen": -3105.036376953125, "logps/rejected": -2938.91943359375, "loss": 15.7218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.40203857421875, "rewards/margins": -4.867678642272949, "rewards/rejected": -150.5343475341797, "step": 34980 }, { "epoch": 2.03, "grad_norm": 90.29098510742188, "learning_rate": 0.0003258253028367971, "logits/chosen": -21.231115341186523, "logits/rejected": -21.53176498413086, "logps/chosen": -2765.67236328125, "logps/rejected": -2879.39013671875, "loss": 2.1872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -252.6537628173828, "rewards/margins": 12.865527153015137, "rewards/rejected": -265.5193176269531, "step": 34990 }, { "epoch": 2.03, "grad_norm": 8.168656495399773e-05, "learning_rate": 0.0003256317968961647, "logits/chosen": -17.63088607788086, "logits/rejected": -18.27442741394043, "logps/chosen": -2689.8310546875, "logps/rejected": -2703.66943359375, "loss": 1.5689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.69613647460938, "rewards/margins": 12.380632400512695, "rewards/rejected": -209.0767822265625, "step": 35000 }, { "epoch": 2.03, "grad_norm": 203.77210998535156, "learning_rate": 0.00032543829095553234, "logits/chosen": -16.699308395385742, "logits/rejected": -16.435998916625977, "logps/chosen": -2496.34912109375, "logps/rejected": -2529.677001953125, "loss": 2.3065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.51002502441406, "rewards/margins": 7.636923313140869, "rewards/rejected": -189.14694213867188, "step": 35010 }, { "epoch": 2.03, "grad_norm": 0.0058541856706142426, "learning_rate": 0.00032524478501489996, "logits/chosen": -17.95718002319336, "logits/rejected": -17.91294288635254, "logps/chosen": -2601.357666015625, "logps/rejected": -2568.81201171875, "loss": 1.1109, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -137.08895874023438, "rewards/margins": 10.779141426086426, "rewards/rejected": -147.8680877685547, "step": 35020 }, { "epoch": 2.03, "grad_norm": 9.889959868514763e-16, "learning_rate": 0.0003250512790742676, "logits/chosen": -14.6534423828125, "logits/rejected": -14.406254768371582, "logps/chosen": -3072.336181640625, "logps/rejected": -2882.95263671875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -163.2185516357422, "rewards/margins": 17.523151397705078, "rewards/rejected": -180.74169921875, "step": 35030 }, { "epoch": 2.03, "grad_norm": 58.35735321044922, "learning_rate": 0.00032485777313363525, "logits/chosen": -17.696430206298828, "logits/rejected": -17.728866577148438, "logps/chosen": -2872.12158203125, "logps/rejected": -2596.862060546875, "loss": 1.9775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -173.28512573242188, "rewards/margins": 14.474653244018555, "rewards/rejected": -187.75979614257812, "step": 35040 }, { "epoch": 2.03, "grad_norm": 0.0007022778736427426, "learning_rate": 0.00032466426719300286, "logits/chosen": -16.442995071411133, "logits/rejected": -16.143224716186523, "logps/chosen": -2354.778076171875, "logps/rejected": -2431.4208984375, "loss": 8.7315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.54661560058594, "rewards/margins": 0.7177085876464844, "rewards/rejected": -139.2643280029297, "step": 35050 }, { "epoch": 2.03, "grad_norm": 0.0, "learning_rate": 0.0003244707612523705, "logits/chosen": -20.24385643005371, "logits/rejected": -19.736591339111328, "logps/chosen": -2774.75927734375, "logps/rejected": -2701.06689453125, "loss": 3.6051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.1243438720703, "rewards/margins": 11.729513168334961, "rewards/rejected": -218.85385131835938, "step": 35060 }, { "epoch": 2.03, "grad_norm": 0.07778970152139664, "learning_rate": 0.0003242772553117381, "logits/chosen": -17.001117706298828, "logits/rejected": -16.90520477294922, "logps/chosen": -2380.27880859375, "logps/rejected": -2602.7109375, "loss": 5.5662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.28562927246094, "rewards/margins": 3.3693385124206543, "rewards/rejected": -192.65499877929688, "step": 35070 }, { "epoch": 2.03, "grad_norm": 1.7263230085372925, "learning_rate": 0.00032408374937110566, "logits/chosen": -13.766874313354492, "logits/rejected": -13.816510200500488, "logps/chosen": -2674.56689453125, "logps/rejected": -2548.17333984375, "loss": 10.6777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.07424926757812, "rewards/margins": -2.310182571411133, "rewards/rejected": -156.76406860351562, "step": 35080 }, { "epoch": 2.03, "grad_norm": 26.349464416503906, "learning_rate": 0.0003238902434304733, "logits/chosen": -19.371883392333984, "logits/rejected": -19.290008544921875, "logps/chosen": -2814.658203125, "logps/rejected": -2847.01904296875, "loss": 1.894, "rewards/accuracies": 0.5, "rewards/chosen": -223.145263671875, "rewards/margins": 6.274779319763184, "rewards/rejected": -229.42007446289062, "step": 35090 }, { "epoch": 2.03, "grad_norm": 2.59877158659246e-20, "learning_rate": 0.00032369673748984095, "logits/chosen": -13.447654724121094, "logits/rejected": -13.336692810058594, "logps/chosen": -2951.4072265625, "logps/rejected": -2680.99853515625, "loss": 1.2052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.44970703125, "rewards/margins": 18.1766414642334, "rewards/rejected": -190.62632751464844, "step": 35100 }, { "epoch": 2.03, "grad_norm": 59.62046432495117, "learning_rate": 0.00032350323154920857, "logits/chosen": -13.566853523254395, "logits/rejected": -13.336735725402832, "logps/chosen": -2847.160400390625, "logps/rejected": -2786.9501953125, "loss": 4.3207, "rewards/accuracies": 0.5, "rewards/chosen": -178.32972717285156, "rewards/margins": 2.326781988143921, "rewards/rejected": -180.65652465820312, "step": 35110 }, { "epoch": 2.03, "grad_norm": 48.25675582885742, "learning_rate": 0.0003233097256085762, "logits/chosen": -15.761003494262695, "logits/rejected": -15.352335929870605, "logps/chosen": -2897.55615234375, "logps/rejected": -2784.146728515625, "loss": 1.6795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.21231079101562, "rewards/margins": 18.617401123046875, "rewards/rejected": -190.82972717285156, "step": 35120 }, { "epoch": 2.03, "grad_norm": 0.028903057798743248, "learning_rate": 0.0003231162196679438, "logits/chosen": -17.52429962158203, "logits/rejected": -18.832727432250977, "logps/chosen": -3003.552001953125, "logps/rejected": -2739.16845703125, "loss": 3.8971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.108642578125, "rewards/margins": 4.202286720275879, "rewards/rejected": -165.31092834472656, "step": 35130 }, { "epoch": 2.03, "grad_norm": 0.007463293150067329, "learning_rate": 0.0003229227137273114, "logits/chosen": -15.101127624511719, "logits/rejected": -15.021838188171387, "logps/chosen": -2954.149658203125, "logps/rejected": -2494.49560546875, "loss": 6.5408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.64439392089844, "rewards/margins": 4.458120822906494, "rewards/rejected": -165.10252380371094, "step": 35140 }, { "epoch": 2.03, "grad_norm": 0.014459395781159401, "learning_rate": 0.0003227292077866791, "logits/chosen": -18.885910034179688, "logits/rejected": -18.232574462890625, "logps/chosen": -2969.188232421875, "logps/rejected": -2742.90478515625, "loss": 4.2093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -185.62582397460938, "rewards/margins": 13.489614486694336, "rewards/rejected": -199.11544799804688, "step": 35150 }, { "epoch": 2.04, "grad_norm": 0.1659591645002365, "learning_rate": 0.0003225357018460467, "logits/chosen": -18.255678176879883, "logits/rejected": -18.25326919555664, "logps/chosen": -2909.36376953125, "logps/rejected": -2387.24267578125, "loss": 33.4366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.4244384765625, "rewards/margins": -23.067922592163086, "rewards/rejected": -201.35650634765625, "step": 35160 }, { "epoch": 2.04, "grad_norm": 1.091574031164555e-08, "learning_rate": 0.0003223421959054143, "logits/chosen": -16.820556640625, "logits/rejected": -16.508098602294922, "logps/chosen": -2577.728515625, "logps/rejected": -2432.478759765625, "loss": 2.3521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.68972778320312, "rewards/margins": 4.33745002746582, "rewards/rejected": -203.0271759033203, "step": 35170 }, { "epoch": 2.04, "grad_norm": 138.31155395507812, "learning_rate": 0.00032214868996478194, "logits/chosen": -18.678760528564453, "logits/rejected": -18.656606674194336, "logps/chosen": -3244.4892578125, "logps/rejected": -3260.641845703125, "loss": 2.7016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -183.69003295898438, "rewards/margins": 18.4797306060791, "rewards/rejected": -202.1697540283203, "step": 35180 }, { "epoch": 2.04, "grad_norm": 0.021743837743997574, "learning_rate": 0.0003219551840241495, "logits/chosen": -18.23763084411621, "logits/rejected": -18.069072723388672, "logps/chosen": -2500.372802734375, "logps/rejected": -2517.939453125, "loss": 4.5176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -214.92086791992188, "rewards/margins": 4.017064094543457, "rewards/rejected": -218.93795776367188, "step": 35190 }, { "epoch": 2.04, "grad_norm": 2.5312923979115e-10, "learning_rate": 0.0003217616780835172, "logits/chosen": -18.454877853393555, "logits/rejected": -19.352832794189453, "logps/chosen": -2715.923095703125, "logps/rejected": -2310.755859375, "loss": 19.5995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.89202880859375, "rewards/margins": -6.300580024719238, "rewards/rejected": -171.59146118164062, "step": 35200 }, { "epoch": 2.04, "grad_norm": 73.41709899902344, "learning_rate": 0.0003215681721428848, "logits/chosen": -16.92171287536621, "logits/rejected": -17.00917625427246, "logps/chosen": -2593.049072265625, "logps/rejected": -2625.42626953125, "loss": 2.5224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -222.52890014648438, "rewards/margins": 10.027043342590332, "rewards/rejected": -232.55593872070312, "step": 35210 }, { "epoch": 2.04, "grad_norm": 82.0718002319336, "learning_rate": 0.0003213746662022524, "logits/chosen": -15.380216598510742, "logits/rejected": -15.476841926574707, "logps/chosen": -2772.7431640625, "logps/rejected": -2782.93408203125, "loss": 0.8005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -157.562744140625, "rewards/margins": 13.66498851776123, "rewards/rejected": -171.2277374267578, "step": 35220 }, { "epoch": 2.04, "grad_norm": 19.14719009399414, "learning_rate": 0.00032118116026162003, "logits/chosen": -20.280315399169922, "logits/rejected": -20.202545166015625, "logps/chosen": -2785.9140625, "logps/rejected": -2639.00341796875, "loss": 0.8114, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -206.5864715576172, "rewards/margins": 14.8388671875, "rewards/rejected": -221.4253387451172, "step": 35230 }, { "epoch": 2.04, "grad_norm": 3.251821034666591e-11, "learning_rate": 0.00032098765432098765, "logits/chosen": -15.418660163879395, "logits/rejected": -15.404054641723633, "logps/chosen": -2373.962158203125, "logps/rejected": -2248.360107421875, "loss": 7.6741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -137.27713012695312, "rewards/margins": 4.9981279373168945, "rewards/rejected": -142.27525329589844, "step": 35240 }, { "epoch": 2.04, "grad_norm": 0.04582800343632698, "learning_rate": 0.0003207941483803553, "logits/chosen": -12.723161697387695, "logits/rejected": -12.595772743225098, "logps/chosen": -3186.30517578125, "logps/rejected": -3004.66552734375, "loss": 5.2463, "rewards/accuracies": 0.5, "rewards/chosen": -101.01536560058594, "rewards/margins": 3.868201732635498, "rewards/rejected": -104.8835678100586, "step": 35250 }, { "epoch": 2.04, "grad_norm": 3.649876452982426e-05, "learning_rate": 0.00032060064243972293, "logits/chosen": -15.648798942565918, "logits/rejected": -16.09324073791504, "logps/chosen": -2907.367919921875, "logps/rejected": -2845.330810546875, "loss": 4.9183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.76043701171875, "rewards/margins": 7.7514495849609375, "rewards/rejected": -154.5118865966797, "step": 35260 }, { "epoch": 2.04, "grad_norm": 0.0013325366890057921, "learning_rate": 0.00032040713649909055, "logits/chosen": -15.695640563964844, "logits/rejected": -16.460033416748047, "logps/chosen": -3063.62158203125, "logps/rejected": -2930.7822265625, "loss": 5.1173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.07508850097656, "rewards/margins": 8.493866920471191, "rewards/rejected": -140.56893920898438, "step": 35270 }, { "epoch": 2.04, "grad_norm": 41.821876525878906, "learning_rate": 0.00032021363055845817, "logits/chosen": -18.096729278564453, "logits/rejected": -18.17218017578125, "logps/chosen": -2401.70068359375, "logps/rejected": -2622.788818359375, "loss": 4.8826, "rewards/accuracies": 0.5, "rewards/chosen": -195.17941284179688, "rewards/margins": 0.034120749682188034, "rewards/rejected": -195.2135467529297, "step": 35280 }, { "epoch": 2.04, "grad_norm": 1.2031963706249371e-05, "learning_rate": 0.00032002012461782573, "logits/chosen": -15.147611618041992, "logits/rejected": -14.995241165161133, "logps/chosen": -2936.8310546875, "logps/rejected": -2826.869873046875, "loss": 7.3165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.38108825683594, "rewards/margins": 0.21696051955223083, "rewards/rejected": -181.59805297851562, "step": 35290 }, { "epoch": 2.04, "grad_norm": 59.60438919067383, "learning_rate": 0.00031982661867719335, "logits/chosen": -15.718783378601074, "logits/rejected": -15.716242790222168, "logps/chosen": -2602.840576171875, "logps/rejected": -2816.58837890625, "loss": 9.168, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -175.8877410888672, "rewards/margins": 2.84051775932312, "rewards/rejected": -178.728271484375, "step": 35300 }, { "epoch": 2.04, "grad_norm": 1.4424301753024338e-07, "learning_rate": 0.000319633112736561, "logits/chosen": -17.88497543334961, "logits/rejected": -18.0915584564209, "logps/chosen": -2432.18603515625, "logps/rejected": -2413.96484375, "loss": 2.6478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -206.1970672607422, "rewards/margins": 11.088888168334961, "rewards/rejected": -217.2859649658203, "step": 35310 }, { "epoch": 2.04, "grad_norm": 9.249967115465552e-05, "learning_rate": 0.00031943960679592864, "logits/chosen": -15.962030410766602, "logits/rejected": -15.84593391418457, "logps/chosen": -3133.52001953125, "logps/rejected": -2933.55615234375, "loss": 1.2641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -192.30630493164062, "rewards/margins": 8.248324394226074, "rewards/rejected": -200.5546417236328, "step": 35320 }, { "epoch": 2.05, "grad_norm": 76.75830841064453, "learning_rate": 0.00031924610085529625, "logits/chosen": -15.467805862426758, "logits/rejected": -15.610006332397461, "logps/chosen": -3139.357666015625, "logps/rejected": -2688.4912109375, "loss": 1.6748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.0261688232422, "rewards/margins": 12.495322227478027, "rewards/rejected": -179.52146911621094, "step": 35330 }, { "epoch": 2.05, "grad_norm": 1.1176713087479584e-05, "learning_rate": 0.00031905259491466387, "logits/chosen": -14.474260330200195, "logits/rejected": -14.930435180664062, "logps/chosen": -2792.4912109375, "logps/rejected": -2660.593505859375, "loss": 1.8824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.6650848388672, "rewards/margins": 15.66222095489502, "rewards/rejected": -180.32730102539062, "step": 35340 }, { "epoch": 2.05, "grad_norm": 0.00015018241538200527, "learning_rate": 0.0003188590889740315, "logits/chosen": -17.480785369873047, "logits/rejected": -17.932676315307617, "logps/chosen": -2771.540771484375, "logps/rejected": -2605.9853515625, "loss": 1.5811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.89846801757812, "rewards/margins": 7.638175964355469, "rewards/rejected": -197.53665161132812, "step": 35350 }, { "epoch": 2.05, "grad_norm": 127.89420318603516, "learning_rate": 0.00031866558303339916, "logits/chosen": -16.16983413696289, "logits/rejected": -15.896020889282227, "logps/chosen": -3173.642578125, "logps/rejected": -2847.487548828125, "loss": 14.1218, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -209.63723754882812, "rewards/margins": -6.082266807556152, "rewards/rejected": -203.55496215820312, "step": 35360 }, { "epoch": 2.05, "grad_norm": 0.02293250523507595, "learning_rate": 0.0003184720770927668, "logits/chosen": -13.625205993652344, "logits/rejected": -13.478337287902832, "logps/chosen": -3023.471435546875, "logps/rejected": -3012.81591796875, "loss": 0.4159, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -118.3077163696289, "rewards/margins": 9.36127758026123, "rewards/rejected": -127.66898345947266, "step": 35370 }, { "epoch": 2.05, "grad_norm": 59.7153205871582, "learning_rate": 0.0003182785711521344, "logits/chosen": -17.586740493774414, "logits/rejected": -18.133718490600586, "logps/chosen": -2739.443115234375, "logps/rejected": -2878.6923828125, "loss": 12.1608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -210.6810760498047, "rewards/margins": 12.2711820602417, "rewards/rejected": -222.95230102539062, "step": 35380 }, { "epoch": 2.05, "grad_norm": 83.91505432128906, "learning_rate": 0.000318085065211502, "logits/chosen": -16.441974639892578, "logits/rejected": -16.056943893432617, "logps/chosen": -2835.071044921875, "logps/rejected": -2794.54931640625, "loss": 10.4681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.2504425048828, "rewards/margins": -3.3182365894317627, "rewards/rejected": -200.9322052001953, "step": 35390 }, { "epoch": 2.05, "grad_norm": 8.241815407927788e-07, "learning_rate": 0.0003178915592708696, "logits/chosen": -14.887168884277344, "logits/rejected": -14.909268379211426, "logps/chosen": -2688.10009765625, "logps/rejected": -2430.879638671875, "loss": 3.1747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.78970336914062, "rewards/margins": 2.45204496383667, "rewards/rejected": -146.24176025390625, "step": 35400 }, { "epoch": 2.05, "grad_norm": 61.121131896972656, "learning_rate": 0.00031769805333023725, "logits/chosen": -19.649377822875977, "logits/rejected": -19.833133697509766, "logps/chosen": -2602.512939453125, "logps/rejected": -2780.74658203125, "loss": 1.234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -231.0236358642578, "rewards/margins": 16.747892379760742, "rewards/rejected": -247.77151489257812, "step": 35410 }, { "epoch": 2.05, "grad_norm": 5.736819019830364e-08, "learning_rate": 0.00031750454738960486, "logits/chosen": -18.381114959716797, "logits/rejected": -18.58094024658203, "logps/chosen": -2961.719970703125, "logps/rejected": -2869.58056640625, "loss": 1.2305, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -236.80642700195312, "rewards/margins": 9.789214134216309, "rewards/rejected": -246.5956573486328, "step": 35420 }, { "epoch": 2.05, "grad_norm": 0.0, "learning_rate": 0.0003173110414489725, "logits/chosen": -15.05730152130127, "logits/rejected": -14.948643684387207, "logps/chosen": -2867.33837890625, "logps/rejected": -2890.518310546875, "loss": 2.2952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.74716186523438, "rewards/margins": 9.738752365112305, "rewards/rejected": -169.4859161376953, "step": 35430 }, { "epoch": 2.05, "grad_norm": 0.0008861004025675356, "learning_rate": 0.0003171175355083401, "logits/chosen": -19.958681106567383, "logits/rejected": -20.471004486083984, "logps/chosen": -2357.26806640625, "logps/rejected": -2370.171875, "loss": 1.756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -195.4910125732422, "rewards/margins": 6.1016340255737305, "rewards/rejected": -201.5926513671875, "step": 35440 }, { "epoch": 2.05, "grad_norm": 0.0006594008300453424, "learning_rate": 0.0003169240295677077, "logits/chosen": -15.9154052734375, "logits/rejected": -16.312068939208984, "logps/chosen": -2909.480712890625, "logps/rejected": -2615.1123046875, "loss": 0.2025, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -113.50589752197266, "rewards/margins": 13.852668762207031, "rewards/rejected": -127.35856628417969, "step": 35450 }, { "epoch": 2.05, "grad_norm": 5.904595562489234e-11, "learning_rate": 0.0003167305236270754, "logits/chosen": -17.84025764465332, "logits/rejected": -17.737199783325195, "logps/chosen": -2603.83935546875, "logps/rejected": -2603.4970703125, "loss": 2.9768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.94644165039062, "rewards/margins": 4.632046699523926, "rewards/rejected": -229.5784912109375, "step": 35460 }, { "epoch": 2.05, "grad_norm": 1.2125343396007793e-08, "learning_rate": 0.000316537017686443, "logits/chosen": -17.60032081604004, "logits/rejected": -18.26477813720703, "logps/chosen": -2796.88720703125, "logps/rejected": -2741.8388671875, "loss": 2.366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.0857696533203, "rewards/margins": 12.00651741027832, "rewards/rejected": -197.09228515625, "step": 35470 }, { "epoch": 2.05, "grad_norm": 65.95024108886719, "learning_rate": 0.0003163435117458106, "logits/chosen": -17.986164093017578, "logits/rejected": -18.332319259643555, "logps/chosen": -2855.84375, "logps/rejected": -2521.1708984375, "loss": 35.1283, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -214.7384490966797, "rewards/margins": -29.48587989807129, "rewards/rejected": -185.25257873535156, "step": 35480 }, { "epoch": 2.05, "grad_norm": 2.031902113230899e-08, "learning_rate": 0.00031615000580517824, "logits/chosen": -14.011190414428711, "logits/rejected": -14.169021606445312, "logps/chosen": -3090.89697265625, "logps/rejected": -3001.237548828125, "loss": 2.2756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -133.02101135253906, "rewards/margins": 5.973227500915527, "rewards/rejected": -138.99424743652344, "step": 35490 }, { "epoch": 2.05, "grad_norm": 29.31175994873047, "learning_rate": 0.00031595649986454586, "logits/chosen": -18.806018829345703, "logits/rejected": -18.579757690429688, "logps/chosen": -2726.74462890625, "logps/rejected": -2475.68994140625, "loss": 3.0875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.54127502441406, "rewards/margins": 6.529315948486328, "rewards/rejected": -209.07058715820312, "step": 35500 }, { "epoch": 2.06, "grad_norm": 164.0309600830078, "learning_rate": 0.0003157629939239134, "logits/chosen": -16.832195281982422, "logits/rejected": -17.808073043823242, "logps/chosen": -2978.501953125, "logps/rejected": -2983.321044921875, "loss": 8.7008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.33291625976562, "rewards/margins": -2.7280495166778564, "rewards/rejected": -160.60487365722656, "step": 35510 }, { "epoch": 2.06, "grad_norm": 158.6446990966797, "learning_rate": 0.0003155694879832811, "logits/chosen": -16.103872299194336, "logits/rejected": -15.66563892364502, "logps/chosen": -2949.96435546875, "logps/rejected": -2971.576904296875, "loss": 15.2301, "rewards/accuracies": 0.5, "rewards/chosen": -163.02401733398438, "rewards/margins": -10.539149284362793, "rewards/rejected": -152.48487854003906, "step": 35520 }, { "epoch": 2.06, "grad_norm": 1.7186206579208374, "learning_rate": 0.0003153759820426487, "logits/chosen": -17.664209365844727, "logits/rejected": -18.32108497619629, "logps/chosen": -2691.689453125, "logps/rejected": -2815.999267578125, "loss": 5.6504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -178.96902465820312, "rewards/margins": 7.769822597503662, "rewards/rejected": -186.73886108398438, "step": 35530 }, { "epoch": 2.06, "grad_norm": 127.74658203125, "learning_rate": 0.0003151824761020163, "logits/chosen": -17.031169891357422, "logits/rejected": -17.776105880737305, "logps/chosen": -2960.32958984375, "logps/rejected": -2810.71337890625, "loss": 3.38, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.25015258789062, "rewards/margins": 3.955714464187622, "rewards/rejected": -188.2058563232422, "step": 35540 }, { "epoch": 2.06, "grad_norm": 0.00571176502853632, "learning_rate": 0.00031498897016138394, "logits/chosen": -16.089813232421875, "logits/rejected": -15.721267700195312, "logps/chosen": -2958.20849609375, "logps/rejected": -3183.546142578125, "loss": 0.6806, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.93190002441406, "rewards/margins": 12.7406644821167, "rewards/rejected": -180.67254638671875, "step": 35550 }, { "epoch": 2.06, "grad_norm": 0.950484037399292, "learning_rate": 0.00031479546422075156, "logits/chosen": -20.95016860961914, "logits/rejected": -21.056325912475586, "logps/chosen": -2775.48974609375, "logps/rejected": -2654.28564453125, "loss": 11.763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -242.12338256835938, "rewards/margins": -5.5539655685424805, "rewards/rejected": -236.5693817138672, "step": 35560 }, { "epoch": 2.06, "grad_norm": 4.325828861784231e-10, "learning_rate": 0.00031460195828011923, "logits/chosen": -17.186365127563477, "logits/rejected": -17.77231216430664, "logps/chosen": -2905.507568359375, "logps/rejected": -2801.601806640625, "loss": 12.002, "rewards/accuracies": 0.5, "rewards/chosen": -200.4980926513672, "rewards/margins": 3.8816559314727783, "rewards/rejected": -204.37973022460938, "step": 35570 }, { "epoch": 2.06, "grad_norm": 0.0006516297580674291, "learning_rate": 0.00031440845233948685, "logits/chosen": -16.704360961914062, "logits/rejected": -17.002025604248047, "logps/chosen": -3070.028076171875, "logps/rejected": -2831.83935546875, "loss": 0.3416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -137.24874877929688, "rewards/margins": 9.891304016113281, "rewards/rejected": -147.14004516601562, "step": 35580 }, { "epoch": 2.06, "grad_norm": 36.748260498046875, "learning_rate": 0.00031421494639885446, "logits/chosen": -18.710880279541016, "logits/rejected": -18.648481369018555, "logps/chosen": -3024.696044921875, "logps/rejected": -2955.112548828125, "loss": 3.5777, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.2255859375, "rewards/margins": 1.4687095880508423, "rewards/rejected": -190.6942901611328, "step": 35590 }, { "epoch": 2.06, "grad_norm": 38.565635681152344, "learning_rate": 0.0003140214404582221, "logits/chosen": -17.026330947875977, "logits/rejected": -17.089786529541016, "logps/chosen": -2959.553955078125, "logps/rejected": -2845.86474609375, "loss": 5.0735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.6700439453125, "rewards/margins": 0.8778492212295532, "rewards/rejected": -145.5478973388672, "step": 35600 }, { "epoch": 2.06, "grad_norm": 52.12705993652344, "learning_rate": 0.0003138279345175897, "logits/chosen": -15.624771118164062, "logits/rejected": -15.120607376098633, "logps/chosen": -2576.686767578125, "logps/rejected": -2835.65185546875, "loss": 3.1656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.18112182617188, "rewards/margins": 9.666831970214844, "rewards/rejected": -175.84799194335938, "step": 35610 }, { "epoch": 2.06, "grad_norm": 0.0029132519848644733, "learning_rate": 0.0003136344285769573, "logits/chosen": -18.520854949951172, "logits/rejected": -18.20515251159668, "logps/chosen": -2719.829833984375, "logps/rejected": -2419.817626953125, "loss": 3.7046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.03439331054688, "rewards/margins": 2.474038600921631, "rewards/rejected": -206.5084228515625, "step": 35620 }, { "epoch": 2.06, "grad_norm": 0.33803635835647583, "learning_rate": 0.00031344092263632493, "logits/chosen": -15.93608283996582, "logits/rejected": -16.106969833374023, "logps/chosen": -3119.22119140625, "logps/rejected": -2858.34228515625, "loss": 5.4397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -128.74411010742188, "rewards/margins": 6.011178493499756, "rewards/rejected": -134.7552947998047, "step": 35630 }, { "epoch": 2.06, "grad_norm": 83.06365203857422, "learning_rate": 0.00031324741669569255, "logits/chosen": -15.729204177856445, "logits/rejected": -15.972328186035156, "logps/chosen": -2980.989501953125, "logps/rejected": -3056.37939453125, "loss": 0.919, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -184.5997772216797, "rewards/margins": 21.588104248046875, "rewards/rejected": -206.18789672851562, "step": 35640 }, { "epoch": 2.06, "grad_norm": 0.011010157875716686, "learning_rate": 0.00031305391075506017, "logits/chosen": -17.195575714111328, "logits/rejected": -17.287961959838867, "logps/chosen": -2638.50439453125, "logps/rejected": -2692.16845703125, "loss": 1.9206, "rewards/accuracies": 0.5, "rewards/chosen": -199.0601348876953, "rewards/margins": 4.24030065536499, "rewards/rejected": -203.30044555664062, "step": 35650 }, { "epoch": 2.06, "grad_norm": 8.059685363548397e-09, "learning_rate": 0.0003128604048144278, "logits/chosen": -15.982007026672363, "logits/rejected": -16.09665298461914, "logps/chosen": -2925.07470703125, "logps/rejected": -2731.08056640625, "loss": 0.1005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -141.96763610839844, "rewards/margins": 14.9100341796875, "rewards/rejected": -156.877685546875, "step": 35660 }, { "epoch": 2.06, "grad_norm": 2.9018237590789795, "learning_rate": 0.00031266689887379546, "logits/chosen": -14.21116828918457, "logits/rejected": -14.193652153015137, "logps/chosen": -3068.51806640625, "logps/rejected": -2953.57763671875, "loss": 7.3231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -76.7013168334961, "rewards/margins": 4.18344783782959, "rewards/rejected": -80.884765625, "step": 35670 }, { "epoch": 2.07, "grad_norm": 91.07778930664062, "learning_rate": 0.00031247339293316307, "logits/chosen": -15.008440971374512, "logits/rejected": -14.960153579711914, "logps/chosen": -2300.448486328125, "logps/rejected": -2319.16943359375, "loss": 2.3983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.0153045654297, "rewards/margins": 8.779176712036133, "rewards/rejected": -157.7944793701172, "step": 35680 }, { "epoch": 2.07, "grad_norm": 0.0, "learning_rate": 0.0003122798869925307, "logits/chosen": -14.967120170593262, "logits/rejected": -14.779057502746582, "logps/chosen": -2360.12158203125, "logps/rejected": -1896.243896484375, "loss": 31.7673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.9694366455078, "rewards/margins": 14.452051162719727, "rewards/rejected": -147.42147827148438, "step": 35690 }, { "epoch": 2.07, "grad_norm": 179.9315185546875, "learning_rate": 0.0003120863810518983, "logits/chosen": -13.444305419921875, "logits/rejected": -12.727048873901367, "logps/chosen": -2728.114501953125, "logps/rejected": -2348.35205078125, "loss": 2.842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -87.52013397216797, "rewards/margins": 13.01356029510498, "rewards/rejected": -100.53369903564453, "step": 35700 }, { "epoch": 2.07, "grad_norm": 0.2673119902610779, "learning_rate": 0.0003118928751112659, "logits/chosen": -19.148073196411133, "logits/rejected": -19.128833770751953, "logps/chosen": -2835.114013671875, "logps/rejected": -2812.207763671875, "loss": 3.7102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -233.96951293945312, "rewards/margins": 5.006319522857666, "rewards/rejected": -238.97579956054688, "step": 35710 }, { "epoch": 2.07, "grad_norm": 161.7796630859375, "learning_rate": 0.00031169936917063354, "logits/chosen": -17.745458602905273, "logits/rejected": -17.600801467895508, "logps/chosen": -2566.418701171875, "logps/rejected": -2753.766845703125, "loss": 0.6764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.9089813232422, "rewards/margins": 7.687216281890869, "rewards/rejected": -157.59622192382812, "step": 35720 }, { "epoch": 2.07, "grad_norm": 0.19024337828159332, "learning_rate": 0.00031150586323000116, "logits/chosen": -20.61825942993164, "logits/rejected": -24.289823532104492, "logps/chosen": -2579.393310546875, "logps/rejected": -2630.945068359375, "loss": 0.2512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -227.05923461914062, "rewards/margins": 10.335683822631836, "rewards/rejected": -237.39492797851562, "step": 35730 }, { "epoch": 2.07, "grad_norm": 2.263111031908238e-09, "learning_rate": 0.0003113123572893688, "logits/chosen": -20.19199562072754, "logits/rejected": -20.527469635009766, "logps/chosen": -2837.104736328125, "logps/rejected": -2805.867919921875, "loss": 3.765, "rewards/accuracies": 0.5, "rewards/chosen": -246.5016632080078, "rewards/margins": 8.917752265930176, "rewards/rejected": -255.41940307617188, "step": 35740 }, { "epoch": 2.07, "grad_norm": 0.01915118284523487, "learning_rate": 0.0003111188513487364, "logits/chosen": -15.376848220825195, "logits/rejected": -15.916725158691406, "logps/chosen": -2611.849609375, "logps/rejected": -2995.217529296875, "loss": 3.9925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -132.7937469482422, "rewards/margins": 2.3059234619140625, "rewards/rejected": -135.0996551513672, "step": 35750 }, { "epoch": 2.07, "grad_norm": 0.044313572347164154, "learning_rate": 0.000310925345408104, "logits/chosen": -17.261730194091797, "logits/rejected": -17.42938995361328, "logps/chosen": -3143.3681640625, "logps/rejected": -2659.4384765625, "loss": 0.7509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -176.80747985839844, "rewards/margins": 14.048558235168457, "rewards/rejected": -190.85604858398438, "step": 35760 }, { "epoch": 2.07, "grad_norm": 0.00036528470809571445, "learning_rate": 0.00031073183946747163, "logits/chosen": -12.672435760498047, "logits/rejected": -12.896255493164062, "logps/chosen": -3457.096923828125, "logps/rejected": -3050.831787109375, "loss": 0.4786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -89.1248550415039, "rewards/margins": 10.2858304977417, "rewards/rejected": -99.41069030761719, "step": 35770 }, { "epoch": 2.07, "grad_norm": 128.43417358398438, "learning_rate": 0.0003105383335268393, "logits/chosen": -14.685369491577148, "logits/rejected": -14.68506908416748, "logps/chosen": -2728.65771484375, "logps/rejected": -2621.30322265625, "loss": 10.0286, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.4048614501953, "rewards/margins": -0.1625339537858963, "rewards/rejected": -172.24232482910156, "step": 35780 }, { "epoch": 2.07, "grad_norm": 18.616790771484375, "learning_rate": 0.0003103448275862069, "logits/chosen": -16.407255172729492, "logits/rejected": -16.35413932800293, "logps/chosen": -2640.68115234375, "logps/rejected": -2683.588623046875, "loss": 0.1367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -158.9929962158203, "rewards/margins": 13.698214530944824, "rewards/rejected": -172.6912078857422, "step": 35790 }, { "epoch": 2.07, "grad_norm": 26.237953186035156, "learning_rate": 0.00031015132164557453, "logits/chosen": -15.507575988769531, "logits/rejected": -15.573779106140137, "logps/chosen": -2877.32080078125, "logps/rejected": -2821.705078125, "loss": 1.9704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -151.3628692626953, "rewards/margins": 12.220206260681152, "rewards/rejected": -163.5830841064453, "step": 35800 }, { "epoch": 2.07, "grad_norm": 1.5713836898356703e-09, "learning_rate": 0.00030995781570494215, "logits/chosen": -16.97797203063965, "logits/rejected": -16.6705265045166, "logps/chosen": -2521.8046875, "logps/rejected": -2344.93603515625, "loss": 5.9869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.1436004638672, "rewards/margins": 2.958235263824463, "rewards/rejected": -175.10183715820312, "step": 35810 }, { "epoch": 2.07, "grad_norm": 2.0888153429154954e-08, "learning_rate": 0.00030976430976430977, "logits/chosen": -15.79272747039795, "logits/rejected": -16.534992218017578, "logps/chosen": -2804.142333984375, "logps/rejected": -3216.3466796875, "loss": 7.1128, "rewards/accuracies": 0.5, "rewards/chosen": -135.25291442871094, "rewards/margins": 3.9438350200653076, "rewards/rejected": -139.19674682617188, "step": 35820 }, { "epoch": 2.07, "grad_norm": 68.93962860107422, "learning_rate": 0.00030957080382367744, "logits/chosen": -15.142560005187988, "logits/rejected": -15.193572998046875, "logps/chosen": -2976.701171875, "logps/rejected": -2995.443603515625, "loss": 1.5217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.38502502441406, "rewards/margins": 10.937620162963867, "rewards/rejected": -166.32264709472656, "step": 35830 }, { "epoch": 2.07, "grad_norm": 64.2920913696289, "learning_rate": 0.000309377297883045, "logits/chosen": -15.096613883972168, "logits/rejected": -15.010258674621582, "logps/chosen": -2955.997802734375, "logps/rejected": -2902.30126953125, "loss": 1.709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.33787536621094, "rewards/margins": 5.616882801055908, "rewards/rejected": -195.95477294921875, "step": 35840 }, { "epoch": 2.08, "grad_norm": 201.78089904785156, "learning_rate": 0.0003091837919424126, "logits/chosen": -13.06645393371582, "logits/rejected": -12.965288162231445, "logps/chosen": -2813.7783203125, "logps/rejected": -2893.257080078125, "loss": 2.9309, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.24623107910156, "rewards/margins": 9.49211311340332, "rewards/rejected": -184.7383575439453, "step": 35850 }, { "epoch": 2.08, "grad_norm": 8.854328370944131e-06, "learning_rate": 0.00030899028600178024, "logits/chosen": -14.117835998535156, "logits/rejected": -13.608294486999512, "logps/chosen": -2891.708740234375, "logps/rejected": -2800.101318359375, "loss": 6.0602, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -177.9144287109375, "rewards/margins": -0.5172941088676453, "rewards/rejected": -177.39712524414062, "step": 35860 }, { "epoch": 2.08, "grad_norm": 0.017257409170269966, "learning_rate": 0.00030879678006114785, "logits/chosen": -14.87312126159668, "logits/rejected": -15.059282302856445, "logps/chosen": -2850.76318359375, "logps/rejected": -3263.27490234375, "loss": 0.3928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -131.2362060546875, "rewards/margins": 7.650173187255859, "rewards/rejected": -138.88638305664062, "step": 35870 }, { "epoch": 2.08, "grad_norm": 10.219926834106445, "learning_rate": 0.0003086032741205155, "logits/chosen": -12.379915237426758, "logits/rejected": -11.89565372467041, "logps/chosen": -2593.10693359375, "logps/rejected": -2413.58154296875, "loss": 4.2677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -183.57662963867188, "rewards/margins": 6.3739166259765625, "rewards/rejected": -189.95054626464844, "step": 35880 }, { "epoch": 2.08, "grad_norm": 5.0936026230186116e-11, "learning_rate": 0.00030840976817988314, "logits/chosen": -14.690271377563477, "logits/rejected": -15.264162063598633, "logps/chosen": -2890.576171875, "logps/rejected": -2519.094970703125, "loss": 0.4212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.0390167236328, "rewards/margins": 7.214688301086426, "rewards/rejected": -161.25369262695312, "step": 35890 }, { "epoch": 2.08, "grad_norm": 4.481578798731789e-06, "learning_rate": 0.00030821626223925076, "logits/chosen": -13.939287185668945, "logits/rejected": -13.668001174926758, "logps/chosen": -3118.11279296875, "logps/rejected": -3070.3095703125, "loss": 9.9643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.36505126953125, "rewards/margins": 0.6407791376113892, "rewards/rejected": -205.0058135986328, "step": 35900 }, { "epoch": 2.08, "grad_norm": 341.2643737792969, "learning_rate": 0.0003080227562986184, "logits/chosen": -12.538203239440918, "logits/rejected": -12.073423385620117, "logps/chosen": -2972.994384765625, "logps/rejected": -3405.979736328125, "loss": 3.1846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.61234283447266, "rewards/margins": 5.275012493133545, "rewards/rejected": -126.88736724853516, "step": 35910 }, { "epoch": 2.08, "grad_norm": 43.25627899169922, "learning_rate": 0.000307829250357986, "logits/chosen": -12.589166641235352, "logits/rejected": -12.15329360961914, "logps/chosen": -3379.8359375, "logps/rejected": -3678.559814453125, "loss": 2.7714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.3363494873047, "rewards/margins": 9.647466659545898, "rewards/rejected": -141.98382568359375, "step": 35920 }, { "epoch": 2.08, "grad_norm": 80.53321838378906, "learning_rate": 0.00030763574441735367, "logits/chosen": -12.873819351196289, "logits/rejected": -12.730822563171387, "logps/chosen": -2924.97607421875, "logps/rejected": -2819.321533203125, "loss": 2.1182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -175.7218780517578, "rewards/margins": 7.6244378089904785, "rewards/rejected": -183.3463134765625, "step": 35930 }, { "epoch": 2.08, "grad_norm": 7.333448084900773e-12, "learning_rate": 0.0003074422384767213, "logits/chosen": -16.156856536865234, "logits/rejected": -15.848185539245605, "logps/chosen": -2898.25634765625, "logps/rejected": -2805.68603515625, "loss": 6.0129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -145.28805541992188, "rewards/margins": 5.625450134277344, "rewards/rejected": -150.91351318359375, "step": 35940 }, { "epoch": 2.08, "grad_norm": 59.661956787109375, "learning_rate": 0.00030724873253608885, "logits/chosen": -12.921018600463867, "logits/rejected": -12.733660697937012, "logps/chosen": -3552.965576171875, "logps/rejected": -2898.1376953125, "loss": 5.3794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -116.61079406738281, "rewards/margins": 6.006960868835449, "rewards/rejected": -122.61773681640625, "step": 35950 }, { "epoch": 2.08, "grad_norm": 64.09147644042969, "learning_rate": 0.00030705522659545646, "logits/chosen": -13.542546272277832, "logits/rejected": -14.359281539916992, "logps/chosen": -3115.698486328125, "logps/rejected": -3060.48291015625, "loss": 4.864, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -134.8753662109375, "rewards/margins": 1.0534894466400146, "rewards/rejected": -135.9288330078125, "step": 35960 }, { "epoch": 2.08, "grad_norm": 2.9204585552215576, "learning_rate": 0.0003068617206548241, "logits/chosen": -17.73781967163086, "logits/rejected": -18.12445068359375, "logps/chosen": -2694.955322265625, "logps/rejected": -2511.25244140625, "loss": 6.3287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -196.8795623779297, "rewards/margins": 3.6170010566711426, "rewards/rejected": -200.49655151367188, "step": 35970 }, { "epoch": 2.08, "grad_norm": 0.17634516954421997, "learning_rate": 0.0003066682147141917, "logits/chosen": -19.397966384887695, "logits/rejected": -19.67009162902832, "logps/chosen": -2798.12060546875, "logps/rejected": -2938.86083984375, "loss": 1.3587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.5086669921875, "rewards/margins": 16.59314727783203, "rewards/rejected": -227.101806640625, "step": 35980 }, { "epoch": 2.08, "grad_norm": 153.04220581054688, "learning_rate": 0.00030647470877355937, "logits/chosen": -17.749393463134766, "logits/rejected": -17.639307022094727, "logps/chosen": -2603.85791015625, "logps/rejected": -2731.39208984375, "loss": 1.8737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.24099731445312, "rewards/margins": 5.325192928314209, "rewards/rejected": -190.56617736816406, "step": 35990 }, { "epoch": 2.08, "grad_norm": 26.51788902282715, "learning_rate": 0.000306281202832927, "logits/chosen": -15.788517951965332, "logits/rejected": -16.198070526123047, "logps/chosen": -2960.505615234375, "logps/rejected": -2831.815185546875, "loss": 1.0037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.31747436523438, "rewards/margins": 10.60646915435791, "rewards/rejected": -168.92391967773438, "step": 36000 }, { "epoch": 2.08, "grad_norm": 1.4334768820845056e-06, "learning_rate": 0.0003060876968922946, "logits/chosen": -20.276111602783203, "logits/rejected": -20.384662628173828, "logps/chosen": -2725.47705078125, "logps/rejected": -2790.82373046875, "loss": 1.802, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -233.1757354736328, "rewards/margins": 13.6178617477417, "rewards/rejected": -246.7935791015625, "step": 36010 }, { "epoch": 2.08, "grad_norm": 73.56999969482422, "learning_rate": 0.0003058941909516622, "logits/chosen": -18.102643966674805, "logits/rejected": -21.274646759033203, "logps/chosen": -3302.32763671875, "logps/rejected": -3334.796142578125, "loss": 2.8766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.92074584960938, "rewards/margins": 5.526825904846191, "rewards/rejected": -195.4475860595703, "step": 36020 }, { "epoch": 2.09, "grad_norm": 6.65906862884789e-17, "learning_rate": 0.00030570068501102984, "logits/chosen": -15.74096393585205, "logits/rejected": -16.17441749572754, "logps/chosen": -3189.21337890625, "logps/rejected": -2819.180419921875, "loss": 12.9979, "rewards/accuracies": 0.5, "rewards/chosen": -172.8187255859375, "rewards/margins": -3.774932861328125, "rewards/rejected": -169.04379272460938, "step": 36030 }, { "epoch": 2.09, "grad_norm": 4.225657150580542e-12, "learning_rate": 0.0003055071790703975, "logits/chosen": -18.9118709564209, "logits/rejected": -18.607492446899414, "logps/chosen": -2961.136474609375, "logps/rejected": -2979.218017578125, "loss": 2.949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.8071746826172, "rewards/margins": 5.923174858093262, "rewards/rejected": -199.73036193847656, "step": 36040 }, { "epoch": 2.09, "grad_norm": 1.2150853240244786e-12, "learning_rate": 0.0003053136731297651, "logits/chosen": -19.763057708740234, "logits/rejected": -21.603137969970703, "logps/chosen": -2887.093994140625, "logps/rejected": -2614.619384765625, "loss": 10.0153, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -153.1360321044922, "rewards/margins": 2.379481554031372, "rewards/rejected": -155.51551818847656, "step": 36050 }, { "epoch": 2.09, "grad_norm": 58.79326248168945, "learning_rate": 0.0003051201671891327, "logits/chosen": -16.604228973388672, "logits/rejected": -16.603384017944336, "logps/chosen": -2759.234375, "logps/rejected": -2760.34033203125, "loss": 3.2989, "rewards/accuracies": 0.5, "rewards/chosen": -147.4688720703125, "rewards/margins": 1.6692310571670532, "rewards/rejected": -149.1381072998047, "step": 36060 }, { "epoch": 2.09, "grad_norm": 142.5612335205078, "learning_rate": 0.0003049266612485003, "logits/chosen": -14.446612358093262, "logits/rejected": -15.292158126831055, "logps/chosen": -3134.120361328125, "logps/rejected": -3125.591552734375, "loss": 6.6803, "rewards/accuracies": 0.5, "rewards/chosen": -114.16401672363281, "rewards/margins": -2.1486384868621826, "rewards/rejected": -112.015380859375, "step": 36070 }, { "epoch": 2.09, "grad_norm": 1.639631186378665e-08, "learning_rate": 0.0003047331553078679, "logits/chosen": -17.459001541137695, "logits/rejected": -19.100727081298828, "logps/chosen": -3170.541259765625, "logps/rejected": -3113.0673828125, "loss": 4.0619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.61082458496094, "rewards/margins": 8.798894882202148, "rewards/rejected": -209.4097137451172, "step": 36080 }, { "epoch": 2.09, "grad_norm": 102.4530258178711, "learning_rate": 0.0003045396493672356, "logits/chosen": -18.78715705871582, "logits/rejected": -17.7508544921875, "logps/chosen": -2696.785400390625, "logps/rejected": -2911.11572265625, "loss": 5.1297, "rewards/accuracies": 0.5, "rewards/chosen": -211.9522247314453, "rewards/margins": 4.901797294616699, "rewards/rejected": -216.85400390625, "step": 36090 }, { "epoch": 2.09, "grad_norm": 0.015025869011878967, "learning_rate": 0.0003043461434266032, "logits/chosen": -17.000707626342773, "logits/rejected": -16.50396728515625, "logps/chosen": -2641.31103515625, "logps/rejected": -2785.64111328125, "loss": 6.8294, "rewards/accuracies": 0.5, "rewards/chosen": -210.8506317138672, "rewards/margins": 1.8229373693466187, "rewards/rejected": -212.67361450195312, "step": 36100 }, { "epoch": 2.09, "grad_norm": 94.42396545410156, "learning_rate": 0.00030415263748597083, "logits/chosen": -15.752737045288086, "logits/rejected": -15.74754524230957, "logps/chosen": -2939.535888671875, "logps/rejected": -2948.02978515625, "loss": 3.4048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -221.6046600341797, "rewards/margins": 2.097107410430908, "rewards/rejected": -223.7017364501953, "step": 36110 }, { "epoch": 2.09, "grad_norm": 65.67659759521484, "learning_rate": 0.00030395913154533845, "logits/chosen": -16.085494995117188, "logits/rejected": -16.472078323364258, "logps/chosen": -3015.18505859375, "logps/rejected": -2921.67236328125, "loss": 4.0022, "rewards/accuracies": 0.5, "rewards/chosen": -210.90713500976562, "rewards/margins": 8.209329605102539, "rewards/rejected": -219.11642456054688, "step": 36120 }, { "epoch": 2.09, "grad_norm": 0.11505994945764542, "learning_rate": 0.00030376562560470606, "logits/chosen": -15.424097061157227, "logits/rejected": -16.37932586669922, "logps/chosen": -3134.23193359375, "logps/rejected": -3069.734130859375, "loss": 2.9815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.38357543945312, "rewards/margins": 1.682537317276001, "rewards/rejected": -169.06610107421875, "step": 36130 }, { "epoch": 2.09, "grad_norm": 74.3900375366211, "learning_rate": 0.00030357211966407373, "logits/chosen": -12.958944320678711, "logits/rejected": -12.639570236206055, "logps/chosen": -2868.108154296875, "logps/rejected": -3049.01904296875, "loss": 1.4948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.00286865234375, "rewards/margins": 4.261757850646973, "rewards/rejected": -164.26463317871094, "step": 36140 }, { "epoch": 2.09, "grad_norm": 0.27649617195129395, "learning_rate": 0.00030337861372344135, "logits/chosen": -15.477375984191895, "logits/rejected": -15.511686325073242, "logps/chosen": -2896.23779296875, "logps/rejected": -2753.0478515625, "loss": 0.8524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -183.70118713378906, "rewards/margins": 7.514407157897949, "rewards/rejected": -191.21560668945312, "step": 36150 }, { "epoch": 2.09, "grad_norm": 8.427161191093546e-08, "learning_rate": 0.00030318510778280897, "logits/chosen": -13.286888122558594, "logits/rejected": -12.841215133666992, "logps/chosen": -3376.680908203125, "logps/rejected": -2978.05810546875, "loss": 5.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.93124389648438, "rewards/margins": 3.6034045219421387, "rewards/rejected": -153.53465270996094, "step": 36160 }, { "epoch": 2.09, "grad_norm": 1.1651887893676758, "learning_rate": 0.00030299160184217653, "logits/chosen": -15.25994873046875, "logits/rejected": -14.957727432250977, "logps/chosen": -2914.370361328125, "logps/rejected": -2927.654296875, "loss": 5.1288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -164.25094604492188, "rewards/margins": 3.3235023021698, "rewards/rejected": -167.57444763183594, "step": 36170 }, { "epoch": 2.09, "grad_norm": 85.21482849121094, "learning_rate": 0.00030279809590154415, "logits/chosen": -17.781864166259766, "logits/rejected": -17.792341232299805, "logps/chosen": -2941.190185546875, "logps/rejected": -2656.056640625, "loss": 1.141, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -194.27828979492188, "rewards/margins": 8.81926155090332, "rewards/rejected": -203.09756469726562, "step": 36180 }, { "epoch": 2.09, "grad_norm": 10.760111808776855, "learning_rate": 0.00030260458996091177, "logits/chosen": -17.549650192260742, "logits/rejected": -17.86461639404297, "logps/chosen": -2765.76123046875, "logps/rejected": -2627.760498046875, "loss": 18.3432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.2528533935547, "rewards/margins": -11.256501197814941, "rewards/rejected": -197.99636840820312, "step": 36190 }, { "epoch": 2.1, "grad_norm": 0.003102540737017989, "learning_rate": 0.00030241108402027944, "logits/chosen": -16.845243453979492, "logits/rejected": -17.549076080322266, "logps/chosen": -2451.7744140625, "logps/rejected": -2428.483154296875, "loss": 1.5586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.21434020996094, "rewards/margins": 5.962320804595947, "rewards/rejected": -186.17665100097656, "step": 36200 }, { "epoch": 2.1, "grad_norm": 6.245540618010637e-08, "learning_rate": 0.00030221757807964705, "logits/chosen": -12.526134490966797, "logits/rejected": -12.502592086791992, "logps/chosen": -3125.07470703125, "logps/rejected": -2566.534423828125, "loss": 1.7205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -96.63917541503906, "rewards/margins": 25.48491096496582, "rewards/rejected": -122.12408447265625, "step": 36210 }, { "epoch": 2.1, "grad_norm": 7.251438050015751e-15, "learning_rate": 0.00030202407213901467, "logits/chosen": -16.553647994995117, "logits/rejected": -17.07596778869629, "logps/chosen": -2867.24365234375, "logps/rejected": -2768.607421875, "loss": 0.2237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.84609985351562, "rewards/margins": 11.33116626739502, "rewards/rejected": -162.17726135253906, "step": 36220 }, { "epoch": 2.1, "grad_norm": 1.0199164152145386, "learning_rate": 0.0003018305661983823, "logits/chosen": -17.78522300720215, "logits/rejected": -18.77560806274414, "logps/chosen": -2566.32763671875, "logps/rejected": -2470.68603515625, "loss": 15.8077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -228.29983520507812, "rewards/margins": -6.239813327789307, "rewards/rejected": -222.0600128173828, "step": 36230 }, { "epoch": 2.1, "grad_norm": 0.10676468908786774, "learning_rate": 0.0003016370602577499, "logits/chosen": -13.816787719726562, "logits/rejected": -14.15454387664795, "logps/chosen": -2737.986328125, "logps/rejected": -2818.48486328125, "loss": 6.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -147.55752563476562, "rewards/margins": 2.3844501972198486, "rewards/rejected": -149.94198608398438, "step": 36240 }, { "epoch": 2.1, "grad_norm": 8.16812515258789, "learning_rate": 0.0003014435543171176, "logits/chosen": -16.790796279907227, "logits/rejected": -17.006502151489258, "logps/chosen": -2813.363525390625, "logps/rejected": -2785.985595703125, "loss": 2.1253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -242.561767578125, "rewards/margins": 12.284324645996094, "rewards/rejected": -254.8460693359375, "step": 36250 }, { "epoch": 2.1, "grad_norm": 1.172375974078932e-09, "learning_rate": 0.0003012500483764852, "logits/chosen": -15.7247314453125, "logits/rejected": -16.632036209106445, "logps/chosen": -2922.22412109375, "logps/rejected": -2734.82275390625, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": -153.95681762695312, "rewards/margins": 24.35379409790039, "rewards/rejected": -178.31060791015625, "step": 36260 }, { "epoch": 2.1, "grad_norm": 46.27276611328125, "learning_rate": 0.0003010565424358528, "logits/chosen": -16.230792999267578, "logits/rejected": -16.605920791625977, "logps/chosen": -2531.23486328125, "logps/rejected": -2376.76171875, "loss": 0.4465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -147.77432250976562, "rewards/margins": 11.57752799987793, "rewards/rejected": -159.35183715820312, "step": 36270 }, { "epoch": 2.1, "grad_norm": 4.426725208617199e-09, "learning_rate": 0.0003008630364952204, "logits/chosen": -15.720471382141113, "logits/rejected": -15.933954238891602, "logps/chosen": -2579.309814453125, "logps/rejected": -2272.3935546875, "loss": 0.1523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -121.11833190917969, "rewards/margins": 14.787572860717773, "rewards/rejected": -135.90589904785156, "step": 36280 }, { "epoch": 2.1, "grad_norm": 37.7388916015625, "learning_rate": 0.000300669530554588, "logits/chosen": -16.314157485961914, "logits/rejected": -16.26815414428711, "logps/chosen": -2582.712158203125, "logps/rejected": -2435.36181640625, "loss": 1.7165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -115.0800552368164, "rewards/margins": 12.712911605834961, "rewards/rejected": -127.79295349121094, "step": 36290 }, { "epoch": 2.1, "grad_norm": 1.7163204191206205e-09, "learning_rate": 0.00030047602461395566, "logits/chosen": -17.40319061279297, "logits/rejected": -18.08012580871582, "logps/chosen": -2466.30419921875, "logps/rejected": -2146.95849609375, "loss": 16.3117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.24745178222656, "rewards/margins": -6.29625940322876, "rewards/rejected": -169.951171875, "step": 36300 }, { "epoch": 2.1, "grad_norm": 27.626375198364258, "learning_rate": 0.0003002825186733233, "logits/chosen": -17.6294002532959, "logits/rejected": -18.16985321044922, "logps/chosen": -2771.53857421875, "logps/rejected": -2934.021484375, "loss": 3.1635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.1511688232422, "rewards/margins": 10.312104225158691, "rewards/rejected": -155.46328735351562, "step": 36310 }, { "epoch": 2.1, "grad_norm": 1.3876989835015507e-11, "learning_rate": 0.0003000890127326909, "logits/chosen": -17.032516479492188, "logits/rejected": -17.04949378967285, "logps/chosen": -2965.690673828125, "logps/rejected": -2721.47021484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -167.6075439453125, "rewards/margins": 21.976938247680664, "rewards/rejected": -189.58447265625, "step": 36320 }, { "epoch": 2.1, "grad_norm": 4.593719610430924e-13, "learning_rate": 0.0002998955067920585, "logits/chosen": -19.513071060180664, "logits/rejected": -18.956741333007812, "logps/chosen": -2933.08544921875, "logps/rejected": -2937.85546875, "loss": 3.827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -187.14125061035156, "rewards/margins": 14.117630004882812, "rewards/rejected": -201.25888061523438, "step": 36330 }, { "epoch": 2.1, "grad_norm": 4.363163952803006e-06, "learning_rate": 0.00029970200085142613, "logits/chosen": -14.634387016296387, "logits/rejected": -14.47724437713623, "logps/chosen": -2892.49951171875, "logps/rejected": -3019.1630859375, "loss": 8.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.36466217041016, "rewards/margins": 3.7309470176696777, "rewards/rejected": -119.09561920166016, "step": 36340 }, { "epoch": 2.1, "grad_norm": 102.20781707763672, "learning_rate": 0.0002995084949107938, "logits/chosen": -16.05976676940918, "logits/rejected": -16.197566986083984, "logps/chosen": -3240.73779296875, "logps/rejected": -2963.73974609375, "loss": 3.4204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.47006225585938, "rewards/margins": 6.046612739562988, "rewards/rejected": -148.51669311523438, "step": 36350 }, { "epoch": 2.1, "grad_norm": 0.025807002559304237, "learning_rate": 0.0002993149889701614, "logits/chosen": -15.563400268554688, "logits/rejected": -15.286517143249512, "logps/chosen": -3076.428466796875, "logps/rejected": -3285.616455078125, "loss": 3.0398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -104.9876708984375, "rewards/margins": 4.225209712982178, "rewards/rejected": -109.2128677368164, "step": 36360 }, { "epoch": 2.11, "grad_norm": 3.189317119332491e-19, "learning_rate": 0.00029912148302952904, "logits/chosen": -16.6522159576416, "logits/rejected": -16.88469886779785, "logps/chosen": -2633.84130859375, "logps/rejected": -2412.569091796875, "loss": 2.3725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -182.54299926757812, "rewards/margins": 19.80459976196289, "rewards/rejected": -202.3476104736328, "step": 36370 }, { "epoch": 2.11, "grad_norm": 5.966459097617172e-12, "learning_rate": 0.00029892797708889666, "logits/chosen": -16.198257446289062, "logits/rejected": -16.181743621826172, "logps/chosen": -2804.970947265625, "logps/rejected": -2640.15771484375, "loss": 0.7448, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -164.85635375976562, "rewards/margins": 13.513089179992676, "rewards/rejected": -178.3694305419922, "step": 36380 }, { "epoch": 2.11, "grad_norm": 38.72802734375, "learning_rate": 0.0002987344711482642, "logits/chosen": -16.39793586730957, "logits/rejected": -16.42288589477539, "logps/chosen": -2960.744140625, "logps/rejected": -2617.10546875, "loss": 6.5259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -143.32737731933594, "rewards/margins": 6.077696800231934, "rewards/rejected": -149.40509033203125, "step": 36390 }, { "epoch": 2.11, "grad_norm": 0.005969444755464792, "learning_rate": 0.00029854096520763184, "logits/chosen": -17.41448402404785, "logits/rejected": -17.180368423461914, "logps/chosen": -2660.944580078125, "logps/rejected": -2671.37255859375, "loss": 5.5913, "rewards/accuracies": 0.5, "rewards/chosen": -229.3899383544922, "rewards/margins": 2.9183449745178223, "rewards/rejected": -232.30825805664062, "step": 36400 }, { "epoch": 2.11, "grad_norm": 0.013721039518713951, "learning_rate": 0.0002983474592669995, "logits/chosen": -18.372425079345703, "logits/rejected": -17.33340072631836, "logps/chosen": -2514.26708984375, "logps/rejected": -2985.293701171875, "loss": 3.703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -202.54139709472656, "rewards/margins": 3.161815643310547, "rewards/rejected": -205.70321655273438, "step": 36410 }, { "epoch": 2.11, "grad_norm": 0.2820938229560852, "learning_rate": 0.0002981539533263671, "logits/chosen": -18.627017974853516, "logits/rejected": -18.6152400970459, "logps/chosen": -2713.596435546875, "logps/rejected": -2729.125732421875, "loss": 1.8041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.05648803710938, "rewards/margins": 9.83088207244873, "rewards/rejected": -174.88735961914062, "step": 36420 }, { "epoch": 2.11, "grad_norm": 0.4476950764656067, "learning_rate": 0.00029796044738573474, "logits/chosen": -19.269227981567383, "logits/rejected": -18.47098159790039, "logps/chosen": -3151.166015625, "logps/rejected": -3212.803955078125, "loss": 3.075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -250.45620727539062, "rewards/margins": 3.793959140777588, "rewards/rejected": -254.25015258789062, "step": 36430 }, { "epoch": 2.11, "grad_norm": 1.18143320647357e-12, "learning_rate": 0.00029776694144510236, "logits/chosen": -14.096944808959961, "logits/rejected": -14.294400215148926, "logps/chosen": -3510.190673828125, "logps/rejected": -3526.74853515625, "loss": 1.7373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -147.16360473632812, "rewards/margins": 13.883878707885742, "rewards/rejected": -161.04751586914062, "step": 36440 }, { "epoch": 2.11, "grad_norm": 0.0, "learning_rate": 0.00029757343550447, "logits/chosen": -18.88937759399414, "logits/rejected": -20.431285858154297, "logps/chosen": -3144.87939453125, "logps/rejected": -3155.029541015625, "loss": 24.1355, "rewards/accuracies": 0.5, "rewards/chosen": -225.08773803710938, "rewards/margins": -7.913581848144531, "rewards/rejected": -217.17416381835938, "step": 36450 }, { "epoch": 2.11, "grad_norm": 5.331049404005972e-12, "learning_rate": 0.00029737992956383765, "logits/chosen": -19.26228904724121, "logits/rejected": -19.972225189208984, "logps/chosen": -3273.542236328125, "logps/rejected": -3336.33544921875, "loss": 3.8135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -257.33905029296875, "rewards/margins": 5.692330360412598, "rewards/rejected": -263.0313415527344, "step": 36460 }, { "epoch": 2.11, "grad_norm": 0.0, "learning_rate": 0.00029718642362320526, "logits/chosen": -18.971187591552734, "logits/rejected": -19.082754135131836, "logps/chosen": -2965.66357421875, "logps/rejected": -3124.67724609375, "loss": 1.3895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -227.90670776367188, "rewards/margins": 18.095678329467773, "rewards/rejected": -246.0023956298828, "step": 36470 }, { "epoch": 2.11, "grad_norm": 1.2321284884819761e-05, "learning_rate": 0.0002969929176825729, "logits/chosen": -15.442916870117188, "logits/rejected": -15.453580856323242, "logps/chosen": -3463.400390625, "logps/rejected": -3451.207763671875, "loss": 2.8997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -201.38665771484375, "rewards/margins": 18.633920669555664, "rewards/rejected": -220.0205841064453, "step": 36480 }, { "epoch": 2.11, "grad_norm": 1.5068901777267456, "learning_rate": 0.0002967994117419405, "logits/chosen": -16.11033821105957, "logits/rejected": -16.377094268798828, "logps/chosen": -2931.7119140625, "logps/rejected": -2589.12255859375, "loss": 2.5287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -223.97653198242188, "rewards/margins": 4.463065147399902, "rewards/rejected": -228.4396209716797, "step": 36490 }, { "epoch": 2.11, "grad_norm": 4.5278795823833207e-07, "learning_rate": 0.00029660590580130806, "logits/chosen": -15.020662307739258, "logits/rejected": -15.073938369750977, "logps/chosen": -2911.0234375, "logps/rejected": -3030.60595703125, "loss": 1.5156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -160.9805908203125, "rewards/margins": 10.899126052856445, "rewards/rejected": -171.87969970703125, "step": 36500 }, { "epoch": 2.11, "grad_norm": 0.3092897832393646, "learning_rate": 0.00029641239986067573, "logits/chosen": -15.438023567199707, "logits/rejected": -15.364294052124023, "logps/chosen": -3147.626220703125, "logps/rejected": -3131.26806640625, "loss": 0.1409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.30809020996094, "rewards/margins": 9.575217247009277, "rewards/rejected": -176.88333129882812, "step": 36510 }, { "epoch": 2.11, "grad_norm": 0.0012026976328343153, "learning_rate": 0.00029621889392004335, "logits/chosen": -17.255760192871094, "logits/rejected": -20.31900978088379, "logps/chosen": -3075.13330078125, "logps/rejected": -2785.61865234375, "loss": 2.1497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -202.31459045410156, "rewards/margins": 7.352414131164551, "rewards/rejected": -209.66702270507812, "step": 36520 }, { "epoch": 2.11, "grad_norm": 17.499238967895508, "learning_rate": 0.00029602538797941097, "logits/chosen": -20.498626708984375, "logits/rejected": -20.151147842407227, "logps/chosen": -2810.985107421875, "logps/rejected": -2826.882568359375, "loss": 0.7465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -252.24942016601562, "rewards/margins": 6.1266584396362305, "rewards/rejected": -258.3760986328125, "step": 36530 }, { "epoch": 2.12, "grad_norm": 110.53749084472656, "learning_rate": 0.0002958318820387786, "logits/chosen": -18.279010772705078, "logits/rejected": -18.199756622314453, "logps/chosen": -2890.770263671875, "logps/rejected": -2553.120849609375, "loss": 3.739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -215.6990203857422, "rewards/margins": 1.906593680381775, "rewards/rejected": -217.60562133789062, "step": 36540 }, { "epoch": 2.12, "grad_norm": 15.006585121154785, "learning_rate": 0.0002956383760981462, "logits/chosen": -17.238771438598633, "logits/rejected": -17.450231552124023, "logps/chosen": -2803.582275390625, "logps/rejected": -2897.80712890625, "loss": 5.2684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.3409423828125, "rewards/margins": 3.0682272911071777, "rewards/rejected": -212.4091796875, "step": 36550 }, { "epoch": 2.12, "grad_norm": 9.112763876828467e-08, "learning_rate": 0.0002954448701575139, "logits/chosen": -13.189977645874023, "logits/rejected": -12.98041820526123, "logps/chosen": -3208.393798828125, "logps/rejected": -2836.642578125, "loss": 6.0734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.70783996582031, "rewards/margins": 3.9441521167755127, "rewards/rejected": -131.65199279785156, "step": 36560 }, { "epoch": 2.12, "grad_norm": 12.577199935913086, "learning_rate": 0.0002952513642168815, "logits/chosen": -15.2078857421875, "logits/rejected": -15.517145156860352, "logps/chosen": -2988.4013671875, "logps/rejected": -2771.00341796875, "loss": 1.0783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -161.31988525390625, "rewards/margins": 11.879866600036621, "rewards/rejected": -173.19973754882812, "step": 36570 }, { "epoch": 2.12, "grad_norm": 42.43686294555664, "learning_rate": 0.0002950578582762491, "logits/chosen": -16.020402908325195, "logits/rejected": -16.810293197631836, "logps/chosen": -2710.848388671875, "logps/rejected": -2829.382568359375, "loss": 2.1184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.40402221679688, "rewards/margins": 7.293145656585693, "rewards/rejected": -159.69717407226562, "step": 36580 }, { "epoch": 2.12, "grad_norm": 148.154296875, "learning_rate": 0.0002948643523356167, "logits/chosen": -14.510988235473633, "logits/rejected": -14.910041809082031, "logps/chosen": -3200.767578125, "logps/rejected": -2584.161376953125, "loss": 19.1086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.9397735595703, "rewards/margins": -9.71723747253418, "rewards/rejected": -132.2225341796875, "step": 36590 }, { "epoch": 2.12, "grad_norm": 0.13449087738990784, "learning_rate": 0.00029467084639498434, "logits/chosen": -20.170047760009766, "logits/rejected": -19.999982833862305, "logps/chosen": -2552.75634765625, "logps/rejected": -2620.858642578125, "loss": 2.5564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.916015625, "rewards/margins": 19.950214385986328, "rewards/rejected": -214.86624145507812, "step": 36600 }, { "epoch": 2.12, "grad_norm": 47.996036529541016, "learning_rate": 0.0002944773404543519, "logits/chosen": -15.800409317016602, "logits/rejected": -15.654756546020508, "logps/chosen": -3131.1591796875, "logps/rejected": -2479.91064453125, "loss": 2.9102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.3314971923828, "rewards/margins": 8.059371948242188, "rewards/rejected": -144.390869140625, "step": 36610 }, { "epoch": 2.12, "grad_norm": 0.00029596174135804176, "learning_rate": 0.0002942838345137196, "logits/chosen": -17.70126724243164, "logits/rejected": -18.262813568115234, "logps/chosen": -2896.3603515625, "logps/rejected": -2903.068359375, "loss": 3.2264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -183.27072143554688, "rewards/margins": 4.058542728424072, "rewards/rejected": -187.32925415039062, "step": 36620 }, { "epoch": 2.12, "grad_norm": 13.660171508789062, "learning_rate": 0.0002940903285730872, "logits/chosen": -20.742427825927734, "logits/rejected": -21.591556549072266, "logps/chosen": -2917.99072265625, "logps/rejected": -2651.980712890625, "loss": 4.9161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.45384216308594, "rewards/margins": 8.469090461730957, "rewards/rejected": -201.92294311523438, "step": 36630 }, { "epoch": 2.12, "grad_norm": 97.20475769042969, "learning_rate": 0.0002938968226324548, "logits/chosen": -22.434017181396484, "logits/rejected": -22.646562576293945, "logps/chosen": -2863.919677734375, "logps/rejected": -2659.240478515625, "loss": 1.4764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -216.85037231445312, "rewards/margins": 7.267269134521484, "rewards/rejected": -224.11764526367188, "step": 36640 }, { "epoch": 2.12, "grad_norm": 0.18581487238407135, "learning_rate": 0.00029370331669182243, "logits/chosen": -18.203767776489258, "logits/rejected": -18.785268783569336, "logps/chosen": -2656.083740234375, "logps/rejected": -2539.5224609375, "loss": 3.3138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -193.7327117919922, "rewards/margins": 1.1611511707305908, "rewards/rejected": -194.8938751220703, "step": 36650 }, { "epoch": 2.12, "grad_norm": 1.7829428254970137e-12, "learning_rate": 0.00029350981075119004, "logits/chosen": -18.037139892578125, "logits/rejected": -18.11006736755371, "logps/chosen": -2805.029052734375, "logps/rejected": -2686.068115234375, "loss": 3.5564, "rewards/accuracies": 0.5, "rewards/chosen": -130.0657501220703, "rewards/margins": 3.0925700664520264, "rewards/rejected": -133.15830993652344, "step": 36660 }, { "epoch": 2.12, "grad_norm": 0.7865509986877441, "learning_rate": 0.0002933163048105577, "logits/chosen": -19.946659088134766, "logits/rejected": -21.05929183959961, "logps/chosen": -2998.11083984375, "logps/rejected": -2761.771240234375, "loss": 2.0638, "rewards/accuracies": 0.5, "rewards/chosen": -189.498046875, "rewards/margins": 2.6346306800842285, "rewards/rejected": -192.13265991210938, "step": 36670 }, { "epoch": 2.12, "grad_norm": 6.445868968963623, "learning_rate": 0.00029312279886992533, "logits/chosen": -21.784475326538086, "logits/rejected": -21.722707748413086, "logps/chosen": -2471.835693359375, "logps/rejected": -2210.572265625, "loss": 0.9362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -145.53814697265625, "rewards/margins": 14.145685195922852, "rewards/rejected": -159.683837890625, "step": 36680 }, { "epoch": 2.12, "grad_norm": 234.6046905517578, "learning_rate": 0.00029292929292929295, "logits/chosen": -17.51467514038086, "logits/rejected": -17.74322509765625, "logps/chosen": -2771.6181640625, "logps/rejected": -2655.863525390625, "loss": 9.1164, "rewards/accuracies": 0.5, "rewards/chosen": -95.59024810791016, "rewards/margins": -1.6456260681152344, "rewards/rejected": -93.94461059570312, "step": 36690 }, { "epoch": 2.12, "grad_norm": 1.4533307890474134e-10, "learning_rate": 0.00029273578698866057, "logits/chosen": -20.56124496459961, "logits/rejected": -20.27176856994629, "logps/chosen": -2843.815673828125, "logps/rejected": -2573.468017578125, "loss": 2.4327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -110.36544036865234, "rewards/margins": 9.69722843170166, "rewards/rejected": -120.06269836425781, "step": 36700 }, { "epoch": 2.12, "grad_norm": 1.8929344074473334e-15, "learning_rate": 0.0002925422810480282, "logits/chosen": -17.44363021850586, "logits/rejected": -17.713802337646484, "logps/chosen": -2817.10498046875, "logps/rejected": -2850.35595703125, "loss": 2.2236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -99.0173568725586, "rewards/margins": 12.19968032836914, "rewards/rejected": -111.217041015625, "step": 36710 }, { "epoch": 2.13, "grad_norm": 3.253227548460913e-11, "learning_rate": 0.0002923487751073958, "logits/chosen": -19.022457122802734, "logits/rejected": -19.677152633666992, "logps/chosen": -2912.502685546875, "logps/rejected": -2836.304443359375, "loss": 0.8703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.02365112304688, "rewards/margins": 12.994634628295898, "rewards/rejected": -175.018310546875, "step": 36720 }, { "epoch": 2.13, "grad_norm": 43.85832214355469, "learning_rate": 0.0002921552691667634, "logits/chosen": -17.612436294555664, "logits/rejected": -17.746551513671875, "logps/chosen": -3008.576171875, "logps/rejected": -2559.224609375, "loss": 2.7328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -124.38134765625, "rewards/margins": 11.722513198852539, "rewards/rejected": -136.10385131835938, "step": 36730 }, { "epoch": 2.13, "grad_norm": 0.00015477817214559764, "learning_rate": 0.00029196176322613104, "logits/chosen": -23.727466583251953, "logits/rejected": -25.091283798217773, "logps/chosen": -3022.03564453125, "logps/rejected": -3056.0087890625, "loss": 2.3628, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -265.73663330078125, "rewards/margins": 7.203774929046631, "rewards/rejected": -272.94036865234375, "step": 36740 }, { "epoch": 2.13, "grad_norm": 6.52141011414642e-07, "learning_rate": 0.00029176825728549865, "logits/chosen": -20.514324188232422, "logits/rejected": -21.530399322509766, "logps/chosen": -2947.40625, "logps/rejected": -2891.97705078125, "loss": 6.78, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.15673828125, "rewards/margins": 8.004971504211426, "rewards/rejected": -141.1616973876953, "step": 36750 }, { "epoch": 2.13, "grad_norm": 2.7282521841698326e-07, "learning_rate": 0.00029157475134486627, "logits/chosen": -20.9162540435791, "logits/rejected": -20.58258056640625, "logps/chosen": -2675.22412109375, "logps/rejected": -2620.968017578125, "loss": 1.4131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.043212890625, "rewards/margins": 7.237046718597412, "rewards/rejected": -176.2802734375, "step": 36760 }, { "epoch": 2.13, "grad_norm": 63.25688552856445, "learning_rate": 0.00029138124540423394, "logits/chosen": -19.631567001342773, "logits/rejected": -21.253643035888672, "logps/chosen": -2895.958984375, "logps/rejected": -2942.923095703125, "loss": 5.435, "rewards/accuracies": 0.5, "rewards/chosen": -214.5814971923828, "rewards/margins": -1.7501754760742188, "rewards/rejected": -212.8313446044922, "step": 36770 }, { "epoch": 2.13, "grad_norm": 0.01809201017022133, "learning_rate": 0.00029118773946360156, "logits/chosen": -18.612089157104492, "logits/rejected": -18.71330451965332, "logps/chosen": -2902.60498046875, "logps/rejected": -2957.708984375, "loss": 1.1664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.75091552734375, "rewards/margins": 11.925711631774902, "rewards/rejected": -148.67662048339844, "step": 36780 }, { "epoch": 2.13, "grad_norm": 0.1356481909751892, "learning_rate": 0.0002909942335229692, "logits/chosen": -19.646617889404297, "logits/rejected": -21.007320404052734, "logps/chosen": -2870.831298828125, "logps/rejected": -2912.10595703125, "loss": 3.266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.4967498779297, "rewards/margins": 2.259535312652588, "rewards/rejected": -170.7563018798828, "step": 36790 }, { "epoch": 2.13, "grad_norm": 47.82028579711914, "learning_rate": 0.0002908007275823368, "logits/chosen": -20.315338134765625, "logits/rejected": -20.33719825744629, "logps/chosen": -2786.98828125, "logps/rejected": -2717.181884765625, "loss": 5.9728, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -206.9474639892578, "rewards/margins": -1.831766128540039, "rewards/rejected": -205.11569213867188, "step": 36800 }, { "epoch": 2.13, "grad_norm": 0.04029979929327965, "learning_rate": 0.0002906072216417044, "logits/chosen": -16.21891212463379, "logits/rejected": -17.525035858154297, "logps/chosen": -3325.97265625, "logps/rejected": -2958.13330078125, "loss": 3.6812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -106.237060546875, "rewards/margins": 8.84860897064209, "rewards/rejected": -115.0856704711914, "step": 36810 }, { "epoch": 2.13, "grad_norm": 0.0016295799287036061, "learning_rate": 0.0002904137157010721, "logits/chosen": -19.182125091552734, "logits/rejected": -19.168596267700195, "logps/chosen": -2813.617431640625, "logps/rejected": -2844.11279296875, "loss": 5.6137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -213.12722778320312, "rewards/margins": 3.716733932495117, "rewards/rejected": -216.8439483642578, "step": 36820 }, { "epoch": 2.13, "grad_norm": 0.00022653753694612533, "learning_rate": 0.00029022020976043965, "logits/chosen": -16.691165924072266, "logits/rejected": -16.810626983642578, "logps/chosen": -3155.769287109375, "logps/rejected": -3052.098876953125, "loss": 10.223, "rewards/accuracies": 0.5, "rewards/chosen": -113.55043029785156, "rewards/margins": -3.615154266357422, "rewards/rejected": -109.93526458740234, "step": 36830 }, { "epoch": 2.13, "grad_norm": 43.0529670715332, "learning_rate": 0.00029002670381980726, "logits/chosen": -20.514781951904297, "logits/rejected": -21.17337989807129, "logps/chosen": -2774.651611328125, "logps/rejected": -2712.34375, "loss": 4.2966, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -218.1107940673828, "rewards/margins": 0.6325100064277649, "rewards/rejected": -218.7433319091797, "step": 36840 }, { "epoch": 2.13, "grad_norm": 2.0496782049406193e-09, "learning_rate": 0.0002898331978791749, "logits/chosen": -19.027263641357422, "logits/rejected": -19.609968185424805, "logps/chosen": -3125.9345703125, "logps/rejected": -2659.1669921875, "loss": 3.148, "rewards/accuracies": 0.5, "rewards/chosen": -142.21072387695312, "rewards/margins": 7.129513740539551, "rewards/rejected": -149.34022521972656, "step": 36850 }, { "epoch": 2.13, "grad_norm": 0.1006850004196167, "learning_rate": 0.0002896396919385425, "logits/chosen": -17.90591049194336, "logits/rejected": -17.984127044677734, "logps/chosen": -2623.575439453125, "logps/rejected": -2487.81689453125, "loss": 0.4148, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -163.69525146484375, "rewards/margins": 7.504796028137207, "rewards/rejected": -171.20004272460938, "step": 36860 }, { "epoch": 2.13, "grad_norm": 75.89276885986328, "learning_rate": 0.0002894461859979101, "logits/chosen": -19.338214874267578, "logits/rejected": -19.741832733154297, "logps/chosen": -3028.200927734375, "logps/rejected": -2740.8310546875, "loss": 3.5162, "rewards/accuracies": 0.5, "rewards/chosen": -192.904052734375, "rewards/margins": -0.02602539025247097, "rewards/rejected": -192.87802124023438, "step": 36870 }, { "epoch": 2.13, "grad_norm": 0.0, "learning_rate": 0.0002892526800572778, "logits/chosen": -18.666336059570312, "logits/rejected": -19.189435958862305, "logps/chosen": -2972.8349609375, "logps/rejected": -2772.68359375, "loss": 1.4898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -215.7294158935547, "rewards/margins": 25.62765121459961, "rewards/rejected": -241.35708618164062, "step": 36880 }, { "epoch": 2.14, "grad_norm": 312.6156311035156, "learning_rate": 0.0002890591741166454, "logits/chosen": -19.400836944580078, "logits/rejected": -21.74838638305664, "logps/chosen": -2396.71337890625, "logps/rejected": -2862.98583984375, "loss": 5.7331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -186.3760986328125, "rewards/margins": 4.2241387367248535, "rewards/rejected": -190.60023498535156, "step": 36890 }, { "epoch": 2.14, "grad_norm": 24.045024871826172, "learning_rate": 0.000288865668176013, "logits/chosen": -16.148170471191406, "logits/rejected": -16.79329490661621, "logps/chosen": -2759.66064453125, "logps/rejected": -2565.61669921875, "loss": 2.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.3321990966797, "rewards/margins": 12.275555610656738, "rewards/rejected": -162.60775756835938, "step": 36900 }, { "epoch": 2.14, "grad_norm": 2.59479207898039e-07, "learning_rate": 0.00028867216223538064, "logits/chosen": -15.679040908813477, "logits/rejected": -18.122760772705078, "logps/chosen": -3500.59619140625, "logps/rejected": -3268.79345703125, "loss": 7.8937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -128.92880249023438, "rewards/margins": 7.098771095275879, "rewards/rejected": -136.02755737304688, "step": 36910 }, { "epoch": 2.14, "grad_norm": 2.9215705581009388e-05, "learning_rate": 0.00028847865629474825, "logits/chosen": -22.156017303466797, "logits/rejected": -24.22406768798828, "logps/chosen": -2812.335205078125, "logps/rejected": -2627.942138671875, "loss": 22.245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -255.3347930908203, "rewards/margins": -8.055105209350586, "rewards/rejected": -247.27969360351562, "step": 36920 }, { "epoch": 2.14, "grad_norm": 82.72757720947266, "learning_rate": 0.0002882851503541159, "logits/chosen": -14.906147956848145, "logits/rejected": -16.40913200378418, "logps/chosen": -3286.35400390625, "logps/rejected": -3057.38818359375, "loss": 5.9671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.76734924316406, "rewards/margins": 10.972007751464844, "rewards/rejected": -132.73934936523438, "step": 36930 }, { "epoch": 2.14, "grad_norm": 60.04438400268555, "learning_rate": 0.0002880916444134835, "logits/chosen": -19.574758529663086, "logits/rejected": -19.39198875427246, "logps/chosen": -2778.202880859375, "logps/rejected": -2760.018310546875, "loss": 6.9811, "rewards/accuracies": 0.5, "rewards/chosen": -215.8634033203125, "rewards/margins": -2.0055601596832275, "rewards/rejected": -213.8578643798828, "step": 36940 }, { "epoch": 2.14, "grad_norm": 0.0, "learning_rate": 0.0002878981384728511, "logits/chosen": -18.052814483642578, "logits/rejected": -18.0823917388916, "logps/chosen": -3144.995361328125, "logps/rejected": -3249.144287109375, "loss": 0.6008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -201.20217895507812, "rewards/margins": 26.82479476928711, "rewards/rejected": -228.0269775390625, "step": 36950 }, { "epoch": 2.14, "grad_norm": 16.55097770690918, "learning_rate": 0.0002877046325322187, "logits/chosen": -15.344978332519531, "logits/rejected": -16.161510467529297, "logps/chosen": -2890.74365234375, "logps/rejected": -2867.4130859375, "loss": 1.8988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -99.58747100830078, "rewards/margins": 11.897417068481445, "rewards/rejected": -111.4848861694336, "step": 36960 }, { "epoch": 2.14, "grad_norm": 1.0064421762353074e-13, "learning_rate": 0.00028751112659158634, "logits/chosen": -19.291179656982422, "logits/rejected": -18.85585594177246, "logps/chosen": -2787.4951171875, "logps/rejected": -2764.6884765625, "loss": 1.4959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -208.97824096679688, "rewards/margins": 12.829391479492188, "rewards/rejected": -221.80764770507812, "step": 36970 }, { "epoch": 2.14, "grad_norm": 0.0, "learning_rate": 0.000287317620650954, "logits/chosen": -20.607234954833984, "logits/rejected": -23.11486053466797, "logps/chosen": -2954.15478515625, "logps/rejected": -3027.201171875, "loss": 2.3066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -231.3438262939453, "rewards/margins": 11.634370803833008, "rewards/rejected": -242.9781951904297, "step": 36980 }, { "epoch": 2.14, "grad_norm": 0.21125715970993042, "learning_rate": 0.00028712411471032163, "logits/chosen": -17.148113250732422, "logits/rejected": -18.086029052734375, "logps/chosen": -3211.84326171875, "logps/rejected": -3262.8388671875, "loss": 1.2501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.83889770507812, "rewards/margins": 11.763849258422852, "rewards/rejected": -183.60276794433594, "step": 36990 }, { "epoch": 2.14, "grad_norm": 2.0133549341605494e-11, "learning_rate": 0.00028693060876968925, "logits/chosen": -15.487386703491211, "logits/rejected": -15.48418140411377, "logps/chosen": -2934.09423828125, "logps/rejected": -2622.096923828125, "loss": 3.2337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -179.22909545898438, "rewards/margins": 5.428475856781006, "rewards/rejected": -184.65757751464844, "step": 37000 }, { "epoch": 2.14, "grad_norm": 4.968593597412109, "learning_rate": 0.00028673710282905686, "logits/chosen": -20.372610092163086, "logits/rejected": -20.27640151977539, "logps/chosen": -2631.9287109375, "logps/rejected": -2589.24560546875, "loss": 4.6691, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -236.86233520507812, "rewards/margins": 1.6046326160430908, "rewards/rejected": -238.46694946289062, "step": 37010 }, { "epoch": 2.14, "grad_norm": 0.0008708464447408915, "learning_rate": 0.0002865435968884245, "logits/chosen": -14.059516906738281, "logits/rejected": -13.685857772827148, "logps/chosen": -3315.841796875, "logps/rejected": -2859.91748046875, "loss": 3.5348, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -138.57264709472656, "rewards/margins": 17.224876403808594, "rewards/rejected": -155.79751586914062, "step": 37020 }, { "epoch": 2.14, "grad_norm": 1.6877760300459954e-09, "learning_rate": 0.00028635009094779215, "logits/chosen": -11.834406852722168, "logits/rejected": -12.01198673248291, "logps/chosen": -3358.375732421875, "logps/rejected": -3199.078857421875, "loss": 8.7697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.82742691040039, "rewards/margins": 1.442064881324768, "rewards/rejected": -34.26948928833008, "step": 37030 }, { "epoch": 2.14, "grad_norm": 7.747476774966344e-05, "learning_rate": 0.00028615658500715977, "logits/chosen": -15.572553634643555, "logits/rejected": -15.543922424316406, "logps/chosen": -2789.719970703125, "logps/rejected": -2832.1015625, "loss": 2.9854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.31512451171875, "rewards/margins": 7.937071323394775, "rewards/rejected": -183.25218200683594, "step": 37040 }, { "epoch": 2.14, "grad_norm": 1.0459849268196555e-13, "learning_rate": 0.00028596307906652733, "logits/chosen": -16.601102828979492, "logits/rejected": -17.887359619140625, "logps/chosen": -2861.986328125, "logps/rejected": -2552.40625, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -111.82852935791016, "rewards/margins": 28.20551109313965, "rewards/rejected": -140.03402709960938, "step": 37050 }, { "epoch": 2.15, "grad_norm": 0.0001280632131965831, "learning_rate": 0.00028576957312589495, "logits/chosen": -15.790245056152344, "logits/rejected": -15.329612731933594, "logps/chosen": -2556.26611328125, "logps/rejected": -2597.031005859375, "loss": 1.1693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.73419189453125, "rewards/margins": 7.959323883056641, "rewards/rejected": -195.69351196289062, "step": 37060 }, { "epoch": 2.15, "grad_norm": 5.119093202665681e-06, "learning_rate": 0.00028557606718526257, "logits/chosen": -16.58761215209961, "logits/rejected": -18.775386810302734, "logps/chosen": -3054.446533203125, "logps/rejected": -3104.616455078125, "loss": 1.2673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -167.7025146484375, "rewards/margins": 9.681034088134766, "rewards/rejected": -177.38356018066406, "step": 37070 }, { "epoch": 2.15, "grad_norm": 105.271240234375, "learning_rate": 0.0002853825612446302, "logits/chosen": -15.464808464050293, "logits/rejected": -15.177976608276367, "logps/chosen": -2782.451171875, "logps/rejected": -2924.5205078125, "loss": 6.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.87631225585938, "rewards/margins": 0.7991188764572144, "rewards/rejected": -154.67544555664062, "step": 37080 }, { "epoch": 2.15, "grad_norm": 14.38634204864502, "learning_rate": 0.00028518905530399786, "logits/chosen": -15.432241439819336, "logits/rejected": -17.177635192871094, "logps/chosen": -2827.210693359375, "logps/rejected": -2842.09912109375, "loss": 6.6074, "rewards/accuracies": 0.5, "rewards/chosen": -181.4276885986328, "rewards/margins": -3.2906525135040283, "rewards/rejected": -178.13702392578125, "step": 37090 }, { "epoch": 2.15, "grad_norm": 7.417700784038648e-18, "learning_rate": 0.00028499554936336547, "logits/chosen": -14.996932983398438, "logits/rejected": -16.11956214904785, "logps/chosen": -3203.58642578125, "logps/rejected": -2987.02099609375, "loss": 2.1116, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -172.74063110351562, "rewards/margins": 6.385845184326172, "rewards/rejected": -179.12648010253906, "step": 37100 }, { "epoch": 2.15, "grad_norm": 174.77073669433594, "learning_rate": 0.0002848020434227331, "logits/chosen": -13.706766128540039, "logits/rejected": -13.085174560546875, "logps/chosen": -3100.96826171875, "logps/rejected": -3208.736328125, "loss": 3.1287, "rewards/accuracies": 0.5, "rewards/chosen": -171.0166473388672, "rewards/margins": 3.6286797523498535, "rewards/rejected": -174.6453399658203, "step": 37110 }, { "epoch": 2.15, "grad_norm": 0.010307533666491508, "learning_rate": 0.0002846085374821007, "logits/chosen": -14.896690368652344, "logits/rejected": -15.152952194213867, "logps/chosen": -2900.42529296875, "logps/rejected": -2566.115966796875, "loss": 3.131, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -125.57389831542969, "rewards/margins": 12.774083137512207, "rewards/rejected": -138.34799194335938, "step": 37120 }, { "epoch": 2.15, "grad_norm": 0.10723384469747543, "learning_rate": 0.0002844150315414683, "logits/chosen": -14.5938720703125, "logits/rejected": -14.905645370483398, "logps/chosen": -2660.5771484375, "logps/rejected": -2994.170166015625, "loss": 4.431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.94241333007812, "rewards/margins": 4.643710136413574, "rewards/rejected": -178.58612060546875, "step": 37130 }, { "epoch": 2.15, "grad_norm": 97.43265533447266, "learning_rate": 0.000284221525600836, "logits/chosen": -14.977783203125, "logits/rejected": -15.368499755859375, "logps/chosen": -3374.704345703125, "logps/rejected": -3342.21435546875, "loss": 2.2111, "rewards/accuracies": 0.5, "rewards/chosen": -195.25030517578125, "rewards/margins": 8.197636604309082, "rewards/rejected": -203.4479522705078, "step": 37140 }, { "epoch": 2.15, "grad_norm": 2.24284679717357e-08, "learning_rate": 0.00028402801966020356, "logits/chosen": -13.812357902526855, "logits/rejected": -13.519735336303711, "logps/chosen": -3235.696044921875, "logps/rejected": -3141.4296875, "loss": 2.8926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.26123046875, "rewards/margins": 4.2044782638549805, "rewards/rejected": -177.4656982421875, "step": 37150 }, { "epoch": 2.15, "grad_norm": 0.00039059919072315097, "learning_rate": 0.0002838345137195712, "logits/chosen": -14.421687126159668, "logits/rejected": -15.916595458984375, "logps/chosen": -3037.18994140625, "logps/rejected": -2733.49951171875, "loss": 3.8051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.7052001953125, "rewards/margins": 1.3673303127288818, "rewards/rejected": -195.072509765625, "step": 37160 }, { "epoch": 2.15, "grad_norm": 115.31206512451172, "learning_rate": 0.0002836410077789388, "logits/chosen": -18.085908889770508, "logits/rejected": -17.213645935058594, "logps/chosen": -2896.611328125, "logps/rejected": -2769.192626953125, "loss": 3.5436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -220.9251251220703, "rewards/margins": 4.2916059494018555, "rewards/rejected": -225.2167510986328, "step": 37170 }, { "epoch": 2.15, "grad_norm": 0.0008732926216907799, "learning_rate": 0.0002834475018383064, "logits/chosen": -15.180888175964355, "logits/rejected": -16.616397857666016, "logps/chosen": -3016.660400390625, "logps/rejected": -2984.346435546875, "loss": 0.8616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -255.0293426513672, "rewards/margins": 11.563733100891113, "rewards/rejected": -266.59307861328125, "step": 37180 }, { "epoch": 2.15, "grad_norm": 0.0002520253765396774, "learning_rate": 0.0002832539958976741, "logits/chosen": -12.885168075561523, "logits/rejected": -14.050933837890625, "logps/chosen": -2998.292724609375, "logps/rejected": -2713.37109375, "loss": 9.8041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -120.61015319824219, "rewards/margins": -1.4801948070526123, "rewards/rejected": -119.12995910644531, "step": 37190 }, { "epoch": 2.15, "grad_norm": 16.02829933166504, "learning_rate": 0.0002830604899570417, "logits/chosen": -13.980230331420898, "logits/rejected": -14.102783203125, "logps/chosen": -3233.08740234375, "logps/rejected": -3027.34130859375, "loss": 3.6411, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -123.10243225097656, "rewards/margins": 7.486297607421875, "rewards/rejected": -130.58872985839844, "step": 37200 }, { "epoch": 2.15, "grad_norm": 1.2518122503024642e-06, "learning_rate": 0.0002828669840164093, "logits/chosen": -16.17247772216797, "logits/rejected": -16.4527530670166, "logps/chosen": -2644.76611328125, "logps/rejected": -2485.28759765625, "loss": 2.7469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -102.7558822631836, "rewards/margins": 44.89383316040039, "rewards/rejected": -147.6497039794922, "step": 37210 }, { "epoch": 2.15, "grad_norm": 51.2669563293457, "learning_rate": 0.00028267347807577693, "logits/chosen": -12.218810081481934, "logits/rejected": -12.337130546569824, "logps/chosen": -2671.471923828125, "logps/rejected": -2866.98486328125, "loss": 7.0274, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -108.83055114746094, "rewards/margins": -2.705138683319092, "rewards/rejected": -106.12541198730469, "step": 37220 }, { "epoch": 2.16, "grad_norm": 0.001356010208837688, "learning_rate": 0.00028247997213514455, "logits/chosen": -14.797407150268555, "logits/rejected": -15.848788261413574, "logps/chosen": -2950.65869140625, "logps/rejected": -2658.0849609375, "loss": 1.4423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.57135009765625, "rewards/margins": 9.770705223083496, "rewards/rejected": -159.34207153320312, "step": 37230 }, { "epoch": 2.16, "grad_norm": 0.00015349077875725925, "learning_rate": 0.0002822864661945122, "logits/chosen": -10.412398338317871, "logits/rejected": -10.439398765563965, "logps/chosen": -3433.082763671875, "logps/rejected": -3197.391845703125, "loss": 6.2136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -74.49026489257812, "rewards/margins": 8.236343383789062, "rewards/rejected": -82.72660827636719, "step": 37240 }, { "epoch": 2.16, "grad_norm": 0.01628163456916809, "learning_rate": 0.00028209296025387984, "logits/chosen": -15.654802322387695, "logits/rejected": -16.50342559814453, "logps/chosen": -3047.840087890625, "logps/rejected": -2512.93798828125, "loss": 17.9184, "rewards/accuracies": 0.5, "rewards/chosen": -166.14151000976562, "rewards/margins": -8.533708572387695, "rewards/rejected": -157.60781860351562, "step": 37250 }, { "epoch": 2.16, "grad_norm": 0.0014185219770297408, "learning_rate": 0.0002818994543132474, "logits/chosen": -13.603571891784668, "logits/rejected": -14.489109992980957, "logps/chosen": -2902.82177734375, "logps/rejected": -2671.71044921875, "loss": 4.6766, "rewards/accuracies": 0.5, "rewards/chosen": -159.12222290039062, "rewards/margins": 2.70317006111145, "rewards/rejected": -161.82540893554688, "step": 37260 }, { "epoch": 2.16, "grad_norm": 0.0002646481152623892, "learning_rate": 0.000281705948372615, "logits/chosen": -16.55955696105957, "logits/rejected": -17.994651794433594, "logps/chosen": -2798.735107421875, "logps/rejected": -3080.17529296875, "loss": 2.2614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.09103393554688, "rewards/margins": 10.384614944458008, "rewards/rejected": -165.4756622314453, "step": 37270 }, { "epoch": 2.16, "grad_norm": 0.5094397664070129, "learning_rate": 0.00028151244243198264, "logits/chosen": -12.498411178588867, "logits/rejected": -12.849186897277832, "logps/chosen": -2878.0751953125, "logps/rejected": -2697.72216796875, "loss": 1.9939, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.23091125488281, "rewards/margins": 12.701370239257812, "rewards/rejected": -100.93228912353516, "step": 37280 }, { "epoch": 2.16, "grad_norm": 49.0980339050293, "learning_rate": 0.00028131893649135025, "logits/chosen": -15.378756523132324, "logits/rejected": -15.206796646118164, "logps/chosen": -2624.41455078125, "logps/rejected": -2717.146240234375, "loss": 11.075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -162.65560913085938, "rewards/margins": -9.856290817260742, "rewards/rejected": -152.79933166503906, "step": 37290 }, { "epoch": 2.16, "grad_norm": 103.14277648925781, "learning_rate": 0.0002811254305507179, "logits/chosen": -18.362443923950195, "logits/rejected": -19.05385971069336, "logps/chosen": -2392.050048828125, "logps/rejected": -2423.99072265625, "loss": 1.2769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.31039428710938, "rewards/margins": 7.964500427246094, "rewards/rejected": -218.27490234375, "step": 37300 }, { "epoch": 2.16, "grad_norm": 50.51082992553711, "learning_rate": 0.00028093192461008554, "logits/chosen": -15.013792037963867, "logits/rejected": -18.640987396240234, "logps/chosen": -2709.115234375, "logps/rejected": -2684.40185546875, "loss": 0.7159, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -164.14334106445312, "rewards/margins": 8.162237167358398, "rewards/rejected": -172.3055877685547, "step": 37310 }, { "epoch": 2.16, "grad_norm": 22.374950408935547, "learning_rate": 0.00028073841866945316, "logits/chosen": -15.938840866088867, "logits/rejected": -16.403751373291016, "logps/chosen": -2672.628662109375, "logps/rejected": -2636.30419921875, "loss": 3.712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.2053985595703, "rewards/margins": 1.1697555780410767, "rewards/rejected": -169.3751678466797, "step": 37320 }, { "epoch": 2.16, "grad_norm": 0.0005790701252408326, "learning_rate": 0.0002805449127288208, "logits/chosen": -16.16090965270996, "logits/rejected": -17.38771629333496, "logps/chosen": -2749.1611328125, "logps/rejected": -2363.56640625, "loss": 16.593, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -157.30059814453125, "rewards/margins": -11.686318397521973, "rewards/rejected": -145.61428833007812, "step": 37330 }, { "epoch": 2.16, "grad_norm": 45.49223327636719, "learning_rate": 0.0002803514067881884, "logits/chosen": -16.524486541748047, "logits/rejected": -16.288589477539062, "logps/chosen": -2817.454833984375, "logps/rejected": -2783.02880859375, "loss": 1.2986, "rewards/accuracies": 0.5, "rewards/chosen": -148.53427124023438, "rewards/margins": 5.883847236633301, "rewards/rejected": -154.41810607910156, "step": 37340 }, { "epoch": 2.16, "grad_norm": 63.927486419677734, "learning_rate": 0.00028015790084755606, "logits/chosen": -15.917523384094238, "logits/rejected": -16.029293060302734, "logps/chosen": -2886.038818359375, "logps/rejected": -2717.795654296875, "loss": 7.8671, "rewards/accuracies": 0.5, "rewards/chosen": -199.10226440429688, "rewards/margins": -1.314215064048767, "rewards/rejected": -197.78805541992188, "step": 37350 }, { "epoch": 2.16, "grad_norm": 1.921550886174728e-16, "learning_rate": 0.0002799643949069237, "logits/chosen": -14.117413520812988, "logits/rejected": -15.668996810913086, "logps/chosen": -3258.9482421875, "logps/rejected": -2929.612548828125, "loss": 8.9655, "rewards/accuracies": 0.5, "rewards/chosen": -181.0161895751953, "rewards/margins": 3.115384578704834, "rewards/rejected": -184.13156127929688, "step": 37360 }, { "epoch": 2.16, "grad_norm": 4.299393415070629e-13, "learning_rate": 0.00027977088896629124, "logits/chosen": -14.042485237121582, "logits/rejected": -14.26280403137207, "logps/chosen": -2900.764892578125, "logps/rejected": -3041.611572265625, "loss": 6.5797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.03079223632812, "rewards/margins": 5.982824802398682, "rewards/rejected": -147.0136260986328, "step": 37370 }, { "epoch": 2.16, "grad_norm": 17.694934844970703, "learning_rate": 0.00027957738302565886, "logits/chosen": -16.49045181274414, "logits/rejected": -16.043516159057617, "logps/chosen": -2778.853271484375, "logps/rejected": -2919.90771484375, "loss": 0.6771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -184.50965881347656, "rewards/margins": 9.726499557495117, "rewards/rejected": -194.23617553710938, "step": 37380 }, { "epoch": 2.16, "grad_norm": 1.0569632280052326e-20, "learning_rate": 0.0002793838770850265, "logits/chosen": -15.355443000793457, "logits/rejected": -16.06972885131836, "logps/chosen": -2922.7177734375, "logps/rejected": -2670.033203125, "loss": 1.07, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -219.72314453125, "rewards/margins": 12.355656623840332, "rewards/rejected": -232.0788116455078, "step": 37390 }, { "epoch": 2.16, "grad_norm": 9.585182851878926e-06, "learning_rate": 0.00027919037114439415, "logits/chosen": -11.991995811462402, "logits/rejected": -12.552704811096191, "logps/chosen": -2741.380859375, "logps/rejected": -3238.9814453125, "loss": 4.7347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.0044174194336, "rewards/margins": -0.11796226352453232, "rewards/rejected": -108.8864517211914, "step": 37400 }, { "epoch": 2.17, "grad_norm": 0.0011586040491238236, "learning_rate": 0.00027899686520376177, "logits/chosen": -10.414361953735352, "logits/rejected": -10.262458801269531, "logps/chosen": -3650.533935546875, "logps/rejected": -3511.8515625, "loss": 5.0945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -103.93269348144531, "rewards/margins": 1.3877894878387451, "rewards/rejected": -105.32048034667969, "step": 37410 }, { "epoch": 2.17, "grad_norm": 0.010180925950407982, "learning_rate": 0.0002788033592631294, "logits/chosen": -12.995463371276855, "logits/rejected": -12.976997375488281, "logps/chosen": -2806.70849609375, "logps/rejected": -3039.50634765625, "loss": 1.2738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -110.83099365234375, "rewards/margins": 17.760366439819336, "rewards/rejected": -128.59136962890625, "step": 37420 }, { "epoch": 2.17, "grad_norm": 55.36363220214844, "learning_rate": 0.000278609853322497, "logits/chosen": -17.448238372802734, "logits/rejected": -17.29358673095703, "logps/chosen": -2651.551513671875, "logps/rejected": -2716.44775390625, "loss": 11.1022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.16259765625, "rewards/margins": -2.017589569091797, "rewards/rejected": -203.14498901367188, "step": 37430 }, { "epoch": 2.17, "grad_norm": 2.1994290364091285e-05, "learning_rate": 0.0002784163473818646, "logits/chosen": -14.629796981811523, "logits/rejected": -15.792500495910645, "logps/chosen": -2987.59716796875, "logps/rejected": -2987.931640625, "loss": 10.6548, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.55308532714844, "rewards/margins": 1.0344101190567017, "rewards/rejected": -174.58749389648438, "step": 37440 }, { "epoch": 2.17, "grad_norm": 4.655731417659839e-15, "learning_rate": 0.0002782228414412323, "logits/chosen": -12.520715713500977, "logits/rejected": -12.915972709655762, "logps/chosen": -3084.86279296875, "logps/rejected": -3021.768798828125, "loss": 2.7094, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -138.11056518554688, "rewards/margins": 3.629692792892456, "rewards/rejected": -141.74026489257812, "step": 37450 }, { "epoch": 2.17, "grad_norm": 4.177753925323486, "learning_rate": 0.0002780293355005999, "logits/chosen": -10.753519058227539, "logits/rejected": -11.171732902526855, "logps/chosen": -3448.84912109375, "logps/rejected": -2743.92431640625, "loss": 7.4878, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -102.04966735839844, "rewards/margins": -4.689920425415039, "rewards/rejected": -97.35974884033203, "step": 37460 }, { "epoch": 2.17, "grad_norm": 586.152587890625, "learning_rate": 0.0002778358295599675, "logits/chosen": -15.234663009643555, "logits/rejected": -16.1063232421875, "logps/chosen": -3369.872314453125, "logps/rejected": -2979.4912109375, "loss": 7.809, "rewards/accuracies": 0.5, "rewards/chosen": -154.5088653564453, "rewards/margins": 0.295846164226532, "rewards/rejected": -154.80470275878906, "step": 37470 }, { "epoch": 2.17, "grad_norm": 1.5853855609893799, "learning_rate": 0.0002776423236193351, "logits/chosen": -17.88448715209961, "logits/rejected": -18.889263153076172, "logps/chosen": -2817.412841796875, "logps/rejected": -2373.675537109375, "loss": 0.1364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -180.35313415527344, "rewards/margins": 20.19816017150879, "rewards/rejected": -200.55130004882812, "step": 37480 }, { "epoch": 2.17, "grad_norm": 13.453152656555176, "learning_rate": 0.0002774488176787027, "logits/chosen": -16.491844177246094, "logits/rejected": -16.496967315673828, "logps/chosen": -3099.593017578125, "logps/rejected": -2973.91162109375, "loss": 1.1803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.81320190429688, "rewards/margins": 2.385336399078369, "rewards/rejected": -176.19854736328125, "step": 37490 }, { "epoch": 2.17, "grad_norm": 0.02545284852385521, "learning_rate": 0.0002772553117380703, "logits/chosen": -13.896051406860352, "logits/rejected": -14.493646621704102, "logps/chosen": -3484.059814453125, "logps/rejected": -3029.19091796875, "loss": 0.3563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -91.77046203613281, "rewards/margins": 12.790714263916016, "rewards/rejected": -104.5611801147461, "step": 37500 }, { "epoch": 2.17, "grad_norm": 17.535722732543945, "learning_rate": 0.000277061805797438, "logits/chosen": -21.666826248168945, "logits/rejected": -21.292003631591797, "logps/chosen": -2766.610595703125, "logps/rejected": -2970.593994140625, "loss": 2.4863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.0287628173828, "rewards/margins": 9.259549140930176, "rewards/rejected": -199.28829956054688, "step": 37510 }, { "epoch": 2.17, "grad_norm": 1.3928222415415803e-06, "learning_rate": 0.0002768682998568056, "logits/chosen": -22.194862365722656, "logits/rejected": -20.15032386779785, "logps/chosen": -2677.880126953125, "logps/rejected": -2478.991455078125, "loss": 3.28, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -207.31643676757812, "rewards/margins": 10.104344367980957, "rewards/rejected": -217.42074584960938, "step": 37520 }, { "epoch": 2.17, "grad_norm": 0.0042377645149827, "learning_rate": 0.00027667479391617323, "logits/chosen": -17.995054244995117, "logits/rejected": -21.203487396240234, "logps/chosen": -3308.26708984375, "logps/rejected": -3035.12353515625, "loss": 2.67, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -144.45718383789062, "rewards/margins": 14.856157302856445, "rewards/rejected": -159.3133544921875, "step": 37530 }, { "epoch": 2.17, "grad_norm": 3.0271973372464345e-08, "learning_rate": 0.00027648128797554085, "logits/chosen": -15.732378959655762, "logits/rejected": -16.335363388061523, "logps/chosen": -3107.69287109375, "logps/rejected": -2819.210205078125, "loss": 3.9515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -91.60365295410156, "rewards/margins": 8.232650756835938, "rewards/rejected": -99.83628845214844, "step": 37540 }, { "epoch": 2.17, "grad_norm": 55.82551193237305, "learning_rate": 0.00027628778203490846, "logits/chosen": -21.638023376464844, "logits/rejected": -20.08597755432129, "logps/chosen": -3169.274658203125, "logps/rejected": -3330.35546875, "loss": 9.98, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -236.35769653320312, "rewards/margins": 14.91368579864502, "rewards/rejected": -251.27139282226562, "step": 37550 }, { "epoch": 2.17, "grad_norm": 2.289542635480757e-07, "learning_rate": 0.00027609427609427613, "logits/chosen": -18.856685638427734, "logits/rejected": -20.2524356842041, "logps/chosen": -3480.440673828125, "logps/rejected": -3027.788818359375, "loss": 9.9413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -229.8862762451172, "rewards/margins": 1.047864556312561, "rewards/rejected": -230.9341583251953, "step": 37560 }, { "epoch": 2.17, "grad_norm": 3.556676042679475e-19, "learning_rate": 0.00027590077015364375, "logits/chosen": -18.4061336517334, "logits/rejected": -20.886964797973633, "logps/chosen": -2939.57080078125, "logps/rejected": -2807.421875, "loss": 3.9454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.400634765625, "rewards/margins": 4.7545599937438965, "rewards/rejected": -192.1551971435547, "step": 37570 }, { "epoch": 2.18, "grad_norm": 53.66639709472656, "learning_rate": 0.00027570726421301137, "logits/chosen": -13.874380111694336, "logits/rejected": -14.5879545211792, "logps/chosen": -3003.17822265625, "logps/rejected": -2832.5927734375, "loss": 1.4793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -91.91758728027344, "rewards/margins": 15.00848388671875, "rewards/rejected": -106.92607116699219, "step": 37580 }, { "epoch": 2.18, "grad_norm": 0.00029187719337642193, "learning_rate": 0.00027551375827237893, "logits/chosen": -15.589266777038574, "logits/rejected": -16.28641128540039, "logps/chosen": -2663.39306640625, "logps/rejected": -2698.578857421875, "loss": 2.1345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -114.7069320678711, "rewards/margins": 7.846646308898926, "rewards/rejected": -122.5535888671875, "step": 37590 }, { "epoch": 2.18, "grad_norm": 87.19440460205078, "learning_rate": 0.00027532025233174655, "logits/chosen": -15.786544799804688, "logits/rejected": -16.056406021118164, "logps/chosen": -3101.88134765625, "logps/rejected": -2900.30322265625, "loss": 1.998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.93878173828125, "rewards/margins": 12.397636413574219, "rewards/rejected": -164.33642578125, "step": 37600 }, { "epoch": 2.18, "grad_norm": 3.2269210350932553e-05, "learning_rate": 0.0002751267463911142, "logits/chosen": -14.731704711914062, "logits/rejected": -14.538156509399414, "logps/chosen": -3036.17578125, "logps/rejected": -3038.47900390625, "loss": 2.3116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -169.6028289794922, "rewards/margins": 8.819365501403809, "rewards/rejected": -178.42222595214844, "step": 37610 }, { "epoch": 2.18, "grad_norm": 0.229373037815094, "learning_rate": 0.00027493324045048184, "logits/chosen": -19.835397720336914, "logits/rejected": -20.71807289123535, "logps/chosen": -2640.083740234375, "logps/rejected": -2878.017578125, "loss": 0.0716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -232.39376831054688, "rewards/margins": 25.239078521728516, "rewards/rejected": -257.63287353515625, "step": 37620 }, { "epoch": 2.18, "grad_norm": 0.034007199108600616, "learning_rate": 0.00027473973450984945, "logits/chosen": -16.159194946289062, "logits/rejected": -21.219804763793945, "logps/chosen": -2809.724609375, "logps/rejected": -2820.117431640625, "loss": 14.3179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -220.03512573242188, "rewards/margins": -2.2807350158691406, "rewards/rejected": -217.75442504882812, "step": 37630 }, { "epoch": 2.18, "grad_norm": 0.0067548248916864395, "learning_rate": 0.00027454622856921707, "logits/chosen": -17.874876022338867, "logits/rejected": -17.999095916748047, "logps/chosen": -2691.241455078125, "logps/rejected": -2656.2451171875, "loss": 4.8929, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -170.96197509765625, "rewards/margins": -2.4896233081817627, "rewards/rejected": -168.4723663330078, "step": 37640 }, { "epoch": 2.18, "grad_norm": 0.0, "learning_rate": 0.0002743527226285847, "logits/chosen": -15.340200424194336, "logits/rejected": -15.338624954223633, "logps/chosen": -2948.75537109375, "logps/rejected": -2820.45556640625, "loss": 1.8397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.4454345703125, "rewards/margins": 14.188692092895508, "rewards/rejected": -167.63412475585938, "step": 37650 }, { "epoch": 2.18, "grad_norm": 4.710252370435097e-13, "learning_rate": 0.00027415921668795236, "logits/chosen": -17.886600494384766, "logits/rejected": -17.661975860595703, "logps/chosen": -3453.85302734375, "logps/rejected": -3119.67333984375, "loss": 5.5419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.8439178466797, "rewards/margins": 2.5736796855926514, "rewards/rejected": -172.41757202148438, "step": 37660 }, { "epoch": 2.18, "grad_norm": 0.014646296389400959, "learning_rate": 0.00027396571074732, "logits/chosen": -17.425395965576172, "logits/rejected": -17.669843673706055, "logps/chosen": -3208.35546875, "logps/rejected": -3430.692138671875, "loss": 9.4369, "rewards/accuracies": 0.5, "rewards/chosen": -228.1450653076172, "rewards/margins": 0.5498321652412415, "rewards/rejected": -228.6949005126953, "step": 37670 }, { "epoch": 2.18, "grad_norm": 523.8096923828125, "learning_rate": 0.0002737722048066876, "logits/chosen": -16.10240364074707, "logits/rejected": -16.51565933227539, "logps/chosen": -2801.93408203125, "logps/rejected": -2620.54833984375, "loss": 4.3186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -159.45542907714844, "rewards/margins": 3.3824639320373535, "rewards/rejected": -162.837890625, "step": 37680 }, { "epoch": 2.18, "grad_norm": 0.0, "learning_rate": 0.0002735786988660552, "logits/chosen": -13.657916069030762, "logits/rejected": -14.055376052856445, "logps/chosen": -3026.59326171875, "logps/rejected": -3359.524169921875, "loss": 2.4347, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -99.35041809082031, "rewards/margins": 19.78713607788086, "rewards/rejected": -119.1375503540039, "step": 37690 }, { "epoch": 2.18, "grad_norm": 0.0022461693733930588, "learning_rate": 0.0002733851929254228, "logits/chosen": -16.09865951538086, "logits/rejected": -16.49260711669922, "logps/chosen": -2851.536376953125, "logps/rejected": -3011.647705078125, "loss": 2.8179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.94265747070312, "rewards/margins": 5.125217437744141, "rewards/rejected": -193.06785583496094, "step": 37700 }, { "epoch": 2.18, "grad_norm": 0.0, "learning_rate": 0.00027319168698479045, "logits/chosen": -18.030405044555664, "logits/rejected": -18.501422882080078, "logps/chosen": -2688.2861328125, "logps/rejected": -2524.27294921875, "loss": 2.2529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.6514892578125, "rewards/margins": 19.586624145507812, "rewards/rejected": -220.23812866210938, "step": 37710 }, { "epoch": 2.18, "grad_norm": 5.345572162696044e-07, "learning_rate": 0.00027299818104415806, "logits/chosen": -15.11681842803955, "logits/rejected": -17.647476196289062, "logps/chosen": -3184.807373046875, "logps/rejected": -3058.0966796875, "loss": 2.1736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.7424774169922, "rewards/margins": 13.804837226867676, "rewards/rejected": -160.54733276367188, "step": 37720 }, { "epoch": 2.18, "grad_norm": 0.03557395935058594, "learning_rate": 0.0002728046751035257, "logits/chosen": -14.048274040222168, "logits/rejected": -13.769558906555176, "logps/chosen": -3281.9140625, "logps/rejected": -2924.648193359375, "loss": 1.9427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.9564208984375, "rewards/margins": 7.568607330322266, "rewards/rejected": -117.5250244140625, "step": 37730 }, { "epoch": 2.18, "grad_norm": 7.041840581223369e-05, "learning_rate": 0.0002726111691628933, "logits/chosen": -20.69467544555664, "logits/rejected": -20.164020538330078, "logps/chosen": -2737.5859375, "logps/rejected": -2868.325927734375, "loss": 3.6243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -248.5862274169922, "rewards/margins": 7.059401035308838, "rewards/rejected": -255.64566040039062, "step": 37740 }, { "epoch": 2.19, "grad_norm": 0.000908154877834022, "learning_rate": 0.0002724176632222609, "logits/chosen": -19.424867630004883, "logits/rejected": -21.992855072021484, "logps/chosen": -2788.286865234375, "logps/rejected": -2772.81201171875, "loss": 1.4503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -214.6480255126953, "rewards/margins": 8.556488037109375, "rewards/rejected": -223.2045135498047, "step": 37750 }, { "epoch": 2.19, "grad_norm": 5.130120772633973e-10, "learning_rate": 0.00027222415728162853, "logits/chosen": -18.647897720336914, "logits/rejected": -22.81403160095215, "logps/chosen": -3047.888671875, "logps/rejected": -2875.621337890625, "loss": 1.6631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.43441772460938, "rewards/margins": 10.322792053222656, "rewards/rejected": -190.75721740722656, "step": 37760 }, { "epoch": 2.19, "grad_norm": 54.54063034057617, "learning_rate": 0.0002720306513409962, "logits/chosen": -16.71504020690918, "logits/rejected": -14.644078254699707, "logps/chosen": -3253.53466796875, "logps/rejected": -3187.18505859375, "loss": 2.755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.99513244628906, "rewards/margins": 10.403210639953613, "rewards/rejected": -184.39834594726562, "step": 37770 }, { "epoch": 2.19, "grad_norm": 4.204081744188515e-13, "learning_rate": 0.0002718371454003638, "logits/chosen": -15.473213195800781, "logits/rejected": -15.76225757598877, "logps/chosen": -2959.93310546875, "logps/rejected": -2885.32470703125, "loss": 2.2117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.10928344726562, "rewards/margins": 7.646980285644531, "rewards/rejected": -201.7562713623047, "step": 37780 }, { "epoch": 2.19, "grad_norm": 12.605074882507324, "learning_rate": 0.00027164363945973144, "logits/chosen": -18.49380111694336, "logits/rejected": -19.163280487060547, "logps/chosen": -2894.905029296875, "logps/rejected": -2816.251708984375, "loss": 17.4505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -164.9113311767578, "rewards/margins": -14.796628952026367, "rewards/rejected": -150.11471557617188, "step": 37790 }, { "epoch": 2.19, "grad_norm": 6.154408038128167e-05, "learning_rate": 0.00027145013351909905, "logits/chosen": -19.529935836791992, "logits/rejected": -20.927906036376953, "logps/chosen": -2711.951171875, "logps/rejected": -2870.982421875, "loss": 2.9527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.1367950439453, "rewards/margins": 7.6517486572265625, "rewards/rejected": -169.78854370117188, "step": 37800 }, { "epoch": 2.19, "grad_norm": 74.71420288085938, "learning_rate": 0.0002712566275784666, "logits/chosen": -14.80744457244873, "logits/rejected": -14.506898880004883, "logps/chosen": -2919.02001953125, "logps/rejected": -2891.510986328125, "loss": 4.6414, "rewards/accuracies": 0.5, "rewards/chosen": -142.63040161132812, "rewards/margins": -0.4764065742492676, "rewards/rejected": -142.15399169921875, "step": 37810 }, { "epoch": 2.19, "grad_norm": 0.27680647373199463, "learning_rate": 0.0002710631216378343, "logits/chosen": -16.549869537353516, "logits/rejected": -17.197307586669922, "logps/chosen": -3006.482421875, "logps/rejected": -2939.99609375, "loss": 4.2773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -153.0810089111328, "rewards/margins": 8.731090545654297, "rewards/rejected": -161.81210327148438, "step": 37820 }, { "epoch": 2.19, "grad_norm": 75.97637176513672, "learning_rate": 0.0002708696156972019, "logits/chosen": -21.844114303588867, "logits/rejected": -21.094314575195312, "logps/chosen": -2833.814697265625, "logps/rejected": -2761.74609375, "loss": 5.5745, "rewards/accuracies": 0.5, "rewards/chosen": -187.59251403808594, "rewards/margins": 1.225101113319397, "rewards/rejected": -188.817626953125, "step": 37830 }, { "epoch": 2.19, "grad_norm": 21.304859161376953, "learning_rate": 0.0002706761097565695, "logits/chosen": -15.365200996398926, "logits/rejected": -15.619153022766113, "logps/chosen": -2853.116455078125, "logps/rejected": -2720.359375, "loss": 6.115, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -136.71812438964844, "rewards/margins": -3.7963576316833496, "rewards/rejected": -132.92178344726562, "step": 37840 }, { "epoch": 2.19, "grad_norm": 1.8079277651850134e-05, "learning_rate": 0.00027048260381593714, "logits/chosen": -16.29002571105957, "logits/rejected": -16.39630889892578, "logps/chosen": -2804.647216796875, "logps/rejected": -2568.18505859375, "loss": 0.6398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -137.1080780029297, "rewards/margins": 10.310789108276367, "rewards/rejected": -147.4188690185547, "step": 37850 }, { "epoch": 2.19, "grad_norm": 0.1383400559425354, "learning_rate": 0.00027028909787530476, "logits/chosen": -15.74548625946045, "logits/rejected": -16.41140365600586, "logps/chosen": -2595.866455078125, "logps/rejected": -2748.1533203125, "loss": 4.4403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -186.12786865234375, "rewards/margins": 3.9153952598571777, "rewards/rejected": -190.04327392578125, "step": 37860 }, { "epoch": 2.19, "grad_norm": 2.9158266079409856e-11, "learning_rate": 0.00027009559193467243, "logits/chosen": -15.995549201965332, "logits/rejected": -16.182912826538086, "logps/chosen": -2790.619873046875, "logps/rejected": -2773.16845703125, "loss": 6.8588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.20016479492188, "rewards/margins": 1.8405754566192627, "rewards/rejected": -152.04074096679688, "step": 37870 }, { "epoch": 2.19, "grad_norm": 103.56556701660156, "learning_rate": 0.00026990208599404005, "logits/chosen": -20.242809295654297, "logits/rejected": -21.40683364868164, "logps/chosen": -2944.207275390625, "logps/rejected": -2915.10302734375, "loss": 6.2502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -263.9766845703125, "rewards/margins": 0.27773284912109375, "rewards/rejected": -264.2544250488281, "step": 37880 }, { "epoch": 2.19, "grad_norm": 0.00020391707948874682, "learning_rate": 0.00026970858005340766, "logits/chosen": -16.726804733276367, "logits/rejected": -18.033504486083984, "logps/chosen": -2987.373046875, "logps/rejected": -2998.536865234375, "loss": 2.1913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -182.51901245117188, "rewards/margins": 16.329242706298828, "rewards/rejected": -198.8482666015625, "step": 37890 }, { "epoch": 2.19, "grad_norm": 3.7770021776850626e-08, "learning_rate": 0.0002695150741127753, "logits/chosen": -16.831134796142578, "logits/rejected": -17.740406036376953, "logps/chosen": -3056.2294921875, "logps/rejected": -3054.242431640625, "loss": 1.306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.64865112304688, "rewards/margins": 9.065305709838867, "rewards/rejected": -219.7139892578125, "step": 37900 }, { "epoch": 2.19, "grad_norm": 1.5831615201022942e-06, "learning_rate": 0.0002693215681721429, "logits/chosen": -15.281911849975586, "logits/rejected": -17.120588302612305, "logps/chosen": -2755.44921875, "logps/rejected": -2531.697998046875, "loss": 0.1679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -144.70309448242188, "rewards/margins": 11.962240219116211, "rewards/rejected": -156.6653289794922, "step": 37910 }, { "epoch": 2.19, "grad_norm": 144.3910675048828, "learning_rate": 0.0002691280622315105, "logits/chosen": -17.310693740844727, "logits/rejected": -17.990703582763672, "logps/chosen": -2639.42919921875, "logps/rejected": -3005.03369140625, "loss": 4.4558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.45672607421875, "rewards/margins": 3.9607880115509033, "rewards/rejected": -168.41751098632812, "step": 37920 }, { "epoch": 2.2, "grad_norm": 0.00014613212260883301, "learning_rate": 0.00026893455629087813, "logits/chosen": -18.9162654876709, "logits/rejected": -20.873489379882812, "logps/chosen": -3089.442626953125, "logps/rejected": -3022.45361328125, "loss": 0.6031, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -148.91006469726562, "rewards/margins": 10.165438652038574, "rewards/rejected": -159.0754852294922, "step": 37930 }, { "epoch": 2.2, "grad_norm": 0.00015867239562794566, "learning_rate": 0.00026874105035024575, "logits/chosen": -16.72931480407715, "logits/rejected": -17.351757049560547, "logps/chosen": -3220.13037109375, "logps/rejected": -3071.598388671875, "loss": 1.2704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -202.1754608154297, "rewards/margins": 6.407814025878906, "rewards/rejected": -208.5832977294922, "step": 37940 }, { "epoch": 2.2, "grad_norm": 0.0010299981804564595, "learning_rate": 0.00026854754440961337, "logits/chosen": -16.47274398803711, "logits/rejected": -19.43926239013672, "logps/chosen": -3098.27783203125, "logps/rejected": -3055.504638671875, "loss": 1.6914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -168.2517547607422, "rewards/margins": 7.644667148590088, "rewards/rejected": -175.89642333984375, "step": 37950 }, { "epoch": 2.2, "grad_norm": 4.6880811055416416e-07, "learning_rate": 0.000268354038468981, "logits/chosen": -19.35305404663086, "logits/rejected": -20.586814880371094, "logps/chosen": -2945.766845703125, "logps/rejected": -2851.09716796875, "loss": 15.4708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -186.37879943847656, "rewards/margins": -6.174882411956787, "rewards/rejected": -180.2039031982422, "step": 37960 }, { "epoch": 2.2, "grad_norm": 7.147882996605404e-08, "learning_rate": 0.0002681605325283486, "logits/chosen": -17.462072372436523, "logits/rejected": -18.393442153930664, "logps/chosen": -2720.136962890625, "logps/rejected": -2521.4970703125, "loss": 1.6795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -138.06031799316406, "rewards/margins": 12.732852935791016, "rewards/rejected": -150.79318237304688, "step": 37970 }, { "epoch": 2.2, "grad_norm": 9.87951912864867e-16, "learning_rate": 0.00026796702658771627, "logits/chosen": -18.378986358642578, "logits/rejected": -18.936281204223633, "logps/chosen": -2980.29150390625, "logps/rejected": -2913.76318359375, "loss": 6.8104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.57110595703125, "rewards/margins": 5.271857261657715, "rewards/rejected": -206.8429718017578, "step": 37980 }, { "epoch": 2.2, "grad_norm": 47.51057052612305, "learning_rate": 0.0002677735206470839, "logits/chosen": -20.807422637939453, "logits/rejected": -20.763336181640625, "logps/chosen": -3262.339599609375, "logps/rejected": -2934.41455078125, "loss": 0.153, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -188.56320190429688, "rewards/margins": 17.691402435302734, "rewards/rejected": -206.25460815429688, "step": 37990 }, { "epoch": 2.2, "grad_norm": 2.0720070624535847e-12, "learning_rate": 0.0002675800147064515, "logits/chosen": -20.90353012084961, "logits/rejected": -19.423349380493164, "logps/chosen": -3078.019287109375, "logps/rejected": -2970.776611328125, "loss": 7.1761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.4595184326172, "rewards/margins": 7.824265956878662, "rewards/rejected": -184.28379821777344, "step": 38000 }, { "epoch": 2.2, "grad_norm": 1.6810251963761402e-06, "learning_rate": 0.0002673865087658191, "logits/chosen": -18.253087997436523, "logits/rejected": -17.42154884338379, "logps/chosen": -3416.09375, "logps/rejected": -3215.69091796875, "loss": 2.6953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -191.72024536132812, "rewards/margins": 11.675168991088867, "rewards/rejected": -203.39541625976562, "step": 38010 }, { "epoch": 2.2, "grad_norm": 106.18647766113281, "learning_rate": 0.00026719300282518674, "logits/chosen": -18.646499633789062, "logits/rejected": -22.298940658569336, "logps/chosen": -3099.45068359375, "logps/rejected": -3005.08642578125, "loss": 1.9191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -206.58035278320312, "rewards/margins": 11.648907661437988, "rewards/rejected": -218.22927856445312, "step": 38020 }, { "epoch": 2.2, "grad_norm": 8.806904792785645, "learning_rate": 0.00026699949688455436, "logits/chosen": -18.902606964111328, "logits/rejected": -19.214893341064453, "logps/chosen": -2741.59912109375, "logps/rejected": -2966.93798828125, "loss": 3.6012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.25379943847656, "rewards/margins": 16.773855209350586, "rewards/rejected": -182.0276641845703, "step": 38030 }, { "epoch": 2.2, "grad_norm": 0.07701615244150162, "learning_rate": 0.000266805990943922, "logits/chosen": -17.42171859741211, "logits/rejected": -18.474979400634766, "logps/chosen": -2903.688232421875, "logps/rejected": -2884.96435546875, "loss": 1.8833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -216.90847778320312, "rewards/margins": 5.912976264953613, "rewards/rejected": -222.82144165039062, "step": 38040 }, { "epoch": 2.2, "grad_norm": 0.011001084931194782, "learning_rate": 0.0002666124850032896, "logits/chosen": -13.866869926452637, "logits/rejected": -14.91380786895752, "logps/chosen": -2968.9599609375, "logps/rejected": -2796.44482421875, "loss": 13.5595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -103.83988952636719, "rewards/margins": -6.014101982116699, "rewards/rejected": -97.82579040527344, "step": 38050 }, { "epoch": 2.2, "grad_norm": 9.623844380257651e-05, "learning_rate": 0.0002664189790626572, "logits/chosen": -18.609630584716797, "logits/rejected": -19.093103408813477, "logps/chosen": -2305.92333984375, "logps/rejected": -2297.13232421875, "loss": 6.977, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -205.1648406982422, "rewards/margins": -2.3126511573791504, "rewards/rejected": -202.85220336914062, "step": 38060 }, { "epoch": 2.2, "grad_norm": 0.12082412838935852, "learning_rate": 0.00026622547312202483, "logits/chosen": -16.600425720214844, "logits/rejected": -17.203182220458984, "logps/chosen": -3097.448486328125, "logps/rejected": -2819.314453125, "loss": 5.0122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.7925567626953, "rewards/margins": 1.7227723598480225, "rewards/rejected": -169.51531982421875, "step": 38070 }, { "epoch": 2.2, "grad_norm": 0.0, "learning_rate": 0.0002660319671813925, "logits/chosen": -14.831883430480957, "logits/rejected": -14.765355110168457, "logps/chosen": -3153.39990234375, "logps/rejected": -2613.110595703125, "loss": 2.6275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -135.2009735107422, "rewards/margins": 16.93108558654785, "rewards/rejected": -152.13206481933594, "step": 38080 }, { "epoch": 2.2, "grad_norm": 1.2286632061004639, "learning_rate": 0.0002658384612407601, "logits/chosen": -18.61688804626465, "logits/rejected": -21.809547424316406, "logps/chosen": -2718.49853515625, "logps/rejected": -2737.13232421875, "loss": 0.7641, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -173.6548614501953, "rewards/margins": 13.08454704284668, "rewards/rejected": -186.73941040039062, "step": 38090 }, { "epoch": 2.21, "grad_norm": 0.15429528057575226, "learning_rate": 0.00026564495530012773, "logits/chosen": -15.001202583312988, "logits/rejected": -16.60430145263672, "logps/chosen": -2677.5634765625, "logps/rejected": -2648.47412109375, "loss": 3.06, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.79049682617188, "rewards/margins": 2.2131264209747314, "rewards/rejected": -162.0036163330078, "step": 38100 }, { "epoch": 2.21, "grad_norm": 2.2491378784179688, "learning_rate": 0.00026545144935949535, "logits/chosen": -13.431510925292969, "logits/rejected": -14.994305610656738, "logps/chosen": -3528.20849609375, "logps/rejected": -3337.56640625, "loss": 1.961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -59.7552490234375, "rewards/margins": 8.521714210510254, "rewards/rejected": -68.27696228027344, "step": 38110 }, { "epoch": 2.21, "grad_norm": 0.9506469964981079, "learning_rate": 0.00026525794341886297, "logits/chosen": -15.346694946289062, "logits/rejected": -14.433191299438477, "logps/chosen": -3150.243408203125, "logps/rejected": -2816.739501953125, "loss": 5.4728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -96.95384979248047, "rewards/margins": 14.801486015319824, "rewards/rejected": -111.75533294677734, "step": 38120 }, { "epoch": 2.21, "grad_norm": 8.732090606144993e-10, "learning_rate": 0.00026506443747823064, "logits/chosen": -17.461828231811523, "logits/rejected": -20.07730484008789, "logps/chosen": -3079.849853515625, "logps/rejected": -3017.94970703125, "loss": 0.6668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.33474731445312, "rewards/margins": 11.050803184509277, "rewards/rejected": -199.3855438232422, "step": 38130 }, { "epoch": 2.21, "grad_norm": 53.7440185546875, "learning_rate": 0.0002648709315375982, "logits/chosen": -19.688426971435547, "logits/rejected": -20.73639678955078, "logps/chosen": -2217.40478515625, "logps/rejected": -2423.087890625, "loss": 11.6654, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -193.0648651123047, "rewards/margins": -9.63501262664795, "rewards/rejected": -183.4298553466797, "step": 38140 }, { "epoch": 2.21, "grad_norm": 86.85553741455078, "learning_rate": 0.0002646774255969658, "logits/chosen": -16.9345760345459, "logits/rejected": -18.161943435668945, "logps/chosen": -2460.15576171875, "logps/rejected": -2484.578857421875, "loss": 3.561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.10842895507812, "rewards/margins": 11.120405197143555, "rewards/rejected": -218.2288360595703, "step": 38150 }, { "epoch": 2.21, "grad_norm": 0.00045567884808406234, "learning_rate": 0.00026448391965633344, "logits/chosen": -14.484837532043457, "logits/rejected": -16.17782211303711, "logps/chosen": -3168.341552734375, "logps/rejected": -2874.94775390625, "loss": 16.9143, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.1680145263672, "rewards/margins": 1.4948581457138062, "rewards/rejected": -181.6628875732422, "step": 38160 }, { "epoch": 2.21, "grad_norm": 120.9820556640625, "learning_rate": 0.00026429041371570105, "logits/chosen": -15.80693244934082, "logits/rejected": -16.36903953552246, "logps/chosen": -3055.764404296875, "logps/rejected": -3013.156982421875, "loss": 2.1134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -156.71092224121094, "rewards/margins": 7.6656036376953125, "rewards/rejected": -164.3765411376953, "step": 38170 }, { "epoch": 2.21, "grad_norm": 0.0004381677426863462, "learning_rate": 0.00026409690777506867, "logits/chosen": -13.518452644348145, "logits/rejected": -14.88427734375, "logps/chosen": -3090.90576171875, "logps/rejected": -3210.16650390625, "loss": 2.3918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.56951904296875, "rewards/margins": 11.111653327941895, "rewards/rejected": -183.68116760253906, "step": 38180 }, { "epoch": 2.21, "grad_norm": 87.15753936767578, "learning_rate": 0.00026390340183443634, "logits/chosen": -18.029233932495117, "logits/rejected": -18.226497650146484, "logps/chosen": -2802.45361328125, "logps/rejected": -2888.32958984375, "loss": 2.2209, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -146.55340576171875, "rewards/margins": 9.660295486450195, "rewards/rejected": -156.21371459960938, "step": 38190 }, { "epoch": 2.21, "grad_norm": 297.2491455078125, "learning_rate": 0.00026370989589380396, "logits/chosen": -16.618297576904297, "logits/rejected": -17.025691986083984, "logps/chosen": -3129.022216796875, "logps/rejected": -3532.154296875, "loss": 6.5212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.48019409179688, "rewards/margins": 12.756001472473145, "rewards/rejected": -177.23619079589844, "step": 38200 }, { "epoch": 2.21, "grad_norm": 0.06455344706773758, "learning_rate": 0.0002635163899531716, "logits/chosen": -15.037744522094727, "logits/rejected": -15.086074829101562, "logps/chosen": -3073.29833984375, "logps/rejected": -2681.286865234375, "loss": 4.3026, "rewards/accuracies": 0.5, "rewards/chosen": -134.97915649414062, "rewards/margins": 7.22036600112915, "rewards/rejected": -142.1995391845703, "step": 38210 }, { "epoch": 2.21, "grad_norm": 3.32153058052063, "learning_rate": 0.0002633228840125392, "logits/chosen": -16.661300659179688, "logits/rejected": -16.932544708251953, "logps/chosen": -3287.490234375, "logps/rejected": -2904.42041015625, "loss": 2.4576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -133.1405487060547, "rewards/margins": 17.397653579711914, "rewards/rejected": -150.53819274902344, "step": 38220 }, { "epoch": 2.21, "grad_norm": 0.0, "learning_rate": 0.0002631293780719068, "logits/chosen": -16.9365291595459, "logits/rejected": -20.83808708190918, "logps/chosen": -2911.758056640625, "logps/rejected": -3092.661376953125, "loss": 2.7993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -219.7530059814453, "rewards/margins": 26.52203369140625, "rewards/rejected": -246.27505493164062, "step": 38230 }, { "epoch": 2.21, "grad_norm": 2.6786665330291726e-06, "learning_rate": 0.0002629358721312745, "logits/chosen": -17.94540023803711, "logits/rejected": -17.919828414916992, "logps/chosen": -2729.776611328125, "logps/rejected": -2788.877197265625, "loss": 2.253, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -218.60067749023438, "rewards/margins": 7.04236364364624, "rewards/rejected": -225.64303588867188, "step": 38240 }, { "epoch": 2.21, "grad_norm": 2.8008057597617153e-06, "learning_rate": 0.00026274236619064204, "logits/chosen": -18.34713363647461, "logits/rejected": -19.880088806152344, "logps/chosen": -3118.00830078125, "logps/rejected": -3442.344482421875, "loss": 1.2202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -225.898193359375, "rewards/margins": 9.751474380493164, "rewards/rejected": -235.649658203125, "step": 38250 }, { "epoch": 2.21, "grad_norm": 2.9963715076446533, "learning_rate": 0.00026254886025000966, "logits/chosen": -18.690780639648438, "logits/rejected": -21.11615562438965, "logps/chosen": -3124.69482421875, "logps/rejected": -3104.34228515625, "loss": 3.7, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -224.5962371826172, "rewards/margins": 2.3103995323181152, "rewards/rejected": -226.9066619873047, "step": 38260 }, { "epoch": 2.22, "grad_norm": 2.0510665648537652e-13, "learning_rate": 0.0002623553543093773, "logits/chosen": -15.091119766235352, "logits/rejected": -15.24836540222168, "logps/chosen": -3016.37939453125, "logps/rejected": -2922.94384765625, "loss": 2.7428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.43679809570312, "rewards/margins": 7.521543025970459, "rewards/rejected": -148.95834350585938, "step": 38270 }, { "epoch": 2.22, "grad_norm": 5.205577373504639, "learning_rate": 0.0002621618483687449, "logits/chosen": -14.52568244934082, "logits/rejected": -17.760334014892578, "logps/chosen": -3230.5869140625, "logps/rejected": -2896.83642578125, "loss": 3.6783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.1325912475586, "rewards/margins": 1.8541101217269897, "rewards/rejected": -117.9866943359375, "step": 38280 }, { "epoch": 2.22, "grad_norm": 0.03360333293676376, "learning_rate": 0.00026196834242811257, "logits/chosen": -16.214641571044922, "logits/rejected": -16.050500869750977, "logps/chosen": -3071.004638671875, "logps/rejected": -2837.70556640625, "loss": 5.1213, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -115.8485107421875, "rewards/margins": -0.6252886056900024, "rewards/rejected": -115.22322845458984, "step": 38290 }, { "epoch": 2.22, "grad_norm": 44.96348190307617, "learning_rate": 0.0002617748364874802, "logits/chosen": -13.424662590026855, "logits/rejected": -13.580696105957031, "logps/chosen": -3083.629150390625, "logps/rejected": -2998.438720703125, "loss": 5.0426, "rewards/accuracies": 0.5, "rewards/chosen": -200.86151123046875, "rewards/margins": -1.0614149570465088, "rewards/rejected": -199.8000946044922, "step": 38300 }, { "epoch": 2.22, "grad_norm": 64.87934112548828, "learning_rate": 0.0002615813305468478, "logits/chosen": -15.879544258117676, "logits/rejected": -16.732868194580078, "logps/chosen": -2902.493896484375, "logps/rejected": -2890.685546875, "loss": 1.7617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -216.6195526123047, "rewards/margins": 7.163105010986328, "rewards/rejected": -223.78268432617188, "step": 38310 }, { "epoch": 2.22, "grad_norm": 5.290731906890869, "learning_rate": 0.0002613878246062154, "logits/chosen": -14.488525390625, "logits/rejected": -14.6217041015625, "logps/chosen": -2821.63720703125, "logps/rejected": -2845.636962890625, "loss": 0.5795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.3693389892578, "rewards/margins": 9.353022575378418, "rewards/rejected": -180.72235107421875, "step": 38320 }, { "epoch": 2.22, "grad_norm": 1.4413080862141214e-05, "learning_rate": 0.00026119431866558304, "logits/chosen": -13.8014497756958, "logits/rejected": -14.071748733520508, "logps/chosen": -3000.86279296875, "logps/rejected": -3090.86376953125, "loss": 3.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.69969177246094, "rewards/margins": 5.533003330230713, "rewards/rejected": -164.23268127441406, "step": 38330 }, { "epoch": 2.22, "grad_norm": 1.966830313904211e-05, "learning_rate": 0.0002610008127249507, "logits/chosen": -12.594293594360352, "logits/rejected": -12.646997451782227, "logps/chosen": -3299.84375, "logps/rejected": -3014.270751953125, "loss": 0.6754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -150.20523071289062, "rewards/margins": 12.967361450195312, "rewards/rejected": -163.17257690429688, "step": 38340 }, { "epoch": 2.22, "grad_norm": 9.701220449187531e-08, "learning_rate": 0.0002608073067843183, "logits/chosen": -13.71263599395752, "logits/rejected": -14.368341445922852, "logps/chosen": -3103.861328125, "logps/rejected": -2763.958740234375, "loss": 20.9721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.95697021484375, "rewards/margins": -7.648230075836182, "rewards/rejected": -159.30874633789062, "step": 38350 }, { "epoch": 2.22, "grad_norm": 4.904772758483887, "learning_rate": 0.0002606138008436859, "logits/chosen": -15.382466316223145, "logits/rejected": -15.408793449401855, "logps/chosen": -2560.961669921875, "logps/rejected": -2563.65478515625, "loss": 0.6947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -148.5189666748047, "rewards/margins": 6.132513046264648, "rewards/rejected": -154.65147399902344, "step": 38360 }, { "epoch": 2.22, "grad_norm": 3.42165113267132e-17, "learning_rate": 0.0002604202949030535, "logits/chosen": -12.204384803771973, "logits/rejected": -12.318540573120117, "logps/chosen": -3356.921875, "logps/rejected": -2914.908447265625, "loss": 2.2165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -101.41542053222656, "rewards/margins": 16.242006301879883, "rewards/rejected": -117.65742492675781, "step": 38370 }, { "epoch": 2.22, "grad_norm": 7.69877815246582, "learning_rate": 0.0002602267889624211, "logits/chosen": -15.573034286499023, "logits/rejected": -16.243520736694336, "logps/chosen": -2720.92431640625, "logps/rejected": -2709.935302734375, "loss": 3.6229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -237.43417358398438, "rewards/margins": 2.7701919078826904, "rewards/rejected": -240.2043914794922, "step": 38380 }, { "epoch": 2.22, "grad_norm": 0.00010351358650950715, "learning_rate": 0.00026003328302178874, "logits/chosen": -15.292806625366211, "logits/rejected": -15.73027515411377, "logps/chosen": -2892.44287109375, "logps/rejected": -2288.83740234375, "loss": 18.4598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.4093780517578, "rewards/margins": -12.446125030517578, "rewards/rejected": -142.96322631835938, "step": 38390 }, { "epoch": 2.22, "grad_norm": 7.01864280472364e-07, "learning_rate": 0.0002598397770811564, "logits/chosen": -13.48802375793457, "logits/rejected": -13.476930618286133, "logps/chosen": -2443.13232421875, "logps/rejected": -2105.780029296875, "loss": 7.8202, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -104.17769622802734, "rewards/margins": 0.09308032691478729, "rewards/rejected": -104.27076721191406, "step": 38400 }, { "epoch": 2.22, "grad_norm": 23.762052536010742, "learning_rate": 0.00025964627114052403, "logits/chosen": -18.752851486206055, "logits/rejected": -19.352720260620117, "logps/chosen": -2707.243408203125, "logps/rejected": -2714.78955078125, "loss": 2.9134, "rewards/accuracies": 0.5, "rewards/chosen": -230.0209503173828, "rewards/margins": 15.265820503234863, "rewards/rejected": -245.2867431640625, "step": 38410 }, { "epoch": 2.22, "grad_norm": 3.6099966340694294e-22, "learning_rate": 0.00025945276519989165, "logits/chosen": -16.912654876708984, "logits/rejected": -18.20496940612793, "logps/chosen": -2613.60107421875, "logps/rejected": -2577.198486328125, "loss": 5.0235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -216.7653350830078, "rewards/margins": 9.298308372497559, "rewards/rejected": -226.0636444091797, "step": 38420 }, { "epoch": 2.22, "grad_norm": 37.28097915649414, "learning_rate": 0.00025925925925925926, "logits/chosen": -14.414346694946289, "logits/rejected": -15.685111999511719, "logps/chosen": -2946.48583984375, "logps/rejected": -2906.250732421875, "loss": 2.1324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.90568542480469, "rewards/margins": 9.266045570373535, "rewards/rejected": -131.17172241210938, "step": 38430 }, { "epoch": 2.23, "grad_norm": 161.37649536132812, "learning_rate": 0.0002590657533186269, "logits/chosen": -14.390324592590332, "logits/rejected": -15.493448257446289, "logps/chosen": -3092.112548828125, "logps/rejected": -2661.151123046875, "loss": 9.263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -224.60787963867188, "rewards/margins": -1.6770817041397095, "rewards/rejected": -222.9307861328125, "step": 38440 }, { "epoch": 2.23, "grad_norm": 0.0026439426001161337, "learning_rate": 0.00025887224737799455, "logits/chosen": -15.445688247680664, "logits/rejected": -15.310880661010742, "logps/chosen": -3187.56005859375, "logps/rejected": -2571.83642578125, "loss": 0.7239, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -140.71939086914062, "rewards/margins": 11.87680435180664, "rewards/rejected": -152.59620666503906, "step": 38450 }, { "epoch": 2.23, "grad_norm": 4.376149263407569e-06, "learning_rate": 0.00025867874143736217, "logits/chosen": -16.069374084472656, "logits/rejected": -16.329334259033203, "logps/chosen": -3061.255126953125, "logps/rejected": -2795.95751953125, "loss": 4.661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.60488891601562, "rewards/margins": 10.005897521972656, "rewards/rejected": -174.61077880859375, "step": 38460 }, { "epoch": 2.23, "grad_norm": 60.21305465698242, "learning_rate": 0.00025848523549672973, "logits/chosen": -14.662760734558105, "logits/rejected": -14.448054313659668, "logps/chosen": -3011.817626953125, "logps/rejected": -2959.005859375, "loss": 2.0395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.01785278320312, "rewards/margins": 14.94300651550293, "rewards/rejected": -174.9608612060547, "step": 38470 }, { "epoch": 2.23, "grad_norm": 1.8762520551681519, "learning_rate": 0.00025829172955609735, "logits/chosen": -15.010807991027832, "logits/rejected": -15.162965774536133, "logps/chosen": -2614.33837890625, "logps/rejected": -2524.922607421875, "loss": 6.1331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.56663513183594, "rewards/margins": 2.3311283588409424, "rewards/rejected": -171.8977508544922, "step": 38480 }, { "epoch": 2.23, "grad_norm": 7.414317770848555e-11, "learning_rate": 0.00025809822361546497, "logits/chosen": -15.17859935760498, "logits/rejected": -15.564268112182617, "logps/chosen": -2815.072021484375, "logps/rejected": -2722.986328125, "loss": 4.1643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -175.15213012695312, "rewards/margins": 4.438906669616699, "rewards/rejected": -179.59103393554688, "step": 38490 }, { "epoch": 2.23, "grad_norm": 91.93389129638672, "learning_rate": 0.00025790471767483264, "logits/chosen": -14.537881851196289, "logits/rejected": -14.857081413269043, "logps/chosen": -2891.33251953125, "logps/rejected": -2844.711669921875, "loss": 1.7088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -131.27149963378906, "rewards/margins": 15.707555770874023, "rewards/rejected": -146.9790496826172, "step": 38500 }, { "epoch": 2.23, "grad_norm": 105.18247985839844, "learning_rate": 0.00025771121173420025, "logits/chosen": -16.665573120117188, "logits/rejected": -17.731355667114258, "logps/chosen": -3002.975341796875, "logps/rejected": -2951.723876953125, "loss": 1.2776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -219.5817108154297, "rewards/margins": 12.328109741210938, "rewards/rejected": -231.90982055664062, "step": 38510 }, { "epoch": 2.23, "grad_norm": 101.6989974975586, "learning_rate": 0.00025751770579356787, "logits/chosen": -16.101152420043945, "logits/rejected": -16.463970184326172, "logps/chosen": -3029.56494140625, "logps/rejected": -3023.35107421875, "loss": 3.4126, "rewards/accuracies": 0.5, "rewards/chosen": -226.1511688232422, "rewards/margins": 4.4466047286987305, "rewards/rejected": -230.5977783203125, "step": 38520 }, { "epoch": 2.23, "grad_norm": 0.20012067258358002, "learning_rate": 0.0002573241998529355, "logits/chosen": -12.27376937866211, "logits/rejected": -12.583921432495117, "logps/chosen": -3084.07861328125, "logps/rejected": -2844.609375, "loss": 5.6082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -119.6286849975586, "rewards/margins": 1.5305354595184326, "rewards/rejected": -121.1592025756836, "step": 38530 }, { "epoch": 2.23, "grad_norm": 1.8539984204444214e-16, "learning_rate": 0.0002571306939123031, "logits/chosen": -15.390765190124512, "logits/rejected": -15.846054077148438, "logps/chosen": -3243.223876953125, "logps/rejected": -3195.708984375, "loss": 3.3621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.7213134765625, "rewards/margins": 13.089651107788086, "rewards/rejected": -151.81094360351562, "step": 38540 }, { "epoch": 2.23, "grad_norm": 135.0651092529297, "learning_rate": 0.0002569371879716708, "logits/chosen": -16.620393753051758, "logits/rejected": -18.780189514160156, "logps/chosen": -2892.89599609375, "logps/rejected": -2812.52197265625, "loss": 6.4195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.85745239257812, "rewards/margins": 11.431916236877441, "rewards/rejected": -199.28933715820312, "step": 38550 }, { "epoch": 2.23, "grad_norm": 123.16598510742188, "learning_rate": 0.0002567436820310384, "logits/chosen": -16.69301986694336, "logits/rejected": -16.440189361572266, "logps/chosen": -3086.601806640625, "logps/rejected": -3117.375244140625, "loss": 2.7465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.18154907226562, "rewards/margins": 6.041174411773682, "rewards/rejected": -230.2227020263672, "step": 38560 }, { "epoch": 2.23, "grad_norm": 0.04167845845222473, "learning_rate": 0.00025655017609040596, "logits/chosen": -14.180401802062988, "logits/rejected": -15.745089530944824, "logps/chosen": -3353.81005859375, "logps/rejected": -3009.53466796875, "loss": 2.4591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.3002166748047, "rewards/margins": 5.703819274902344, "rewards/rejected": -160.00405883789062, "step": 38570 }, { "epoch": 2.23, "grad_norm": 96.65787506103516, "learning_rate": 0.0002563566701497736, "logits/chosen": -14.71528434753418, "logits/rejected": -14.827290534973145, "logps/chosen": -3109.91845703125, "logps/rejected": -3124.06982421875, "loss": 4.0632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.67526245117188, "rewards/margins": 3.43597674369812, "rewards/rejected": -144.11123657226562, "step": 38580 }, { "epoch": 2.23, "grad_norm": 4.090139865875244, "learning_rate": 0.0002561631642091412, "logits/chosen": -15.7166166305542, "logits/rejected": -15.868176460266113, "logps/chosen": -3129.154052734375, "logps/rejected": -3078.220947265625, "loss": 6.2649, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -201.40286254882812, "rewards/margins": -4.924414157867432, "rewards/rejected": -196.47842407226562, "step": 38590 }, { "epoch": 2.23, "grad_norm": 82.7125244140625, "learning_rate": 0.00025596965826850886, "logits/chosen": -17.405075073242188, "logits/rejected": -18.626636505126953, "logps/chosen": -3044.58349609375, "logps/rejected": -3009.63525390625, "loss": 9.7598, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -201.45135498046875, "rewards/margins": -6.112936496734619, "rewards/rejected": -195.33837890625, "step": 38600 }, { "epoch": 2.23, "grad_norm": 0.0001556936331326142, "learning_rate": 0.0002557761523278765, "logits/chosen": -17.873004913330078, "logits/rejected": -19.17344093322754, "logps/chosen": -2480.77001953125, "logps/rejected": -2360.046630859375, "loss": 14.0174, "rewards/accuracies": 0.5, "rewards/chosen": -221.2709503173828, "rewards/margins": -9.624639511108398, "rewards/rejected": -211.6463165283203, "step": 38610 }, { "epoch": 2.24, "grad_norm": 17.043365478515625, "learning_rate": 0.0002555826463872441, "logits/chosen": -16.127368927001953, "logits/rejected": -16.278451919555664, "logps/chosen": -2722.86767578125, "logps/rejected": -2393.77294921875, "loss": 0.5035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -127.7564926147461, "rewards/margins": 14.57214641571045, "rewards/rejected": -142.32864379882812, "step": 38620 }, { "epoch": 2.24, "grad_norm": 8.634801815787796e-06, "learning_rate": 0.0002553891404466117, "logits/chosen": -19.346446990966797, "logits/rejected": -18.937408447265625, "logps/chosen": -2669.47119140625, "logps/rejected": -2777.692626953125, "loss": 3.4605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.6597900390625, "rewards/margins": 7.2423505783081055, "rewards/rejected": -219.90213012695312, "step": 38630 }, { "epoch": 2.24, "grad_norm": 14.221207618713379, "learning_rate": 0.00025519563450597933, "logits/chosen": -20.56231689453125, "logits/rejected": -21.065372467041016, "logps/chosen": -2575.5966796875, "logps/rejected": -2843.497802734375, "loss": 4.2143, "rewards/accuracies": 0.5, "rewards/chosen": -149.00503540039062, "rewards/margins": 2.5925450325012207, "rewards/rejected": -151.5975799560547, "step": 38640 }, { "epoch": 2.24, "grad_norm": 0.05296727642416954, "learning_rate": 0.00025500212856534695, "logits/chosen": -17.728843688964844, "logits/rejected": -18.581167221069336, "logps/chosen": -2919.191162109375, "logps/rejected": -2616.090087890625, "loss": 5.6087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -182.48226928710938, "rewards/margins": 6.758669853210449, "rewards/rejected": -189.24093627929688, "step": 38650 }, { "epoch": 2.24, "grad_norm": 0.008065209724009037, "learning_rate": 0.0002548086226247146, "logits/chosen": -17.41134262084961, "logits/rejected": -17.653644561767578, "logps/chosen": -2775.783447265625, "logps/rejected": -2869.970703125, "loss": 2.7295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.23245239257812, "rewards/margins": 5.350289344787598, "rewards/rejected": -214.58273315429688, "step": 38660 }, { "epoch": 2.24, "grad_norm": 7.658796619125496e-08, "learning_rate": 0.00025461511668408224, "logits/chosen": -17.319103240966797, "logits/rejected": -17.848979949951172, "logps/chosen": -2624.835693359375, "logps/rejected": -2813.19482421875, "loss": 6.4218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.59854125976562, "rewards/margins": 3.0371031761169434, "rewards/rejected": -153.63563537597656, "step": 38670 }, { "epoch": 2.24, "grad_norm": 22.514814376831055, "learning_rate": 0.0002544216107434498, "logits/chosen": -17.407739639282227, "logits/rejected": -17.40696144104004, "logps/chosen": -3081.2900390625, "logps/rejected": -3025.313232421875, "loss": 5.3578, "rewards/accuracies": 0.5, "rewards/chosen": -168.0240020751953, "rewards/margins": -0.901480495929718, "rewards/rejected": -167.1225128173828, "step": 38680 }, { "epoch": 2.24, "grad_norm": 1.2335897281445796e-06, "learning_rate": 0.0002542281048028174, "logits/chosen": -16.06255531311035, "logits/rejected": -16.33553695678711, "logps/chosen": -2906.69970703125, "logps/rejected": -2492.59033203125, "loss": 15.7638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.15373229980469, "rewards/margins": -8.388851165771484, "rewards/rejected": -101.76487731933594, "step": 38690 }, { "epoch": 2.24, "grad_norm": 0.5766881704330444, "learning_rate": 0.00025403459886218504, "logits/chosen": -19.261157989501953, "logits/rejected": -19.438587188720703, "logps/chosen": -2601.85205078125, "logps/rejected": -2498.37841796875, "loss": 12.3097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -230.939697265625, "rewards/margins": -6.6937127113342285, "rewards/rejected": -224.2459716796875, "step": 38700 }, { "epoch": 2.24, "grad_norm": 3.019916334778827e-07, "learning_rate": 0.0002538410929215527, "logits/chosen": -13.817822456359863, "logits/rejected": -13.843690872192383, "logps/chosen": -3481.630859375, "logps/rejected": -3281.624267578125, "loss": 1.4022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -76.94627380371094, "rewards/margins": 3.7516989707946777, "rewards/rejected": -80.69796752929688, "step": 38710 }, { "epoch": 2.24, "grad_norm": 0.0004806615470442921, "learning_rate": 0.0002536475869809203, "logits/chosen": -15.975252151489258, "logits/rejected": -16.166940689086914, "logps/chosen": -2908.79443359375, "logps/rejected": -2330.46044921875, "loss": 1.0163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -121.16218566894531, "rewards/margins": 20.09884262084961, "rewards/rejected": -141.2610321044922, "step": 38720 }, { "epoch": 2.24, "grad_norm": 7.272271318470303e-08, "learning_rate": 0.00025345408104028794, "logits/chosen": -18.02541160583496, "logits/rejected": -19.114097595214844, "logps/chosen": -2917.57275390625, "logps/rejected": -2969.64794921875, "loss": 3.1628, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -185.0098419189453, "rewards/margins": 12.532689094543457, "rewards/rejected": -197.5425567626953, "step": 38730 }, { "epoch": 2.24, "grad_norm": 8.771748127855972e-08, "learning_rate": 0.00025326057509965556, "logits/chosen": -16.22815704345703, "logits/rejected": -16.267236709594727, "logps/chosen": -2732.9482421875, "logps/rejected": -2559.90869140625, "loss": 12.5269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.94789123535156, "rewards/margins": -6.04599666595459, "rewards/rejected": -154.90188598632812, "step": 38740 }, { "epoch": 2.24, "grad_norm": 6.963902232648422e-10, "learning_rate": 0.0002530670691590232, "logits/chosen": -16.24583625793457, "logits/rejected": -17.14292335510254, "logps/chosen": -2928.729736328125, "logps/rejected": -2711.597412109375, "loss": 0.1585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -132.98336791992188, "rewards/margins": 13.90483283996582, "rewards/rejected": -146.88819885253906, "step": 38750 }, { "epoch": 2.24, "grad_norm": 0.009207426570355892, "learning_rate": 0.00025287356321839085, "logits/chosen": -15.631794929504395, "logits/rejected": -15.958230972290039, "logps/chosen": -2843.96630859375, "logps/rejected": -2737.682373046875, "loss": 2.7663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -201.1683349609375, "rewards/margins": 7.058821678161621, "rewards/rejected": -208.22714233398438, "step": 38760 }, { "epoch": 2.24, "grad_norm": 0.8500522375106812, "learning_rate": 0.00025268005727775846, "logits/chosen": -16.9083251953125, "logits/rejected": -17.228336334228516, "logps/chosen": -2891.21875, "logps/rejected": -2961.01953125, "loss": 2.9863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -251.21829223632812, "rewards/margins": 8.719938278198242, "rewards/rejected": -259.938232421875, "step": 38770 }, { "epoch": 2.24, "grad_norm": 8.534086326505985e-09, "learning_rate": 0.0002524865513371261, "logits/chosen": -16.58814811706543, "logits/rejected": -16.96476936340332, "logps/chosen": -3050.79248046875, "logps/rejected": -2587.36474609375, "loss": 0.8056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -168.96420288085938, "rewards/margins": 9.498188018798828, "rewards/rejected": -178.46240234375, "step": 38780 }, { "epoch": 2.25, "grad_norm": 9.396288191965141e-08, "learning_rate": 0.00025229304539649364, "logits/chosen": -17.312044143676758, "logits/rejected": -17.656185150146484, "logps/chosen": -3224.75244140625, "logps/rejected": -3117.796630859375, "loss": 14.6743, "rewards/accuracies": 0.5, "rewards/chosen": -248.6461944580078, "rewards/margins": -7.152016639709473, "rewards/rejected": -241.4941864013672, "step": 38790 }, { "epoch": 2.25, "grad_norm": 0.016030598431825638, "learning_rate": 0.00025209953945586126, "logits/chosen": -15.464472770690918, "logits/rejected": -16.092111587524414, "logps/chosen": -2816.47802734375, "logps/rejected": -2464.7646484375, "loss": 0.4323, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -141.5458984375, "rewards/margins": 11.209691047668457, "rewards/rejected": -152.75558471679688, "step": 38800 }, { "epoch": 2.25, "grad_norm": 0.002306624548509717, "learning_rate": 0.00025190603351522893, "logits/chosen": -14.156161308288574, "logits/rejected": -14.621793746948242, "logps/chosen": -2734.30419921875, "logps/rejected": -2502.2705078125, "loss": 2.605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.3719482421875, "rewards/margins": 23.894258499145508, "rewards/rejected": -190.26620483398438, "step": 38810 }, { "epoch": 2.25, "grad_norm": 49.3006706237793, "learning_rate": 0.00025171252757459655, "logits/chosen": -15.909688949584961, "logits/rejected": -16.42860221862793, "logps/chosen": -2784.220703125, "logps/rejected": -2886.885986328125, "loss": 17.1687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.39418029785156, "rewards/margins": -9.873757362365723, "rewards/rejected": -185.52041625976562, "step": 38820 }, { "epoch": 2.25, "grad_norm": 21.94658088684082, "learning_rate": 0.00025151902163396417, "logits/chosen": -14.52003002166748, "logits/rejected": -16.048913955688477, "logps/chosen": -2819.09326171875, "logps/rejected": -2696.217529296875, "loss": 8.2223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -208.21456909179688, "rewards/margins": -0.3488601744174957, "rewards/rejected": -207.86572265625, "step": 38830 }, { "epoch": 2.25, "grad_norm": 0.00023170496569946408, "learning_rate": 0.0002513255156933318, "logits/chosen": -15.930331230163574, "logits/rejected": -16.092117309570312, "logps/chosen": -2393.8994140625, "logps/rejected": -2482.185791015625, "loss": 0.7512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -200.62753295898438, "rewards/margins": 7.962094306945801, "rewards/rejected": -208.58963012695312, "step": 38840 }, { "epoch": 2.25, "grad_norm": 1.275855302810669, "learning_rate": 0.0002511320097526994, "logits/chosen": -13.378198623657227, "logits/rejected": -14.492101669311523, "logps/chosen": -2739.66162109375, "logps/rejected": -2869.88623046875, "loss": 1.9044, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -173.34971618652344, "rewards/margins": 8.386049270629883, "rewards/rejected": -181.73577880859375, "step": 38850 }, { "epoch": 2.25, "grad_norm": 6.065029538149247e-06, "learning_rate": 0.000250938503812067, "logits/chosen": -15.498750686645508, "logits/rejected": -15.9945707321167, "logps/chosen": -2475.517578125, "logps/rejected": -2523.13037109375, "loss": 1.3893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.1244354248047, "rewards/margins": 12.362375259399414, "rewards/rejected": -224.48681640625, "step": 38860 }, { "epoch": 2.25, "grad_norm": 5.938079993939027e-06, "learning_rate": 0.0002507449978714347, "logits/chosen": -15.428471565246582, "logits/rejected": -16.07602310180664, "logps/chosen": -2943.678466796875, "logps/rejected": -3012.994384765625, "loss": 1.4111, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -178.34988403320312, "rewards/margins": 10.745767593383789, "rewards/rejected": -189.09564208984375, "step": 38870 }, { "epoch": 2.25, "grad_norm": 0.1341775357723236, "learning_rate": 0.0002505514919308023, "logits/chosen": -14.832490921020508, "logits/rejected": -17.65556526184082, "logps/chosen": -2857.362548828125, "logps/rejected": -3081.725830078125, "loss": 0.375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.31417846679688, "rewards/margins": 9.654489517211914, "rewards/rejected": -168.96865844726562, "step": 38880 }, { "epoch": 2.25, "grad_norm": 0.000974534428678453, "learning_rate": 0.0002503579859901699, "logits/chosen": -16.52142906188965, "logits/rejected": -16.04198455810547, "logps/chosen": -2922.498779296875, "logps/rejected": -3115.579345703125, "loss": 2.1206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -258.393798828125, "rewards/margins": 14.93433952331543, "rewards/rejected": -273.3281555175781, "step": 38890 }, { "epoch": 2.25, "grad_norm": 11.170071601867676, "learning_rate": 0.0002501644800495375, "logits/chosen": -16.181299209594727, "logits/rejected": -17.47407341003418, "logps/chosen": -2715.409423828125, "logps/rejected": -2621.73681640625, "loss": 3.505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -196.39308166503906, "rewards/margins": 3.8915622234344482, "rewards/rejected": -200.28463745117188, "step": 38900 }, { "epoch": 2.25, "grad_norm": 69.99031829833984, "learning_rate": 0.00024997097410890516, "logits/chosen": -15.176004409790039, "logits/rejected": -15.441740036010742, "logps/chosen": -2992.90478515625, "logps/rejected": -2783.639404296875, "loss": 4.4428, "rewards/accuracies": 0.5, "rewards/chosen": -159.76475524902344, "rewards/margins": -0.24239110946655273, "rewards/rejected": -159.52236938476562, "step": 38910 }, { "epoch": 2.25, "grad_norm": 1.0353374481201172, "learning_rate": 0.0002497774681682728, "logits/chosen": -16.23763084411621, "logits/rejected": -16.331157684326172, "logps/chosen": -3179.04931640625, "logps/rejected": -3194.766845703125, "loss": 2.3181, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.16012573242188, "rewards/margins": 0.4698794484138489, "rewards/rejected": -170.6300048828125, "step": 38920 }, { "epoch": 2.25, "grad_norm": 2.1515674234251492e-05, "learning_rate": 0.0002495839622276404, "logits/chosen": -14.968225479125977, "logits/rejected": -14.906393051147461, "logps/chosen": -2880.955322265625, "logps/rejected": -2624.433349609375, "loss": 3.4128, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.92630004882812, "rewards/margins": 5.0684003829956055, "rewards/rejected": -182.99468994140625, "step": 38930 }, { "epoch": 2.25, "grad_norm": 2.7824260087072616e-06, "learning_rate": 0.000249390456287008, "logits/chosen": -17.40938949584961, "logits/rejected": -19.805889129638672, "logps/chosen": -2749.8447265625, "logps/rejected": -2370.44677734375, "loss": 5.8399, "rewards/accuracies": 0.5, "rewards/chosen": -170.7592315673828, "rewards/margins": 13.638994216918945, "rewards/rejected": -184.39822387695312, "step": 38940 }, { "epoch": 2.25, "grad_norm": 1.4365031120178173e-07, "learning_rate": 0.00024919695034637563, "logits/chosen": -15.387794494628906, "logits/rejected": -15.52247142791748, "logps/chosen": -2612.56298828125, "logps/rejected": -2465.910888671875, "loss": 11.8482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.60006713867188, "rewards/margins": -5.883747100830078, "rewards/rejected": -162.71632385253906, "step": 38950 }, { "epoch": 2.26, "grad_norm": 51.95261001586914, "learning_rate": 0.00024900344440574324, "logits/chosen": -18.76837730407715, "logits/rejected": -18.194644927978516, "logps/chosen": -2398.59619140625, "logps/rejected": -2263.7666015625, "loss": 19.3156, "rewards/accuracies": 0.5, "rewards/chosen": -213.88632202148438, "rewards/margins": -14.64466381072998, "rewards/rejected": -199.24166870117188, "step": 38960 }, { "epoch": 2.26, "grad_norm": 4.1513166427612305, "learning_rate": 0.00024880993846511086, "logits/chosen": -15.977357864379883, "logits/rejected": -16.038131713867188, "logps/chosen": -2551.66796875, "logps/rejected": -2329.14013671875, "loss": 3.8396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.77210998535156, "rewards/margins": 6.192986965179443, "rewards/rejected": -127.96507263183594, "step": 38970 }, { "epoch": 2.26, "grad_norm": 2.1436154185039413e-08, "learning_rate": 0.00024861643252447853, "logits/chosen": -19.34918975830078, "logits/rejected": -19.81741714477539, "logps/chosen": -3125.104736328125, "logps/rejected": -2553.0546875, "loss": 12.1301, "rewards/accuracies": 0.5, "rewards/chosen": -136.91334533691406, "rewards/margins": 1.4175949096679688, "rewards/rejected": -138.33096313476562, "step": 38980 }, { "epoch": 2.26, "grad_norm": 0.06455078721046448, "learning_rate": 0.00024842292658384615, "logits/chosen": -16.812898635864258, "logits/rejected": -17.58980369567871, "logps/chosen": -3178.030029296875, "logps/rejected": -3097.02001953125, "loss": 9.8256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -199.5381317138672, "rewards/margins": 2.835287570953369, "rewards/rejected": -202.37342834472656, "step": 38990 }, { "epoch": 2.26, "grad_norm": 5.352358493837528e-05, "learning_rate": 0.00024822942064321377, "logits/chosen": -16.910045623779297, "logits/rejected": -18.300039291381836, "logps/chosen": -3156.24072265625, "logps/rejected": -3218.091552734375, "loss": 1.1444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -134.52093505859375, "rewards/margins": 12.856117248535156, "rewards/rejected": -147.37705993652344, "step": 39000 }, { "epoch": 2.26, "grad_norm": 143.7860565185547, "learning_rate": 0.0002480359147025814, "logits/chosen": -18.316455841064453, "logits/rejected": -18.147111892700195, "logps/chosen": -2965.87451171875, "logps/rejected": -2685.9716796875, "loss": 3.1091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.0027313232422, "rewards/margins": 4.306826591491699, "rewards/rejected": -158.3095703125, "step": 39010 }, { "epoch": 2.26, "grad_norm": 132.67259216308594, "learning_rate": 0.000247842408761949, "logits/chosen": -18.449323654174805, "logits/rejected": -18.58378791809082, "logps/chosen": -2852.833984375, "logps/rejected": -2926.63818359375, "loss": 0.9272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -141.73959350585938, "rewards/margins": 12.768423080444336, "rewards/rejected": -154.50802612304688, "step": 39020 }, { "epoch": 2.26, "grad_norm": 8.647536277770996, "learning_rate": 0.0002476489028213166, "logits/chosen": -18.83651351928711, "logits/rejected": -18.74681282043457, "logps/chosen": -2856.7841796875, "logps/rejected": -2864.593505859375, "loss": 2.8665, "rewards/accuracies": 0.5, "rewards/chosen": -186.0424346923828, "rewards/margins": 0.9328594207763672, "rewards/rejected": -186.97531127929688, "step": 39030 }, { "epoch": 2.26, "grad_norm": 7.325205615416053e-07, "learning_rate": 0.00024745539688068424, "logits/chosen": -18.53121566772461, "logits/rejected": -19.33173370361328, "logps/chosen": -2954.119384765625, "logps/rejected": -2926.778564453125, "loss": 14.7537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.33383178710938, "rewards/margins": -3.6157989501953125, "rewards/rejected": -146.71800231933594, "step": 39040 }, { "epoch": 2.26, "grad_norm": 5.15493631362915, "learning_rate": 0.00024726189094005185, "logits/chosen": -18.271751403808594, "logits/rejected": -20.176868438720703, "logps/chosen": -2923.722900390625, "logps/rejected": -2943.79345703125, "loss": 0.4894, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -216.8374481201172, "rewards/margins": 14.391505241394043, "rewards/rejected": -231.2289581298828, "step": 39050 }, { "epoch": 2.26, "grad_norm": 0.2180999219417572, "learning_rate": 0.0002470683849994195, "logits/chosen": -19.435138702392578, "logits/rejected": -19.825197219848633, "logps/chosen": -2745.533447265625, "logps/rejected": -2789.27294921875, "loss": 1.3089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -240.0293731689453, "rewards/margins": 6.426194667816162, "rewards/rejected": -246.45553588867188, "step": 39060 }, { "epoch": 2.26, "grad_norm": 1.0832574367523193, "learning_rate": 0.0002468748790587871, "logits/chosen": -17.439537048339844, "logits/rejected": -17.425029754638672, "logps/chosen": -2834.3955078125, "logps/rejected": -2902.75439453125, "loss": 1.0071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -226.5455322265625, "rewards/margins": 11.285599708557129, "rewards/rejected": -237.8311004638672, "step": 39070 }, { "epoch": 2.26, "grad_norm": 22.949092864990234, "learning_rate": 0.0002466813731181547, "logits/chosen": -15.281306266784668, "logits/rejected": -15.354777336120605, "logps/chosen": -3090.616455078125, "logps/rejected": -2980.44384765625, "loss": 7.0358, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -140.12974548339844, "rewards/margins": -0.15740099549293518, "rewards/rejected": -139.9723358154297, "step": 39080 }, { "epoch": 2.26, "grad_norm": 0.0002758472692221403, "learning_rate": 0.0002464878671775224, "logits/chosen": -19.237064361572266, "logits/rejected": -20.33078384399414, "logps/chosen": -2940.810546875, "logps/rejected": -3065.36181640625, "loss": 0.3863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -261.1905822753906, "rewards/margins": 17.718042373657227, "rewards/rejected": -278.90863037109375, "step": 39090 }, { "epoch": 2.26, "grad_norm": 1.096201913242112e-06, "learning_rate": 0.00024629436123689, "logits/chosen": -20.119993209838867, "logits/rejected": -21.116174697875977, "logps/chosen": -2870.11669921875, "logps/rejected": -2927.1240234375, "loss": 0.1582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -185.49327087402344, "rewards/margins": 9.500059127807617, "rewards/rejected": -194.9933319091797, "step": 39100 }, { "epoch": 2.26, "grad_norm": 68.8990478515625, "learning_rate": 0.0002461008552962576, "logits/chosen": -17.633546829223633, "logits/rejected": -18.629283905029297, "logps/chosen": -3139.218017578125, "logps/rejected": -3066.720947265625, "loss": 2.1671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.0120086669922, "rewards/margins": 8.593620300292969, "rewards/rejected": -212.60562133789062, "step": 39110 }, { "epoch": 2.26, "grad_norm": 30.23936653137207, "learning_rate": 0.00024590734935562523, "logits/chosen": -16.743764877319336, "logits/rejected": -16.755233764648438, "logps/chosen": -2784.2314453125, "logps/rejected": -2706.67041015625, "loss": 2.0596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.4276580810547, "rewards/margins": 11.911707878112793, "rewards/rejected": -149.33935546875, "step": 39120 }, { "epoch": 2.26, "grad_norm": 104.48724365234375, "learning_rate": 0.00024571384341499285, "logits/chosen": -17.89826202392578, "logits/rejected": -18.975175857543945, "logps/chosen": -2608.189697265625, "logps/rejected": -2666.061279296875, "loss": 3.4518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -176.4268341064453, "rewards/margins": 7.549508094787598, "rewards/rejected": -183.976318359375, "step": 39130 }, { "epoch": 2.27, "grad_norm": 104.35111999511719, "learning_rate": 0.00024552033747436046, "logits/chosen": -15.945060729980469, "logits/rejected": -15.92113208770752, "logps/chosen": -2715.159912109375, "logps/rejected": -2813.756591796875, "loss": 3.9097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.44354248046875, "rewards/margins": 16.06011199951172, "rewards/rejected": -148.50364685058594, "step": 39140 }, { "epoch": 2.27, "grad_norm": 51.443607330322266, "learning_rate": 0.0002453268315337281, "logits/chosen": -19.50392723083496, "logits/rejected": -19.337467193603516, "logps/chosen": -2967.77099609375, "logps/rejected": -3070.130126953125, "loss": 4.8433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -267.9001159667969, "rewards/margins": 11.57823657989502, "rewards/rejected": -279.47833251953125, "step": 39150 }, { "epoch": 2.27, "grad_norm": 0.6468173265457153, "learning_rate": 0.0002451333255930957, "logits/chosen": -17.27675437927246, "logits/rejected": -17.538055419921875, "logps/chosen": -2663.653564453125, "logps/rejected": -2789.97900390625, "loss": 1.2133, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -154.63223266601562, "rewards/margins": 16.77696990966797, "rewards/rejected": -171.40921020507812, "step": 39160 }, { "epoch": 2.27, "grad_norm": 25.13298988342285, "learning_rate": 0.00024493981965246337, "logits/chosen": -15.706072807312012, "logits/rejected": -15.80872631072998, "logps/chosen": -2622.9208984375, "logps/rejected": -2662.896728515625, "loss": 4.6377, "rewards/accuracies": 0.5, "rewards/chosen": -193.24212646484375, "rewards/margins": 3.2986671924591064, "rewards/rejected": -196.54080200195312, "step": 39170 }, { "epoch": 2.27, "grad_norm": 0.04103995859622955, "learning_rate": 0.00024474631371183093, "logits/chosen": -13.579035758972168, "logits/rejected": -13.692428588867188, "logps/chosen": -3115.00634765625, "logps/rejected": -3257.72802734375, "loss": 1.9782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -211.0945281982422, "rewards/margins": 6.658295631408691, "rewards/rejected": -217.75283813476562, "step": 39180 }, { "epoch": 2.27, "grad_norm": 0.012791779823601246, "learning_rate": 0.0002445528077711986, "logits/chosen": -12.722164154052734, "logits/rejected": -11.580266952514648, "logps/chosen": -3451.147216796875, "logps/rejected": -3033.89208984375, "loss": 5.1408, "rewards/accuracies": 0.5, "rewards/chosen": -132.48220825195312, "rewards/margins": 4.909514427185059, "rewards/rejected": -137.3917236328125, "step": 39190 }, { "epoch": 2.27, "grad_norm": 2.333265136655882e-09, "learning_rate": 0.0002443593018305662, "logits/chosen": -13.397976875305176, "logits/rejected": -14.039663314819336, "logps/chosen": -3325.748046875, "logps/rejected": -3077.927001953125, "loss": 2.2811, "rewards/accuracies": 0.5, "rewards/chosen": -129.72811889648438, "rewards/margins": 3.5594534873962402, "rewards/rejected": -133.28756713867188, "step": 39200 }, { "epoch": 2.27, "grad_norm": 3.190362107829969e-08, "learning_rate": 0.00024416579588993384, "logits/chosen": -11.666779518127441, "logits/rejected": -11.822669982910156, "logps/chosen": -2641.72216796875, "logps/rejected": -2544.38916015625, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -137.5406951904297, "rewards/margins": 13.216604232788086, "rewards/rejected": -150.75730895996094, "step": 39210 }, { "epoch": 2.27, "grad_norm": 77.82946014404297, "learning_rate": 0.00024397228994930145, "logits/chosen": -12.697383880615234, "logits/rejected": -12.85998821258545, "logps/chosen": -3222.88623046875, "logps/rejected": -3129.454345703125, "loss": 0.6105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -134.62754821777344, "rewards/margins": 15.91931438446045, "rewards/rejected": -150.54685974121094, "step": 39220 }, { "epoch": 2.27, "grad_norm": 0.00019380422600079328, "learning_rate": 0.00024377878400866907, "logits/chosen": -17.24064826965332, "logits/rejected": -15.582273483276367, "logps/chosen": -3251.6962890625, "logps/rejected": -2787.2607421875, "loss": 2.2145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -137.37344360351562, "rewards/margins": 5.141529083251953, "rewards/rejected": -142.51498413085938, "step": 39230 }, { "epoch": 2.27, "grad_norm": 41.26127243041992, "learning_rate": 0.0002435852780680367, "logits/chosen": -14.991229057312012, "logits/rejected": -16.007221221923828, "logps/chosen": -3153.63671875, "logps/rejected": -2906.513427734375, "loss": 3.3304, "rewards/accuracies": 0.5, "rewards/chosen": -206.2112579345703, "rewards/margins": 0.4804078936576843, "rewards/rejected": -206.691650390625, "step": 39240 }, { "epoch": 2.27, "grad_norm": 4.228877969580935e-06, "learning_rate": 0.00024339177212740433, "logits/chosen": -14.542097091674805, "logits/rejected": -15.941293716430664, "logps/chosen": -2838.67236328125, "logps/rejected": -3013.967041015625, "loss": 2.2897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -150.9506378173828, "rewards/margins": 10.581171989440918, "rewards/rejected": -161.5318145751953, "step": 39250 }, { "epoch": 2.27, "grad_norm": 25.169273376464844, "learning_rate": 0.00024319826618677192, "logits/chosen": -15.963091850280762, "logits/rejected": -15.993464469909668, "logps/chosen": -3093.313720703125, "logps/rejected": -2882.0810546875, "loss": 3.5417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -208.3681182861328, "rewards/margins": 23.762174606323242, "rewards/rejected": -232.13027954101562, "step": 39260 }, { "epoch": 2.27, "grad_norm": 61.656028747558594, "learning_rate": 0.00024300476024613957, "logits/chosen": -17.015439987182617, "logits/rejected": -17.566373825073242, "logps/chosen": -2804.84814453125, "logps/rejected": -2436.1533203125, "loss": 18.8588, "rewards/accuracies": 0.5, "rewards/chosen": -174.66278076171875, "rewards/margins": -12.926612854003906, "rewards/rejected": -161.73617553710938, "step": 39270 }, { "epoch": 2.27, "grad_norm": 94.54654693603516, "learning_rate": 0.00024281125430550718, "logits/chosen": -13.988555908203125, "logits/rejected": -14.7778902053833, "logps/chosen": -3052.117919921875, "logps/rejected": -2926.86279296875, "loss": 4.4053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.35142517089844, "rewards/margins": 7.108696937561035, "rewards/rejected": -138.46011352539062, "step": 39280 }, { "epoch": 2.27, "grad_norm": 36.06351089477539, "learning_rate": 0.0002426177483648748, "logits/chosen": -15.90429401397705, "logits/rejected": -15.692072868347168, "logps/chosen": -2995.05712890625, "logps/rejected": -3059.561279296875, "loss": 3.0954, "rewards/accuracies": 0.5, "rewards/chosen": -224.1992645263672, "rewards/margins": 7.577375888824463, "rewards/rejected": -231.77664184570312, "step": 39290 }, { "epoch": 2.27, "grad_norm": 0.4328824579715729, "learning_rate": 0.00024242424242424245, "logits/chosen": -16.267133712768555, "logits/rejected": -17.436054229736328, "logps/chosen": -2831.4326171875, "logps/rejected": -2800.80419921875, "loss": 0.2699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -141.58319091796875, "rewards/margins": 13.3864164352417, "rewards/rejected": -154.96958923339844, "step": 39300 }, { "epoch": 2.28, "grad_norm": 7.062702707116841e-07, "learning_rate": 0.00024223073648361004, "logits/chosen": -17.334941864013672, "logits/rejected": -17.760353088378906, "logps/chosen": -2515.5537109375, "logps/rejected": -2499.037109375, "loss": 1.3001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.82272338867188, "rewards/margins": 9.779123306274414, "rewards/rejected": -227.6018829345703, "step": 39310 }, { "epoch": 2.28, "grad_norm": 0.0107555091381073, "learning_rate": 0.00024203723054297768, "logits/chosen": -20.171239852905273, "logits/rejected": -20.40865707397461, "logps/chosen": -2730.85498046875, "logps/rejected": -2571.70263671875, "loss": 15.9654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -210.0247802734375, "rewards/margins": -10.081876754760742, "rewards/rejected": -199.94290161132812, "step": 39320 }, { "epoch": 2.28, "grad_norm": 0.0001312734530074522, "learning_rate": 0.0002418437246023453, "logits/chosen": -18.662622451782227, "logits/rejected": -19.837419509887695, "logps/chosen": -2830.30908203125, "logps/rejected": -2920.894775390625, "loss": 0.8889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -174.23428344726562, "rewards/margins": 10.066915512084961, "rewards/rejected": -184.30120849609375, "step": 39330 }, { "epoch": 2.28, "grad_norm": 4.4806828469745597e-08, "learning_rate": 0.00024165021866171291, "logits/chosen": -15.68644905090332, "logits/rejected": -16.350116729736328, "logps/chosen": -2959.239501953125, "logps/rejected": -2913.641845703125, "loss": 7.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -222.006103515625, "rewards/margins": -1.2610397338867188, "rewards/rejected": -220.7450408935547, "step": 39340 }, { "epoch": 2.28, "grad_norm": 0.00015277879720088094, "learning_rate": 0.00024145671272108056, "logits/chosen": -17.751720428466797, "logits/rejected": -18.058975219726562, "logps/chosen": -2901.17919921875, "logps/rejected": -2678.7568359375, "loss": 6.6112, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -212.72933959960938, "rewards/margins": 5.8076887130737305, "rewards/rejected": -218.5370330810547, "step": 39350 }, { "epoch": 2.28, "grad_norm": 2.1544313430786133, "learning_rate": 0.00024126320678044818, "logits/chosen": -18.521968841552734, "logits/rejected": -20.381786346435547, "logps/chosen": -2824.81396484375, "logps/rejected": -2748.556396484375, "loss": 3.1326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -215.8360595703125, "rewards/margins": 1.6672375202178955, "rewards/rejected": -217.5032958984375, "step": 39360 }, { "epoch": 2.28, "grad_norm": 1.1173936798059003e-07, "learning_rate": 0.00024106970083981577, "logits/chosen": -13.134173393249512, "logits/rejected": -13.125322341918945, "logps/chosen": -3290.85595703125, "logps/rejected": -2962.966552734375, "loss": 2.8447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -41.816810607910156, "rewards/margins": 12.338092803955078, "rewards/rejected": -54.1549072265625, "step": 39370 }, { "epoch": 2.28, "grad_norm": 0.0, "learning_rate": 0.0002408761948991834, "logits/chosen": -17.647829055786133, "logits/rejected": -19.103557586669922, "logps/chosen": -2713.975341796875, "logps/rejected": -2538.19775390625, "loss": 3.1115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.73471069335938, "rewards/margins": 6.007058620452881, "rewards/rejected": -147.74179077148438, "step": 39380 }, { "epoch": 2.28, "grad_norm": 2.300587798779047e-21, "learning_rate": 0.00024068268895855103, "logits/chosen": -16.440462112426758, "logits/rejected": -16.836456298828125, "logps/chosen": -2554.45068359375, "logps/rejected": -2458.41455078125, "loss": 3.4302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.43789672851562, "rewards/margins": 14.528974533081055, "rewards/rejected": -199.96685791015625, "step": 39390 }, { "epoch": 2.28, "grad_norm": 53.12958908081055, "learning_rate": 0.00024048918301791867, "logits/chosen": -16.40176773071289, "logits/rejected": -16.59316062927246, "logps/chosen": -2606.10009765625, "logps/rejected": -2783.264404296875, "loss": 19.3464, "rewards/accuracies": 0.5, "rewards/chosen": -161.1728515625, "rewards/margins": -12.559366226196289, "rewards/rejected": -148.61349487304688, "step": 39400 }, { "epoch": 2.28, "grad_norm": 3.112522506398818e-08, "learning_rate": 0.0002402956770772863, "logits/chosen": -16.737621307373047, "logits/rejected": -17.14153480529785, "logps/chosen": -2647.26806640625, "logps/rejected": -2647.91015625, "loss": 2.0633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.16160583496094, "rewards/margins": 10.15672779083252, "rewards/rejected": -174.31832885742188, "step": 39410 }, { "epoch": 2.28, "grad_norm": 0.009165425784885883, "learning_rate": 0.00024010217113665388, "logits/chosen": -14.1541109085083, "logits/rejected": -16.15494155883789, "logps/chosen": -3150.0380859375, "logps/rejected": -2682.256103515625, "loss": 0.6364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -88.59573364257812, "rewards/margins": 16.017168045043945, "rewards/rejected": -104.6128921508789, "step": 39420 }, { "epoch": 2.28, "grad_norm": 0.024423377588391304, "learning_rate": 0.00023990866519602152, "logits/chosen": -14.158378601074219, "logits/rejected": -14.570388793945312, "logps/chosen": -3147.69140625, "logps/rejected": -3231.9951171875, "loss": 3.1047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.37069702148438, "rewards/margins": 3.3569037914276123, "rewards/rejected": -143.72763061523438, "step": 39430 }, { "epoch": 2.28, "grad_norm": 6.60382080078125, "learning_rate": 0.00023971515925538914, "logits/chosen": -17.757736206054688, "logits/rejected": -17.691633224487305, "logps/chosen": -2753.12646484375, "logps/rejected": -2774.734375, "loss": 3.6723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.9835205078125, "rewards/margins": 2.0657413005828857, "rewards/rejected": -205.0492706298828, "step": 39440 }, { "epoch": 2.28, "grad_norm": 3.698995953982376e-07, "learning_rate": 0.00023952165331475676, "logits/chosen": -14.076220512390137, "logits/rejected": -16.10068130493164, "logps/chosen": -3232.81787109375, "logps/rejected": -2884.21826171875, "loss": 1.838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.7644500732422, "rewards/margins": 17.340118408203125, "rewards/rejected": -189.10458374023438, "step": 39450 }, { "epoch": 2.28, "grad_norm": 0.00024277414195239544, "learning_rate": 0.0002393281473741244, "logits/chosen": -16.28803062438965, "logits/rejected": -17.463171005249023, "logps/chosen": -2931.75927734375, "logps/rejected": -2810.676025390625, "loss": 0.663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -183.42364501953125, "rewards/margins": 20.666025161743164, "rewards/rejected": -204.0896759033203, "step": 39460 }, { "epoch": 2.28, "grad_norm": 0.5725802779197693, "learning_rate": 0.00023913464143349202, "logits/chosen": -17.6317195892334, "logits/rejected": -18.463659286499023, "logps/chosen": -2720.0595703125, "logps/rejected": -2900.42919921875, "loss": 2.9572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.84674072265625, "rewards/margins": 2.447404384613037, "rewards/rejected": -158.2941436767578, "step": 39470 }, { "epoch": 2.29, "grad_norm": 0.005328808911144733, "learning_rate": 0.00023894113549285964, "logits/chosen": -17.12151336669922, "logits/rejected": -20.213891983032227, "logps/chosen": -2848.67041015625, "logps/rejected": -2567.64453125, "loss": 0.4145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -174.44723510742188, "rewards/margins": 10.677305221557617, "rewards/rejected": -185.12454223632812, "step": 39480 }, { "epoch": 2.29, "grad_norm": 0.0, "learning_rate": 0.00023874762955222725, "logits/chosen": -18.255197525024414, "logits/rejected": -19.796083450317383, "logps/chosen": -2735.26318359375, "logps/rejected": -2662.900634765625, "loss": 3.9253, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.9628143310547, "rewards/margins": 13.160229682922363, "rewards/rejected": -219.123046875, "step": 39490 }, { "epoch": 2.29, "grad_norm": 5.1682956581089456e-08, "learning_rate": 0.00023855412361159487, "logits/chosen": -17.387493133544922, "logits/rejected": -18.0288028717041, "logps/chosen": -2618.427490234375, "logps/rejected": -2549.32080078125, "loss": 0.8337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -191.98025512695312, "rewards/margins": 18.61301612854004, "rewards/rejected": -210.59326171875, "step": 39500 }, { "epoch": 2.29, "grad_norm": 1288.1439208984375, "learning_rate": 0.00023836061767096252, "logits/chosen": -15.111017227172852, "logits/rejected": -15.442591667175293, "logps/chosen": -2685.350830078125, "logps/rejected": -2335.81396484375, "loss": 2.7798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -127.44136047363281, "rewards/margins": 23.9884090423584, "rewards/rejected": -151.42974853515625, "step": 39510 }, { "epoch": 2.29, "grad_norm": 4.921811580657959, "learning_rate": 0.00023816711173033013, "logits/chosen": -14.39710521697998, "logits/rejected": -14.678024291992188, "logps/chosen": -2870.032470703125, "logps/rejected": -2840.213623046875, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -138.73263549804688, "rewards/margins": 15.316927909851074, "rewards/rejected": -154.04959106445312, "step": 39520 }, { "epoch": 2.29, "grad_norm": 0.05118131265044212, "learning_rate": 0.00023797360578969775, "logits/chosen": -17.857833862304688, "logits/rejected": -18.142475128173828, "logps/chosen": -2760.791748046875, "logps/rejected": -2801.561767578125, "loss": 5.0086, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -200.50442504882812, "rewards/margins": 1.218313217163086, "rewards/rejected": -201.72274780273438, "step": 39530 }, { "epoch": 2.29, "grad_norm": 0.02015691250562668, "learning_rate": 0.00023778009984906537, "logits/chosen": -16.219690322875977, "logits/rejected": -16.244022369384766, "logps/chosen": -3256.21826171875, "logps/rejected": -3343.785888671875, "loss": 0.1293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -205.9517059326172, "rewards/margins": 10.406204223632812, "rewards/rejected": -216.35791015625, "step": 39540 }, { "epoch": 2.29, "grad_norm": 108.3630142211914, "learning_rate": 0.00023758659390843298, "logits/chosen": -12.937283515930176, "logits/rejected": -13.62103271484375, "logps/chosen": -2934.18359375, "logps/rejected": -3081.991455078125, "loss": 8.0277, "rewards/accuracies": 0.5, "rewards/chosen": -134.85267639160156, "rewards/margins": -0.5365182757377625, "rewards/rejected": -134.31613159179688, "step": 39550 }, { "epoch": 2.29, "grad_norm": 0.031312212347984314, "learning_rate": 0.00023739308796780063, "logits/chosen": -16.49297332763672, "logits/rejected": -16.716079711914062, "logps/chosen": -2804.3447265625, "logps/rejected": -2080.65625, "loss": 9.0841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -176.82374572753906, "rewards/margins": -0.7629092931747437, "rewards/rejected": -176.06082153320312, "step": 39560 }, { "epoch": 2.29, "grad_norm": 73.10034942626953, "learning_rate": 0.00023719958202716825, "logits/chosen": -17.25486183166504, "logits/rejected": -17.570232391357422, "logps/chosen": -2775.552001953125, "logps/rejected": -3094.775146484375, "loss": 4.4064, "rewards/accuracies": 0.5, "rewards/chosen": -176.66500854492188, "rewards/margins": -0.4783479571342468, "rewards/rejected": -176.1866455078125, "step": 39570 }, { "epoch": 2.29, "grad_norm": 0.0005072591593489051, "learning_rate": 0.00023700607608653586, "logits/chosen": -16.725027084350586, "logits/rejected": -19.02760887145996, "logps/chosen": -2838.869384765625, "logps/rejected": -2751.38623046875, "loss": 10.1742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.5630645751953, "rewards/margins": -1.0916106700897217, "rewards/rejected": -203.471435546875, "step": 39580 }, { "epoch": 2.29, "grad_norm": 42.50901412963867, "learning_rate": 0.00023681257014590348, "logits/chosen": -15.142718315124512, "logits/rejected": -15.592023849487305, "logps/chosen": -3169.642822265625, "logps/rejected": -2749.55322265625, "loss": 8.9374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.44369506835938, "rewards/margins": 1.1306641101837158, "rewards/rejected": -205.5743408203125, "step": 39590 }, { "epoch": 2.29, "grad_norm": 222.91993713378906, "learning_rate": 0.0002366190642052711, "logits/chosen": -17.513765335083008, "logits/rejected": -18.73705291748047, "logps/chosen": -3131.59619140625, "logps/rejected": -3058.77783203125, "loss": 13.2985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.0779571533203, "rewards/margins": -6.223835468292236, "rewards/rejected": -170.85414123535156, "step": 39600 }, { "epoch": 2.29, "grad_norm": 2.7272835723124444e-07, "learning_rate": 0.00023642555826463874, "logits/chosen": -15.246174812316895, "logits/rejected": -15.633996963500977, "logps/chosen": -2792.16650390625, "logps/rejected": -2815.74755859375, "loss": 4.4993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -135.10031127929688, "rewards/margins": 1.8884894847869873, "rewards/rejected": -136.98880004882812, "step": 39610 }, { "epoch": 2.29, "grad_norm": 0.0, "learning_rate": 0.00023623205232400636, "logits/chosen": -11.557501792907715, "logits/rejected": -11.791520118713379, "logps/chosen": -3894.49267578125, "logps/rejected": -3466.32763671875, "loss": 4.1887, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -91.33329010009766, "rewards/margins": 5.404139518737793, "rewards/rejected": -96.73744201660156, "step": 39620 }, { "epoch": 2.29, "grad_norm": 0.14756500720977783, "learning_rate": 0.00023603854638337398, "logits/chosen": -16.19045639038086, "logits/rejected": -16.805667877197266, "logps/chosen": -2609.6279296875, "logps/rejected": -2866.68994140625, "loss": 0.4712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -179.12518310546875, "rewards/margins": 17.969406127929688, "rewards/rejected": -197.09458923339844, "step": 39630 }, { "epoch": 2.29, "grad_norm": 0.10259648412466049, "learning_rate": 0.0002358450404427416, "logits/chosen": -18.798538208007812, "logits/rejected": -18.796123504638672, "logps/chosen": -3106.464111328125, "logps/rejected": -2790.296630859375, "loss": 36.107, "rewards/accuracies": 0.5, "rewards/chosen": -218.32192993164062, "rewards/margins": -32.951393127441406, "rewards/rejected": -185.3705291748047, "step": 39640 }, { "epoch": 2.3, "grad_norm": 1.4528750398312695e-05, "learning_rate": 0.0002356515345021092, "logits/chosen": -16.706806182861328, "logits/rejected": -16.881759643554688, "logps/chosen": -2900.20166015625, "logps/rejected": -2978.5654296875, "loss": 3.9817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.3776397705078, "rewards/margins": 1.7031433582305908, "rewards/rejected": -196.08078002929688, "step": 39650 }, { "epoch": 2.3, "grad_norm": 21.182207107543945, "learning_rate": 0.00023545802856147685, "logits/chosen": -15.702657699584961, "logits/rejected": -17.841493606567383, "logps/chosen": -3082.956298828125, "logps/rejected": -2822.33544921875, "loss": 9.1578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.0368194580078, "rewards/margins": 0.05770816653966904, "rewards/rejected": -205.0945281982422, "step": 39660 }, { "epoch": 2.3, "grad_norm": 1.1575315284062526e-06, "learning_rate": 0.00023526452262084447, "logits/chosen": -13.41088581085205, "logits/rejected": -13.471246719360352, "logps/chosen": -3354.624267578125, "logps/rejected": -3334.240966796875, "loss": 5.0407, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -112.08760070800781, "rewards/margins": 15.284390449523926, "rewards/rejected": -127.37198638916016, "step": 39670 }, { "epoch": 2.3, "grad_norm": 6.532321477356953e-15, "learning_rate": 0.0002350710166802121, "logits/chosen": -16.665674209594727, "logits/rejected": -19.396120071411133, "logps/chosen": -3028.224853515625, "logps/rejected": -2730.05419921875, "loss": 10.198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.553466796875, "rewards/margins": 0.08583106845617294, "rewards/rejected": -158.6392822265625, "step": 39680 }, { "epoch": 2.3, "grad_norm": 2.280762471330904e-16, "learning_rate": 0.00023487751073957973, "logits/chosen": -14.432649612426758, "logits/rejected": -15.063755989074707, "logps/chosen": -3315.73486328125, "logps/rejected": -3378.026611328125, "loss": 2.2542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.57554626464844, "rewards/margins": 12.114175796508789, "rewards/rejected": -197.6897430419922, "step": 39690 }, { "epoch": 2.3, "grad_norm": 34.40160369873047, "learning_rate": 0.00023468400479894732, "logits/chosen": -14.979898452758789, "logits/rejected": -16.436031341552734, "logps/chosen": -2842.751953125, "logps/rejected": -2867.89892578125, "loss": 1.184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -142.53311157226562, "rewards/margins": 12.68285083770752, "rewards/rejected": -155.21595764160156, "step": 39700 }, { "epoch": 2.3, "grad_norm": 0.2931186556816101, "learning_rate": 0.00023449049885831494, "logits/chosen": -16.874313354492188, "logits/rejected": -17.784870147705078, "logps/chosen": -3181.107666015625, "logps/rejected": -3183.74365234375, "loss": 2.2376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.09268188476562, "rewards/margins": 4.636094570159912, "rewards/rejected": -211.728759765625, "step": 39710 }, { "epoch": 2.3, "grad_norm": 105.10953521728516, "learning_rate": 0.00023429699291768258, "logits/chosen": -14.569000244140625, "logits/rejected": -14.731544494628906, "logps/chosen": -3229.75146484375, "logps/rejected": -2944.999755859375, "loss": 2.1344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.93472290039062, "rewards/margins": 1.9410651922225952, "rewards/rejected": -149.87579345703125, "step": 39720 }, { "epoch": 2.3, "grad_norm": 260.26458740234375, "learning_rate": 0.0002341034869770502, "logits/chosen": -13.059402465820312, "logits/rejected": -12.923965454101562, "logps/chosen": -3221.870361328125, "logps/rejected": -2740.01171875, "loss": 4.1343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.78690338134766, "rewards/margins": 10.391447067260742, "rewards/rejected": -120.1783447265625, "step": 39730 }, { "epoch": 2.3, "grad_norm": 1.3587267311265805e-08, "learning_rate": 0.00023390998103641785, "logits/chosen": -18.678003311157227, "logits/rejected": -16.840991973876953, "logps/chosen": -3168.07177734375, "logps/rejected": -3152.487548828125, "loss": 17.3074, "rewards/accuracies": 0.5, "rewards/chosen": -168.56085205078125, "rewards/margins": -9.185872077941895, "rewards/rejected": -159.37498474121094, "step": 39740 }, { "epoch": 2.3, "grad_norm": 0.0018164993962273002, "learning_rate": 0.00023371647509578544, "logits/chosen": -16.75286102294922, "logits/rejected": -16.51546859741211, "logps/chosen": -3197.3251953125, "logps/rejected": -3341.680419921875, "loss": 2.9069, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -242.40170288085938, "rewards/margins": 8.009799003601074, "rewards/rejected": -250.41152954101562, "step": 39750 }, { "epoch": 2.3, "grad_norm": 293.1281433105469, "learning_rate": 0.00023352296915515305, "logits/chosen": -17.958465576171875, "logits/rejected": -19.65927505493164, "logps/chosen": -2747.99755859375, "logps/rejected": -3057.88037109375, "loss": 3.4074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.4769287109375, "rewards/margins": 5.04589319229126, "rewards/rejected": -231.5228271484375, "step": 39760 }, { "epoch": 2.3, "grad_norm": 55.19324493408203, "learning_rate": 0.0002333294632145207, "logits/chosen": -15.17662525177002, "logits/rejected": -16.847461700439453, "logps/chosen": -3111.865478515625, "logps/rejected": -3164.39501953125, "loss": 0.3217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -193.7452850341797, "rewards/margins": 20.380878448486328, "rewards/rejected": -214.1261749267578, "step": 39770 }, { "epoch": 2.3, "grad_norm": 118.91934204101562, "learning_rate": 0.00023313595727388831, "logits/chosen": -18.245136260986328, "logits/rejected": -21.08700942993164, "logps/chosen": -2964.1533203125, "logps/rejected": -2968.602783203125, "loss": 11.2509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -239.0050048828125, "rewards/margins": -3.723174571990967, "rewards/rejected": -235.2818145751953, "step": 39780 }, { "epoch": 2.3, "grad_norm": 0.26606765389442444, "learning_rate": 0.00023294245133325593, "logits/chosen": -15.86597728729248, "logits/rejected": -16.587100982666016, "logps/chosen": -3158.531494140625, "logps/rejected": -2996.52490234375, "loss": 1.6311, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -136.74111938476562, "rewards/margins": 9.207077980041504, "rewards/rejected": -145.9481964111328, "step": 39790 }, { "epoch": 2.3, "grad_norm": 76.78132629394531, "learning_rate": 0.00023274894539262355, "logits/chosen": -17.39043426513672, "logits/rejected": -19.939498901367188, "logps/chosen": -3263.57177734375, "logps/rejected": -3288.78466796875, "loss": 3.3356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.52688598632812, "rewards/margins": 6.9789557456970215, "rewards/rejected": -191.50582885742188, "step": 39800 }, { "epoch": 2.3, "grad_norm": 1.589888700212141e-19, "learning_rate": 0.00023255543945199117, "logits/chosen": -19.893295288085938, "logits/rejected": -19.017410278320312, "logps/chosen": -2713.55322265625, "logps/rejected": -2749.4365234375, "loss": 2.3298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -160.9600067138672, "rewards/margins": 7.859847068786621, "rewards/rejected": -168.81985473632812, "step": 39810 }, { "epoch": 2.3, "grad_norm": 0.0001152217373601161, "learning_rate": 0.0002323619335113588, "logits/chosen": -19.36032485961914, "logits/rejected": -21.339908599853516, "logps/chosen": -2616.716552734375, "logps/rejected": -2906.808349609375, "loss": 7.0231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -198.30783081054688, "rewards/margins": 7.775348663330078, "rewards/rejected": -206.08316040039062, "step": 39820 }, { "epoch": 2.31, "grad_norm": 82.15790557861328, "learning_rate": 0.00023216842757072643, "logits/chosen": -14.969929695129395, "logits/rejected": -14.948160171508789, "logps/chosen": -3331.65185546875, "logps/rejected": -3235.714599609375, "loss": 3.824, "rewards/accuracies": 0.5, "rewards/chosen": -140.90695190429688, "rewards/margins": 2.1564152240753174, "rewards/rejected": -143.0633544921875, "step": 39830 }, { "epoch": 2.31, "grad_norm": 5.9236635934212245e-06, "learning_rate": 0.00023197492163009404, "logits/chosen": -16.34530258178711, "logits/rejected": -17.039173126220703, "logps/chosen": -3124.097412109375, "logps/rejected": -3120.421630859375, "loss": 4.3418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -211.1379852294922, "rewards/margins": 5.205541133880615, "rewards/rejected": -216.34353637695312, "step": 39840 }, { "epoch": 2.31, "grad_norm": 0.011497074738144875, "learning_rate": 0.0002317814156894617, "logits/chosen": -15.548291206359863, "logits/rejected": -15.793252944946289, "logps/chosen": -2895.01220703125, "logps/rejected": -2836.130859375, "loss": 1.1716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -178.43997192382812, "rewards/margins": 13.777432441711426, "rewards/rejected": -192.21742248535156, "step": 39850 }, { "epoch": 2.31, "grad_norm": 2.473776133719537e-16, "learning_rate": 0.00023158790974882928, "logits/chosen": -20.229944229125977, "logits/rejected": -19.945480346679688, "logps/chosen": -2939.60107421875, "logps/rejected": -3085.317138671875, "loss": 2.7568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.6188201904297, "rewards/margins": 13.08477783203125, "rewards/rejected": -202.70358276367188, "step": 39860 }, { "epoch": 2.31, "grad_norm": 2.0370985112094786e-06, "learning_rate": 0.00023139440380819692, "logits/chosen": -17.813858032226562, "logits/rejected": -16.779468536376953, "logps/chosen": -2611.87109375, "logps/rejected": -2707.78369140625, "loss": 1.1562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.9699249267578, "rewards/margins": 14.354449272155762, "rewards/rejected": -157.32437133789062, "step": 39870 }, { "epoch": 2.31, "grad_norm": 0.00045587585191242397, "learning_rate": 0.00023120089786756454, "logits/chosen": -17.314167022705078, "logits/rejected": -18.8978214263916, "logps/chosen": -3123.410400390625, "logps/rejected": -2795.250732421875, "loss": 13.1453, "rewards/accuracies": 0.5, "rewards/chosen": -212.5631866455078, "rewards/margins": -6.259319305419922, "rewards/rejected": -206.30386352539062, "step": 39880 }, { "epoch": 2.31, "grad_norm": 0.0004973485483787954, "learning_rate": 0.00023100739192693216, "logits/chosen": -14.16443157196045, "logits/rejected": -14.505208969116211, "logps/chosen": -3077.26416015625, "logps/rejected": -3138.64111328125, "loss": 2.3184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -121.942626953125, "rewards/margins": 5.532523155212402, "rewards/rejected": -127.47513580322266, "step": 39890 }, { "epoch": 2.31, "grad_norm": 1.2562006711959839, "learning_rate": 0.0002308138859862998, "logits/chosen": -16.31276512145996, "logits/rejected": -18.469165802001953, "logps/chosen": -3060.537841796875, "logps/rejected": -3013.5078125, "loss": 13.111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.4931182861328, "rewards/margins": -8.435769081115723, "rewards/rejected": -190.05734252929688, "step": 39900 }, { "epoch": 2.31, "grad_norm": 5.055075455451252e-11, "learning_rate": 0.0002306203800456674, "logits/chosen": -19.056360244750977, "logits/rejected": -20.41060447692871, "logps/chosen": -3007.1005859375, "logps/rejected": -3022.19482421875, "loss": 0.6211, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -262.9666748046875, "rewards/margins": 10.981346130371094, "rewards/rejected": -273.9480285644531, "step": 39910 }, { "epoch": 2.31, "grad_norm": 0.5660436153411865, "learning_rate": 0.000230426874105035, "logits/chosen": -17.36138916015625, "logits/rejected": -19.56064224243164, "logps/chosen": -3146.40234375, "logps/rejected": -2981.82275390625, "loss": 4.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -196.14849853515625, "rewards/margins": 4.026275634765625, "rewards/rejected": -200.17477416992188, "step": 39920 }, { "epoch": 2.31, "grad_norm": 0.00011328004620736465, "learning_rate": 0.00023023336816440265, "logits/chosen": -19.077838897705078, "logits/rejected": -18.69328498840332, "logps/chosen": -2704.064697265625, "logps/rejected": -2798.294921875, "loss": 5.485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -239.6980438232422, "rewards/margins": 6.1596574783325195, "rewards/rejected": -245.85769653320312, "step": 39930 }, { "epoch": 2.31, "grad_norm": 0.0003755353973247111, "learning_rate": 0.00023003986222377027, "logits/chosen": -16.58694839477539, "logits/rejected": -16.938945770263672, "logps/chosen": -2838.3076171875, "logps/rejected": -2822.576416015625, "loss": 4.3005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.39962768554688, "rewards/margins": 2.5581278800964355, "rewards/rejected": -216.957763671875, "step": 39940 }, { "epoch": 2.31, "grad_norm": 211.76687622070312, "learning_rate": 0.00022984635628313792, "logits/chosen": -15.177787780761719, "logits/rejected": -16.474857330322266, "logps/chosen": -2811.515625, "logps/rejected": -2776.849609375, "loss": 1.8926, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -187.68011474609375, "rewards/margins": 10.687138557434082, "rewards/rejected": -198.3672332763672, "step": 39950 }, { "epoch": 2.31, "grad_norm": 0.0005627276841551065, "learning_rate": 0.00022965285034250553, "logits/chosen": -19.645408630371094, "logits/rejected": -20.199893951416016, "logps/chosen": -2840.5107421875, "logps/rejected": -2771.553955078125, "loss": 0.44, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -215.0989532470703, "rewards/margins": 5.860983371734619, "rewards/rejected": -220.9599151611328, "step": 39960 }, { "epoch": 2.31, "grad_norm": 74.67970275878906, "learning_rate": 0.00022945934440187312, "logits/chosen": -14.346368789672852, "logits/rejected": -14.052373886108398, "logps/chosen": -3151.302490234375, "logps/rejected": -3265.275390625, "loss": 1.9358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -87.57536315917969, "rewards/margins": 4.436322212219238, "rewards/rejected": -92.01168823242188, "step": 39970 }, { "epoch": 2.31, "grad_norm": 2.2853399059385993e-05, "learning_rate": 0.00022926583846124077, "logits/chosen": -20.78999900817871, "logits/rejected": -21.839651107788086, "logps/chosen": -2698.961669921875, "logps/rejected": -2610.658447265625, "loss": 14.557, "rewards/accuracies": 0.5, "rewards/chosen": -249.6952362060547, "rewards/margins": -9.346824645996094, "rewards/rejected": -240.3484344482422, "step": 39980 }, { "epoch": 2.31, "grad_norm": 0.003609491977840662, "learning_rate": 0.00022907233252060838, "logits/chosen": -17.760456085205078, "logits/rejected": -17.858068466186523, "logps/chosen": -2710.802001953125, "logps/rejected": -2597.52490234375, "loss": 14.1253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -208.9216766357422, "rewards/margins": -10.74787712097168, "rewards/rejected": -198.17379760742188, "step": 39990 }, { "epoch": 2.32, "grad_norm": 4.9333843890053686e-06, "learning_rate": 0.00022887882657997603, "logits/chosen": -16.351106643676758, "logits/rejected": -18.406585693359375, "logps/chosen": -3056.015380859375, "logps/rejected": -2993.75830078125, "loss": 2.3425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -131.66848754882812, "rewards/margins": 7.443516731262207, "rewards/rejected": -139.11199951171875, "step": 40000 }, { "epoch": 2.32, "grad_norm": 26.94954490661621, "learning_rate": 0.00022868532063934365, "logits/chosen": -16.963634490966797, "logits/rejected": -17.734424591064453, "logps/chosen": -2849.740234375, "logps/rejected": -2612.328369140625, "loss": 2.3983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.62222290039062, "rewards/margins": 12.323564529418945, "rewards/rejected": -191.94581604003906, "step": 40010 }, { "epoch": 2.32, "grad_norm": 9.165154457092285, "learning_rate": 0.00022849181469871124, "logits/chosen": -19.550086975097656, "logits/rejected": -19.482364654541016, "logps/chosen": -2873.3642578125, "logps/rejected": -2630.636474609375, "loss": 5.2106, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -195.15200805664062, "rewards/margins": -3.269564151763916, "rewards/rejected": -191.88243103027344, "step": 40020 }, { "epoch": 2.32, "grad_norm": 89.65940856933594, "learning_rate": 0.00022829830875807888, "logits/chosen": -20.481369018554688, "logits/rejected": -20.015729904174805, "logps/chosen": -2856.2587890625, "logps/rejected": -2733.74658203125, "loss": 7.2109, "rewards/accuracies": 0.5, "rewards/chosen": -221.2667999267578, "rewards/margins": -0.8773528933525085, "rewards/rejected": -220.3894500732422, "step": 40030 }, { "epoch": 2.32, "grad_norm": 5.889860153198242, "learning_rate": 0.0002281048028174465, "logits/chosen": -15.253732681274414, "logits/rejected": -16.37787437438965, "logps/chosen": -3299.087890625, "logps/rejected": -3058.846923828125, "loss": 3.643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.53262329101562, "rewards/margins": 9.079830169677734, "rewards/rejected": -213.6124725341797, "step": 40040 }, { "epoch": 2.32, "grad_norm": 0.01882338337600231, "learning_rate": 0.00022791129687681411, "logits/chosen": -17.767671585083008, "logits/rejected": -18.5241641998291, "logps/chosen": -2908.263916015625, "logps/rejected": -2537.09130859375, "loss": 1.7771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.68215942382812, "rewards/margins": 6.1512885093688965, "rewards/rejected": -151.83346557617188, "step": 40050 }, { "epoch": 2.32, "grad_norm": 129.55026245117188, "learning_rate": 0.00022771779093618176, "logits/chosen": -17.405452728271484, "logits/rejected": -17.502193450927734, "logps/chosen": -2919.853515625, "logps/rejected": -2541.45556640625, "loss": 18.2393, "rewards/accuracies": 0.5, "rewards/chosen": -139.37686157226562, "rewards/margins": -9.409319877624512, "rewards/rejected": -129.96755981445312, "step": 40060 }, { "epoch": 2.32, "grad_norm": 1.0028576850891113, "learning_rate": 0.00022752428499554938, "logits/chosen": -16.6378231048584, "logits/rejected": -16.520030975341797, "logps/chosen": -2990.525146484375, "logps/rejected": -2997.108154296875, "loss": 4.704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -197.2658233642578, "rewards/margins": 5.245844841003418, "rewards/rejected": -202.5116424560547, "step": 40070 }, { "epoch": 2.32, "grad_norm": 8.914853231801703e-19, "learning_rate": 0.000227330779054917, "logits/chosen": -16.11050796508789, "logits/rejected": -16.86594581604004, "logps/chosen": -3366.462890625, "logps/rejected": -2905.405029296875, "loss": 1.6023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -194.92654418945312, "rewards/margins": 8.735467910766602, "rewards/rejected": -203.66201782226562, "step": 40080 }, { "epoch": 2.32, "grad_norm": 102.2341537475586, "learning_rate": 0.0002271372731142846, "logits/chosen": -17.301746368408203, "logits/rejected": -17.60495948791504, "logps/chosen": -3088.39208984375, "logps/rejected": -2538.05078125, "loss": 0.2766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -197.82574462890625, "rewards/margins": 17.1586971282959, "rewards/rejected": -214.9844512939453, "step": 40090 }, { "epoch": 2.32, "grad_norm": 6.212928771972656, "learning_rate": 0.00022694376717365223, "logits/chosen": -14.8303861618042, "logits/rejected": -15.598223686218262, "logps/chosen": -2718.15576171875, "logps/rejected": -2862.349609375, "loss": 6.3327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.23899841308594, "rewards/margins": -1.607343077659607, "rewards/rejected": -152.63165283203125, "step": 40100 }, { "epoch": 2.32, "grad_norm": 361.83428955078125, "learning_rate": 0.00022675026123301987, "logits/chosen": -16.734920501708984, "logits/rejected": -18.2274227142334, "logps/chosen": -2671.517578125, "logps/rejected": -2433.542236328125, "loss": 6.6969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -195.90396118164062, "rewards/margins": 1.4399089813232422, "rewards/rejected": -197.34384155273438, "step": 40110 }, { "epoch": 2.32, "grad_norm": 2.1844716684427112e-05, "learning_rate": 0.0002265567552923875, "logits/chosen": -14.151092529296875, "logits/rejected": -14.653818130493164, "logps/chosen": -2958.044677734375, "logps/rejected": -2592.319580078125, "loss": 4.1855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -95.5836410522461, "rewards/margins": 2.1964786052703857, "rewards/rejected": -97.78011322021484, "step": 40120 }, { "epoch": 2.32, "grad_norm": 1.920054946580538e-16, "learning_rate": 0.00022636324935175508, "logits/chosen": -15.751070976257324, "logits/rejected": -17.723743438720703, "logps/chosen": -3116.322998046875, "logps/rejected": -3168.06005859375, "loss": 1.3056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -197.481689453125, "rewards/margins": 19.555644989013672, "rewards/rejected": -217.037353515625, "step": 40130 }, { "epoch": 2.32, "grad_norm": 50.41615295410156, "learning_rate": 0.00022616974341112272, "logits/chosen": -13.621068954467773, "logits/rejected": -13.66932487487793, "logps/chosen": -3453.171142578125, "logps/rejected": -3430.785888671875, "loss": 3.8512, "rewards/accuracies": 0.5, "rewards/chosen": -28.782352447509766, "rewards/margins": 1.0732910633087158, "rewards/rejected": -29.855642318725586, "step": 40140 }, { "epoch": 2.32, "grad_norm": 32.947486877441406, "learning_rate": 0.00022597623747049034, "logits/chosen": -18.90250015258789, "logits/rejected": -18.781057357788086, "logps/chosen": -2973.10498046875, "logps/rejected": -2870.415283203125, "loss": 2.6505, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -232.1444549560547, "rewards/margins": -0.6876245737075806, "rewards/rejected": -231.4568328857422, "step": 40150 }, { "epoch": 2.32, "grad_norm": 2.152755200768297e-07, "learning_rate": 0.00022578273152985798, "logits/chosen": -12.51911735534668, "logits/rejected": -13.340845108032227, "logps/chosen": -3418.08251953125, "logps/rejected": -3294.845703125, "loss": 2.1278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -69.19258117675781, "rewards/margins": 11.664689064025879, "rewards/rejected": -80.85726928710938, "step": 40160 }, { "epoch": 2.33, "grad_norm": 0.0, "learning_rate": 0.0002255892255892256, "logits/chosen": -14.499041557312012, "logits/rejected": -15.315633773803711, "logps/chosen": -2620.053466796875, "logps/rejected": -2832.974609375, "loss": 1.7193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -98.73506164550781, "rewards/margins": 21.5599422454834, "rewards/rejected": -120.29500579833984, "step": 40170 }, { "epoch": 2.33, "grad_norm": 3.651925289949176e-11, "learning_rate": 0.00022539571964859322, "logits/chosen": -14.26489543914795, "logits/rejected": -15.188013076782227, "logps/chosen": -2770.27294921875, "logps/rejected": -2798.594482421875, "loss": 0.1921, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -136.00404357910156, "rewards/margins": 17.679365158081055, "rewards/rejected": -153.6834259033203, "step": 40180 }, { "epoch": 2.33, "grad_norm": 85.85981750488281, "learning_rate": 0.00022520221370796084, "logits/chosen": -17.149662017822266, "logits/rejected": -19.25979995727539, "logps/chosen": -2766.64599609375, "logps/rejected": -2762.763671875, "loss": 0.1982, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -149.67401123046875, "rewards/margins": 28.700817108154297, "rewards/rejected": -178.37481689453125, "step": 40190 }, { "epoch": 2.33, "grad_norm": 0.0005556250107474625, "learning_rate": 0.00022500870776732845, "logits/chosen": -18.461450576782227, "logits/rejected": -20.388877868652344, "logps/chosen": -2848.208984375, "logps/rejected": -2777.918212890625, "loss": 1.6425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.85256958007812, "rewards/margins": 20.151273727416992, "rewards/rejected": -235.0038604736328, "step": 40200 }, { "epoch": 2.33, "grad_norm": 5.446108407997086e-12, "learning_rate": 0.0002248152018266961, "logits/chosen": -18.108898162841797, "logits/rejected": -20.881088256835938, "logps/chosen": -2755.546142578125, "logps/rejected": -2825.70263671875, "loss": 1.3703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.76541137695312, "rewards/margins": 9.07444953918457, "rewards/rejected": -182.83985900878906, "step": 40210 }, { "epoch": 2.33, "grad_norm": 1.2615943845162292e-08, "learning_rate": 0.00022462169588606371, "logits/chosen": -17.475177764892578, "logits/rejected": -19.391742706298828, "logps/chosen": -2828.46337890625, "logps/rejected": -3002.36181640625, "loss": 4.4494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -183.53799438476562, "rewards/margins": 13.499844551086426, "rewards/rejected": -197.037841796875, "step": 40220 }, { "epoch": 2.33, "grad_norm": 0.0001677974360063672, "learning_rate": 0.00022442818994543133, "logits/chosen": -17.50876235961914, "logits/rejected": -18.2436580657959, "logps/chosen": -2984.51123046875, "logps/rejected": -2970.96826171875, "loss": 6.9835, "rewards/accuracies": 0.5, "rewards/chosen": -201.77365112304688, "rewards/margins": -0.8709793090820312, "rewards/rejected": -200.90264892578125, "step": 40230 }, { "epoch": 2.33, "grad_norm": 54.0831184387207, "learning_rate": 0.00022423468400479895, "logits/chosen": -17.81661033630371, "logits/rejected": -19.038522720336914, "logps/chosen": -2925.468994140625, "logps/rejected": -2469.85888671875, "loss": 12.6192, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -199.24383544921875, "rewards/margins": -8.336636543273926, "rewards/rejected": -190.9071807861328, "step": 40240 }, { "epoch": 2.33, "grad_norm": 86.84198760986328, "learning_rate": 0.00022404117806416657, "logits/chosen": -17.005720138549805, "logits/rejected": -16.68960952758789, "logps/chosen": -2718.23876953125, "logps/rejected": -2855.616455078125, "loss": 2.2264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -143.62994384765625, "rewards/margins": 8.307710647583008, "rewards/rejected": -151.93765258789062, "step": 40250 }, { "epoch": 2.33, "grad_norm": 2.6272235942315092e-08, "learning_rate": 0.00022384767212353418, "logits/chosen": -19.040508270263672, "logits/rejected": -22.66115951538086, "logps/chosen": -2638.30029296875, "logps/rejected": -2727.9501953125, "loss": 7.0748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.85348510742188, "rewards/margins": 18.958791732788086, "rewards/rejected": -175.81227111816406, "step": 40260 }, { "epoch": 2.33, "grad_norm": 1.2279894878730602e-09, "learning_rate": 0.00022365416618290183, "logits/chosen": -17.325454711914062, "logits/rejected": -18.634281158447266, "logps/chosen": -2897.843017578125, "logps/rejected": -2837.052734375, "loss": 2.7626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.10147094726562, "rewards/margins": 8.709617614746094, "rewards/rejected": -149.81106567382812, "step": 40270 }, { "epoch": 2.33, "grad_norm": 0.0002729725674726069, "learning_rate": 0.00022346066024226944, "logits/chosen": -19.433277130126953, "logits/rejected": -23.623319625854492, "logps/chosen": -2744.91943359375, "logps/rejected": -2856.601806640625, "loss": 1.1408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -222.30081176757812, "rewards/margins": 14.0189790725708, "rewards/rejected": -236.31979370117188, "step": 40280 }, { "epoch": 2.33, "grad_norm": 1.199127837026026e-07, "learning_rate": 0.0002232671543016371, "logits/chosen": -18.751407623291016, "logits/rejected": -22.225727081298828, "logps/chosen": -3170.67138671875, "logps/rejected": -2899.623779296875, "loss": 2.3824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.7767791748047, "rewards/margins": 11.85589599609375, "rewards/rejected": -181.63265991210938, "step": 40290 }, { "epoch": 2.33, "grad_norm": 3.344793640280841e-06, "learning_rate": 0.00022307364836100468, "logits/chosen": -21.583948135375977, "logits/rejected": -23.379907608032227, "logps/chosen": -2914.51416015625, "logps/rejected": -2789.48828125, "loss": 0.7721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -229.46286010742188, "rewards/margins": 11.093450546264648, "rewards/rejected": -240.5563201904297, "step": 40300 }, { "epoch": 2.33, "grad_norm": 105.0985336303711, "learning_rate": 0.0002228801424203723, "logits/chosen": -16.61543846130371, "logits/rejected": -17.38194465637207, "logps/chosen": -2300.416748046875, "logps/rejected": -2049.011474609375, "loss": 6.7848, "rewards/accuracies": 0.5, "rewards/chosen": -80.5030746459961, "rewards/margins": 11.403828620910645, "rewards/rejected": -91.90690612792969, "step": 40310 }, { "epoch": 2.33, "grad_norm": 6.703853014392123e-18, "learning_rate": 0.00022268663647973994, "logits/chosen": -19.825437545776367, "logits/rejected": -19.822154998779297, "logps/chosen": -2868.566162109375, "logps/rejected": -2810.62841796875, "loss": 3.9258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -208.9706573486328, "rewards/margins": 11.267454147338867, "rewards/rejected": -220.23812866210938, "step": 40320 }, { "epoch": 2.33, "grad_norm": 1.02262266546188e-15, "learning_rate": 0.00022249313053910756, "logits/chosen": -14.058491706848145, "logits/rejected": -14.708663940429688, "logps/chosen": -3120.98291015625, "logps/rejected": -2966.683837890625, "loss": 6.1195, "rewards/accuracies": 0.5, "rewards/chosen": -69.69664001464844, "rewards/margins": 6.813855171203613, "rewards/rejected": -76.510498046875, "step": 40330 }, { "epoch": 2.34, "grad_norm": 0.000509298755787313, "learning_rate": 0.00022229962459847518, "logits/chosen": -16.738327026367188, "logits/rejected": -18.19580078125, "logps/chosen": -3306.303955078125, "logps/rejected": -3070.849853515625, "loss": 2.0534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -109.78348541259766, "rewards/margins": 7.9989118576049805, "rewards/rejected": -117.78238677978516, "step": 40340 }, { "epoch": 2.34, "grad_norm": 59.2848014831543, "learning_rate": 0.0002221061186578428, "logits/chosen": -18.065998077392578, "logits/rejected": -19.332550048828125, "logps/chosen": -2978.893310546875, "logps/rejected": -2922.93798828125, "loss": 1.2044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -158.31942749023438, "rewards/margins": 8.75040054321289, "rewards/rejected": -167.06983947753906, "step": 40350 }, { "epoch": 2.34, "grad_norm": 53.36738586425781, "learning_rate": 0.0002219126127172104, "logits/chosen": -20.366010665893555, "logits/rejected": -20.314678192138672, "logps/chosen": -2581.99755859375, "logps/rejected": -2425.47607421875, "loss": 3.9883, "rewards/accuracies": 0.5, "rewards/chosen": -170.25979614257812, "rewards/margins": -0.29902711510658264, "rewards/rejected": -169.96075439453125, "step": 40360 }, { "epoch": 2.34, "grad_norm": 7.831454240658786e-08, "learning_rate": 0.00022171910677657805, "logits/chosen": -18.306684494018555, "logits/rejected": -19.106529235839844, "logps/chosen": -3009.25732421875, "logps/rejected": -3000.166259765625, "loss": 0.8427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -221.74856567382812, "rewards/margins": 11.199949264526367, "rewards/rejected": -232.94851684570312, "step": 40370 }, { "epoch": 2.34, "grad_norm": 51.474464416503906, "learning_rate": 0.00022152560083594567, "logits/chosen": -15.563783645629883, "logits/rejected": -16.307125091552734, "logps/chosen": -3090.622802734375, "logps/rejected": -2658.701171875, "loss": 3.0741, "rewards/accuracies": 0.5, "rewards/chosen": -157.2804718017578, "rewards/margins": 9.364568710327148, "rewards/rejected": -166.64503479003906, "step": 40380 }, { "epoch": 2.34, "grad_norm": 112.10197448730469, "learning_rate": 0.0002213320948953133, "logits/chosen": -17.01676368713379, "logits/rejected": -18.696250915527344, "logps/chosen": -2445.96337890625, "logps/rejected": -2734.027587890625, "loss": 4.6657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -160.706298828125, "rewards/margins": 4.0588765144348145, "rewards/rejected": -164.76516723632812, "step": 40390 }, { "epoch": 2.34, "grad_norm": 1.6903986761462875e-06, "learning_rate": 0.00022113858895468093, "logits/chosen": -15.804885864257812, "logits/rejected": -15.76930046081543, "logps/chosen": -2570.63037109375, "logps/rejected": -2864.00390625, "loss": 4.3219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -99.49571228027344, "rewards/margins": 20.57514190673828, "rewards/rejected": -120.07086181640625, "step": 40400 }, { "epoch": 2.34, "grad_norm": 1.3938150633663327e-09, "learning_rate": 0.00022094508301404852, "logits/chosen": -17.545495986938477, "logits/rejected": -20.445842742919922, "logps/chosen": -2522.45703125, "logps/rejected": -2479.055419921875, "loss": 0.81, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.14697265625, "rewards/margins": 14.966565132141113, "rewards/rejected": -213.113525390625, "step": 40410 }, { "epoch": 2.34, "grad_norm": 48.61238098144531, "learning_rate": 0.00022075157707341617, "logits/chosen": -17.516206741333008, "logits/rejected": -16.766632080078125, "logps/chosen": -2743.5224609375, "logps/rejected": -2721.74951171875, "loss": 3.2988, "rewards/accuracies": 0.5, "rewards/chosen": -202.11959838867188, "rewards/margins": 0.45604246854782104, "rewards/rejected": -202.57565307617188, "step": 40420 }, { "epoch": 2.34, "grad_norm": 97.21720123291016, "learning_rate": 0.00022055807113278378, "logits/chosen": -18.07916259765625, "logits/rejected": -16.966257095336914, "logps/chosen": -2681.614501953125, "logps/rejected": -2609.36865234375, "loss": 3.8807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.4918670654297, "rewards/margins": 19.734615325927734, "rewards/rejected": -180.22647094726562, "step": 40430 }, { "epoch": 2.34, "grad_norm": 2.1207668510214717e-08, "learning_rate": 0.0002203645651921514, "logits/chosen": -16.847919464111328, "logits/rejected": -18.26017189025879, "logps/chosen": -2714.171875, "logps/rejected": -2386.78759765625, "loss": 1.6951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -109.42264556884766, "rewards/margins": 9.19178295135498, "rewards/rejected": -118.61442565917969, "step": 40440 }, { "epoch": 2.34, "grad_norm": 0.016020530834794044, "learning_rate": 0.00022017105925151905, "logits/chosen": -18.241336822509766, "logits/rejected": -19.397336959838867, "logps/chosen": -2755.3173828125, "logps/rejected": -2647.48291015625, "loss": 5.1277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.81588745117188, "rewards/margins": 1.1613128185272217, "rewards/rejected": -162.97720336914062, "step": 40450 }, { "epoch": 2.34, "grad_norm": 0.09150157123804092, "learning_rate": 0.00021997755331088664, "logits/chosen": -16.0206356048584, "logits/rejected": -15.86412239074707, "logps/chosen": -3256.512451171875, "logps/rejected": -3175.962158203125, "loss": 1.8277, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -138.80616760253906, "rewards/margins": 10.133635520935059, "rewards/rejected": -148.93980407714844, "step": 40460 }, { "epoch": 2.34, "grad_norm": 55.29273986816406, "learning_rate": 0.00021978404737025425, "logits/chosen": -15.335820198059082, "logits/rejected": -15.376843452453613, "logps/chosen": -3332.371826171875, "logps/rejected": -2853.50927734375, "loss": 2.9943, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.43763732910156, "rewards/margins": 5.983911514282227, "rewards/rejected": -123.42155456542969, "step": 40470 }, { "epoch": 2.34, "grad_norm": 2.126251175127436e-08, "learning_rate": 0.0002195905414296219, "logits/chosen": -22.069122314453125, "logits/rejected": -24.262958526611328, "logps/chosen": -2639.89208984375, "logps/rejected": -2573.12939453125, "loss": 8.1005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -231.11679077148438, "rewards/margins": 3.5803794860839844, "rewards/rejected": -234.6971435546875, "step": 40480 }, { "epoch": 2.34, "grad_norm": 11.469526290893555, "learning_rate": 0.00021939703548898951, "logits/chosen": -18.37997055053711, "logits/rejected": -18.612789154052734, "logps/chosen": -2785.264892578125, "logps/rejected": -2582.68212890625, "loss": 1.8344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.69772338867188, "rewards/margins": 11.039396286010742, "rewards/rejected": -143.73712158203125, "step": 40490 }, { "epoch": 2.34, "grad_norm": 3.651210499810986e-05, "learning_rate": 0.00021920352954835716, "logits/chosen": -22.513643264770508, "logits/rejected": -22.985462188720703, "logps/chosen": -3014.7578125, "logps/rejected": -3094.78173828125, "loss": 5.7705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -237.7555389404297, "rewards/margins": 7.496058464050293, "rewards/rejected": -245.2516326904297, "step": 40500 }, { "epoch": 2.34, "grad_norm": 13.526840209960938, "learning_rate": 0.00021901002360772475, "logits/chosen": -15.899142265319824, "logits/rejected": -15.804484367370605, "logps/chosen": -2718.23974609375, "logps/rejected": -2771.06787109375, "loss": 4.3607, "rewards/accuracies": 0.5, "rewards/chosen": -123.53878021240234, "rewards/margins": 6.726564884185791, "rewards/rejected": -130.26535034179688, "step": 40510 }, { "epoch": 2.35, "grad_norm": 23.644914627075195, "learning_rate": 0.00021881651766709237, "logits/chosen": -20.329635620117188, "logits/rejected": -25.199098587036133, "logps/chosen": -2708.43408203125, "logps/rejected": -2778.882568359375, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -186.16043090820312, "rewards/margins": 15.109052658081055, "rewards/rejected": -201.26950073242188, "step": 40520 }, { "epoch": 2.35, "grad_norm": 7.689166571827526e-18, "learning_rate": 0.00021862301172646, "logits/chosen": -19.19161033630371, "logits/rejected": -18.25676155090332, "logps/chosen": -2921.194580078125, "logps/rejected": -2770.54931640625, "loss": 6.651, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.89620971679688, "rewards/margins": -0.7668342590332031, "rewards/rejected": -173.12937927246094, "step": 40530 }, { "epoch": 2.35, "grad_norm": 1.3456398248672485, "learning_rate": 0.00021842950578582763, "logits/chosen": -17.344228744506836, "logits/rejected": -17.849445343017578, "logps/chosen": -2830.01123046875, "logps/rejected": -2886.012451171875, "loss": 4.8588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.32632446289062, "rewards/margins": 2.5114219188690186, "rewards/rejected": -152.83775329589844, "step": 40540 }, { "epoch": 2.35, "grad_norm": 0.0, "learning_rate": 0.00021823599984519527, "logits/chosen": -20.354894638061523, "logits/rejected": -20.705501556396484, "logps/chosen": -3018.35986328125, "logps/rejected": -3113.22509765625, "loss": 5.7203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.239013671875, "rewards/margins": 13.779756546020508, "rewards/rejected": -204.01876831054688, "step": 40550 }, { "epoch": 2.35, "grad_norm": 1.7870658769523866e-15, "learning_rate": 0.0002180424939045629, "logits/chosen": -18.57024383544922, "logits/rejected": -19.124176025390625, "logps/chosen": -2793.897705078125, "logps/rejected": -2571.987060546875, "loss": 11.7978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -108.98347473144531, "rewards/margins": -2.9675283432006836, "rewards/rejected": -106.01595306396484, "step": 40560 }, { "epoch": 2.35, "grad_norm": 11.542793273925781, "learning_rate": 0.00021784898796393048, "logits/chosen": -23.40268898010254, "logits/rejected": -23.556560516357422, "logps/chosen": -2667.210205078125, "logps/rejected": -2680.69580078125, "loss": 3.1592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -206.25613403320312, "rewards/margins": 1.882619857788086, "rewards/rejected": -208.1387481689453, "step": 40570 }, { "epoch": 2.35, "grad_norm": 7.69794818167302e-12, "learning_rate": 0.00021765548202329812, "logits/chosen": -17.751976013183594, "logits/rejected": -18.645980834960938, "logps/chosen": -2736.481201171875, "logps/rejected": -2763.469970703125, "loss": 2.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.5240936279297, "rewards/margins": 7.240691184997559, "rewards/rejected": -155.76478576660156, "step": 40580 }, { "epoch": 2.35, "grad_norm": 0.020626110956072807, "learning_rate": 0.00021746197608266574, "logits/chosen": -17.45041275024414, "logits/rejected": -17.30083465576172, "logps/chosen": -2969.175048828125, "logps/rejected": -3090.61083984375, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": -154.5945587158203, "rewards/margins": 5.838309288024902, "rewards/rejected": -160.43283081054688, "step": 40590 }, { "epoch": 2.35, "grad_norm": 0.7628733515739441, "learning_rate": 0.00021726847014203336, "logits/chosen": -21.777099609375, "logits/rejected": -21.547443389892578, "logps/chosen": -2608.60205078125, "logps/rejected": -2632.27392578125, "loss": 1.689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.5823516845703, "rewards/margins": 3.796464443206787, "rewards/rejected": -197.37881469726562, "step": 40600 }, { "epoch": 2.35, "grad_norm": 45.303810119628906, "learning_rate": 0.000217074964201401, "logits/chosen": -22.177114486694336, "logits/rejected": -23.92868423461914, "logps/chosen": -2999.60400390625, "logps/rejected": -2688.24462890625, "loss": 14.9388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -225.0615692138672, "rewards/margins": -11.660921096801758, "rewards/rejected": -213.400634765625, "step": 40610 }, { "epoch": 2.35, "grad_norm": 33.995155334472656, "learning_rate": 0.0002168814582607686, "logits/chosen": -18.05834197998047, "logits/rejected": -18.697078704833984, "logps/chosen": -2852.45849609375, "logps/rejected": -2705.26416015625, "loss": 13.2957, "rewards/accuracies": 0.5, "rewards/chosen": -108.40983581542969, "rewards/margins": -2.9112160205841064, "rewards/rejected": -105.4986343383789, "step": 40620 }, { "epoch": 2.35, "grad_norm": 71.29209899902344, "learning_rate": 0.00021668795232013624, "logits/chosen": -19.468822479248047, "logits/rejected": -19.863439559936523, "logps/chosen": -3074.287841796875, "logps/rejected": -3022.819580078125, "loss": 3.2782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.9635467529297, "rewards/margins": 1.5609420537948608, "rewards/rejected": -167.5244903564453, "step": 40630 }, { "epoch": 2.35, "grad_norm": 4.7974895167612885e-09, "learning_rate": 0.00021649444637950385, "logits/chosen": -21.185691833496094, "logits/rejected": -22.919248580932617, "logps/chosen": -2962.58642578125, "logps/rejected": -3081.914794921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -221.7544708251953, "rewards/margins": 17.265155792236328, "rewards/rejected": -239.0196075439453, "step": 40640 }, { "epoch": 2.35, "grad_norm": 7.402605842798948e-05, "learning_rate": 0.00021630094043887147, "logits/chosen": -18.03628158569336, "logits/rejected": -20.96870994567871, "logps/chosen": -3022.860595703125, "logps/rejected": -2680.869140625, "loss": 16.2666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -156.61328125, "rewards/margins": -5.06403112411499, "rewards/rejected": -151.5492401123047, "step": 40650 }, { "epoch": 2.35, "grad_norm": 3.7379443290630476e-11, "learning_rate": 0.00021610743449823911, "logits/chosen": -16.41058349609375, "logits/rejected": -18.165685653686523, "logps/chosen": -3368.438720703125, "logps/rejected": -3153.173583984375, "loss": 5.0452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.5452651977539, "rewards/margins": 9.128568649291992, "rewards/rejected": -124.67384338378906, "step": 40660 }, { "epoch": 2.35, "grad_norm": 79.0977554321289, "learning_rate": 0.00021591392855760673, "logits/chosen": -21.968259811401367, "logits/rejected": -24.920682907104492, "logps/chosen": -2890.23681640625, "logps/rejected": -2840.98193359375, "loss": 2.9073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -249.1134033203125, "rewards/margins": 10.626704216003418, "rewards/rejected": -259.74005126953125, "step": 40670 }, { "epoch": 2.35, "grad_norm": 5.493875026702881, "learning_rate": 0.00021572042261697432, "logits/chosen": -18.301040649414062, "logits/rejected": -19.35032081604004, "logps/chosen": -2856.648681640625, "logps/rejected": -2569.391845703125, "loss": 0.8805, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -152.8546905517578, "rewards/margins": 7.022839546203613, "rewards/rejected": -159.87753295898438, "step": 40680 }, { "epoch": 2.36, "grad_norm": 9.60448932647705, "learning_rate": 0.00021552691667634197, "logits/chosen": -20.81790542602539, "logits/rejected": -21.86881446838379, "logps/chosen": -3182.80419921875, "logps/rejected": -2851.53173828125, "loss": 4.1048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.64212036132812, "rewards/margins": 5.274203300476074, "rewards/rejected": -212.9163055419922, "step": 40690 }, { "epoch": 2.36, "grad_norm": 1.446874875909998e-06, "learning_rate": 0.00021533341073570958, "logits/chosen": -20.913930892944336, "logits/rejected": -23.184232711791992, "logps/chosen": -2809.43701171875, "logps/rejected": -2953.84716796875, "loss": 2.0473, "rewards/accuracies": 0.5, "rewards/chosen": -200.07920837402344, "rewards/margins": 2.5632941722869873, "rewards/rejected": -202.64248657226562, "step": 40700 }, { "epoch": 2.36, "grad_norm": 3.411250393270393e-09, "learning_rate": 0.00021513990479507723, "logits/chosen": -18.782197952270508, "logits/rejected": -19.626996994018555, "logps/chosen": -2867.1845703125, "logps/rejected": -2713.7421875, "loss": 3.1638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -186.60826110839844, "rewards/margins": 6.225438117980957, "rewards/rejected": -192.83370971679688, "step": 40710 }, { "epoch": 2.36, "grad_norm": 0.0009683073731139302, "learning_rate": 0.00021494639885444485, "logits/chosen": -17.310810089111328, "logits/rejected": -18.591304779052734, "logps/chosen": -2793.628173828125, "logps/rejected": -2619.005859375, "loss": 0.6614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.2942352294922, "rewards/margins": 20.979093551635742, "rewards/rejected": -193.2733154296875, "step": 40720 }, { "epoch": 2.36, "grad_norm": 23.60474967956543, "learning_rate": 0.00021475289291381244, "logits/chosen": -18.4411678314209, "logits/rejected": -18.766223907470703, "logps/chosen": -2739.987548828125, "logps/rejected": -2654.66015625, "loss": 0.5186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -122.41133880615234, "rewards/margins": 15.475872993469238, "rewards/rejected": -137.88720703125, "step": 40730 }, { "epoch": 2.36, "grad_norm": 75.27416229248047, "learning_rate": 0.00021455938697318008, "logits/chosen": -19.765033721923828, "logits/rejected": -19.542505264282227, "logps/chosen": -2722.911376953125, "logps/rejected": -3000.938720703125, "loss": 1.2032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.05331420898438, "rewards/margins": 22.20484161376953, "rewards/rejected": -162.2581329345703, "step": 40740 }, { "epoch": 2.36, "grad_norm": 74.29576110839844, "learning_rate": 0.0002143658810325477, "logits/chosen": -17.801513671875, "logits/rejected": -17.77121353149414, "logps/chosen": -2967.5986328125, "logps/rejected": -3303.069091796875, "loss": 2.4562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.58392333984375, "rewards/margins": 9.503291130065918, "rewards/rejected": -184.0872039794922, "step": 40750 }, { "epoch": 2.36, "grad_norm": 0.00981154479086399, "learning_rate": 0.00021417237509191534, "logits/chosen": -20.876358032226562, "logits/rejected": -23.365032196044922, "logps/chosen": -3118.106201171875, "logps/rejected": -3136.899169921875, "loss": 3.8434, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -203.65951538085938, "rewards/margins": 2.0518667697906494, "rewards/rejected": -205.7113800048828, "step": 40760 }, { "epoch": 2.36, "grad_norm": 96.33906555175781, "learning_rate": 0.00021397886915128296, "logits/chosen": -17.357173919677734, "logits/rejected": -17.598651885986328, "logps/chosen": -2975.551025390625, "logps/rejected": -3072.21630859375, "loss": 2.6733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.9290008544922, "rewards/margins": 9.648271560668945, "rewards/rejected": -165.57728576660156, "step": 40770 }, { "epoch": 2.36, "grad_norm": 43.91044998168945, "learning_rate": 0.00021378536321065058, "logits/chosen": -20.557098388671875, "logits/rejected": -21.21797752380371, "logps/chosen": -2922.9248046875, "logps/rejected": -2861.05859375, "loss": 2.5256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.9500274658203, "rewards/margins": 2.410736322402954, "rewards/rejected": -227.3607635498047, "step": 40780 }, { "epoch": 2.36, "grad_norm": 18.681995391845703, "learning_rate": 0.0002135918572700182, "logits/chosen": -17.649105072021484, "logits/rejected": -18.530773162841797, "logps/chosen": -2776.673828125, "logps/rejected": -2829.248779296875, "loss": 4.6737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.80276489257812, "rewards/margins": 4.4284443855285645, "rewards/rejected": -184.23121643066406, "step": 40790 }, { "epoch": 2.36, "grad_norm": 3.277855853411893e-08, "learning_rate": 0.0002133983513293858, "logits/chosen": -17.357730865478516, "logits/rejected": -17.52522850036621, "logps/chosen": -3103.20458984375, "logps/rejected": -2881.184814453125, "loss": 6.3001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -213.35946655273438, "rewards/margins": 4.159063816070557, "rewards/rejected": -217.51852416992188, "step": 40800 }, { "epoch": 2.36, "grad_norm": 153.564453125, "learning_rate": 0.00021320484538875343, "logits/chosen": -16.22634506225586, "logits/rejected": -16.881702423095703, "logps/chosen": -3292.41162109375, "logps/rejected": -2995.93896484375, "loss": 13.5913, "rewards/accuracies": 0.5, "rewards/chosen": -137.31285095214844, "rewards/margins": -3.2127578258514404, "rewards/rejected": -134.10009765625, "step": 40810 }, { "epoch": 2.36, "grad_norm": 82.11215209960938, "learning_rate": 0.00021301133944812107, "logits/chosen": -16.609485626220703, "logits/rejected": -17.671741485595703, "logps/chosen": -2976.076171875, "logps/rejected": -2942.497314453125, "loss": 9.2685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.68878173828125, "rewards/margins": 3.019347667694092, "rewards/rejected": -156.70814514160156, "step": 40820 }, { "epoch": 2.36, "grad_norm": 6.773095674811672e-15, "learning_rate": 0.0002128178335074887, "logits/chosen": -16.008230209350586, "logits/rejected": -15.763354301452637, "logps/chosen": -2861.852294921875, "logps/rejected": -3009.7236328125, "loss": 5.9538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -116.05427551269531, "rewards/margins": 8.167095184326172, "rewards/rejected": -124.22135925292969, "step": 40830 }, { "epoch": 2.36, "grad_norm": 9.433856964111328, "learning_rate": 0.0002126243275668563, "logits/chosen": -17.138254165649414, "logits/rejected": -16.955883026123047, "logps/chosen": -2874.169677734375, "logps/rejected": -2829.958251953125, "loss": 2.4236, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.64828491210938, "rewards/margins": 0.9996105432510376, "rewards/rejected": -164.6478729248047, "step": 40840 }, { "epoch": 2.36, "grad_norm": 0.1090799942612648, "learning_rate": 0.00021243082162622392, "logits/chosen": -17.144121170043945, "logits/rejected": -17.467021942138672, "logps/chosen": -2962.40966796875, "logps/rejected": -2382.155029296875, "loss": 2.3206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -193.6422119140625, "rewards/margins": 9.352925300598145, "rewards/rejected": -202.99514770507812, "step": 40850 }, { "epoch": 2.37, "grad_norm": 0.9604871869087219, "learning_rate": 0.00021223731568559154, "logits/chosen": -17.563098907470703, "logits/rejected": -19.788433074951172, "logps/chosen": -2868.89208984375, "logps/rejected": -2953.736328125, "loss": 4.1864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -178.31686401367188, "rewards/margins": 11.170236587524414, "rewards/rejected": -189.48709106445312, "step": 40860 }, { "epoch": 2.37, "grad_norm": 0.0657319501042366, "learning_rate": 0.00021204380974495918, "logits/chosen": -20.9326171875, "logits/rejected": -23.29247283935547, "logps/chosen": -2741.173095703125, "logps/rejected": -2602.65576171875, "loss": 15.0271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -209.3392791748047, "rewards/margins": -6.750892639160156, "rewards/rejected": -202.58839416503906, "step": 40870 }, { "epoch": 2.37, "grad_norm": 0.07592016458511353, "learning_rate": 0.0002118503038043268, "logits/chosen": -17.8515567779541, "logits/rejected": -20.62816047668457, "logps/chosen": -2404.51025390625, "logps/rejected": -2369.58349609375, "loss": 14.7491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.30136108398438, "rewards/margins": -3.1643385887145996, "rewards/rejected": -160.13702392578125, "step": 40880 }, { "epoch": 2.37, "grad_norm": 86.3030014038086, "learning_rate": 0.00021165679786369445, "logits/chosen": -17.532148361206055, "logits/rejected": -19.17415428161621, "logps/chosen": -2811.78515625, "logps/rejected": -2814.43115234375, "loss": 1.1913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.24832153320312, "rewards/margins": 10.657491683959961, "rewards/rejected": -182.9058074951172, "step": 40890 }, { "epoch": 2.37, "grad_norm": 2.3609700292581692e-06, "learning_rate": 0.00021146329192306204, "logits/chosen": -16.358612060546875, "logits/rejected": -17.32851791381836, "logps/chosen": -2850.365966796875, "logps/rejected": -2706.859619140625, "loss": 2.7075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.05470275878906, "rewards/margins": 4.196542739868164, "rewards/rejected": -180.25125122070312, "step": 40900 }, { "epoch": 2.37, "grad_norm": 5.700268957298249e-06, "learning_rate": 0.00021126978598242965, "logits/chosen": -17.071033477783203, "logits/rejected": -17.348602294921875, "logps/chosen": -2550.95947265625, "logps/rejected": -2775.656494140625, "loss": 0.5637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.90414428710938, "rewards/margins": 13.304868698120117, "rewards/rejected": -203.20901489257812, "step": 40910 }, { "epoch": 2.37, "grad_norm": 2.682637841644464e-06, "learning_rate": 0.0002110762800417973, "logits/chosen": -18.262144088745117, "logits/rejected": -19.576454162597656, "logps/chosen": -2504.622314453125, "logps/rejected": -2577.5302734375, "loss": 3.9046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -217.5455322265625, "rewards/margins": 14.752873420715332, "rewards/rejected": -232.2983856201172, "step": 40920 }, { "epoch": 2.37, "grad_norm": 0.02484777756035328, "learning_rate": 0.00021088277410116491, "logits/chosen": -15.33710765838623, "logits/rejected": -17.540430068969727, "logps/chosen": -3068.435546875, "logps/rejected": -2438.384765625, "loss": 2.5116, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -118.65202331542969, "rewards/margins": 8.318960189819336, "rewards/rejected": -126.97098541259766, "step": 40930 }, { "epoch": 2.37, "grad_norm": 0.11605561524629593, "learning_rate": 0.00021068926816053253, "logits/chosen": -18.24187660217285, "logits/rejected": -20.501708984375, "logps/chosen": -2655.7861328125, "logps/rejected": -2434.332763671875, "loss": 12.0688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.6326446533203, "rewards/margins": 0.38889390230178833, "rewards/rejected": -145.02151489257812, "step": 40940 }, { "epoch": 2.37, "grad_norm": 36.40011978149414, "learning_rate": 0.00021049576221990015, "logits/chosen": -15.292436599731445, "logits/rejected": -16.33237648010254, "logps/chosen": -2983.28369140625, "logps/rejected": -2873.20068359375, "loss": 0.9531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -86.25231170654297, "rewards/margins": 26.202083587646484, "rewards/rejected": -112.45439147949219, "step": 40950 }, { "epoch": 2.37, "grad_norm": 129.68849182128906, "learning_rate": 0.00021030225627926777, "logits/chosen": -17.614398956298828, "logits/rejected": -18.28802490234375, "logps/chosen": -2923.45068359375, "logps/rejected": -2819.80615234375, "loss": 2.9828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.53506469726562, "rewards/margins": 19.772945404052734, "rewards/rejected": -180.30801391601562, "step": 40960 }, { "epoch": 2.37, "grad_norm": 88.1797866821289, "learning_rate": 0.0002101087503386354, "logits/chosen": -20.003822326660156, "logits/rejected": -21.885072708129883, "logps/chosen": -2699.720458984375, "logps/rejected": -2614.535400390625, "loss": 2.4333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -131.08033752441406, "rewards/margins": 17.07927894592285, "rewards/rejected": -148.1595916748047, "step": 40970 }, { "epoch": 2.37, "grad_norm": 3.364689837326296e-05, "learning_rate": 0.00020991524439800303, "logits/chosen": -19.094127655029297, "logits/rejected": -21.57491683959961, "logps/chosen": -2758.82568359375, "logps/rejected": -2666.50390625, "loss": 1.5474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -196.53787231445312, "rewards/margins": 11.460959434509277, "rewards/rejected": -207.9988250732422, "step": 40980 }, { "epoch": 2.37, "grad_norm": 2.1205347366048954e-05, "learning_rate": 0.00020972173845737064, "logits/chosen": -21.503719329833984, "logits/rejected": -22.29764175415039, "logps/chosen": -2905.17431640625, "logps/rejected": -2877.264404296875, "loss": 1.5285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -256.9900207519531, "rewards/margins": 2.881272792816162, "rewards/rejected": -259.87127685546875, "step": 40990 }, { "epoch": 2.37, "grad_norm": 54.44673156738281, "learning_rate": 0.0002095282325167383, "logits/chosen": -19.204675674438477, "logits/rejected": -19.488685607910156, "logps/chosen": -2761.874755859375, "logps/rejected": -2964.562255859375, "loss": 3.5314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.69740295410156, "rewards/margins": 4.528970241546631, "rewards/rejected": -192.22634887695312, "step": 41000 }, { "epoch": 2.37, "grad_norm": 80.9209976196289, "learning_rate": 0.00020933472657610588, "logits/chosen": -17.611961364746094, "logits/rejected": -17.980558395385742, "logps/chosen": -2804.21630859375, "logps/rejected": -2630.93017578125, "loss": 1.0584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.5425567626953, "rewards/margins": 13.612573623657227, "rewards/rejected": -168.15512084960938, "step": 41010 }, { "epoch": 2.37, "grad_norm": 1.3473912474637473e-07, "learning_rate": 0.0002091412206354735, "logits/chosen": -21.08807945251465, "logits/rejected": -23.209062576293945, "logps/chosen": -2394.546630859375, "logps/rejected": -2497.36572265625, "loss": 2.2556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -212.59573364257812, "rewards/margins": 18.0631046295166, "rewards/rejected": -230.65884399414062, "step": 41020 }, { "epoch": 2.37, "grad_norm": 5.516271114349365, "learning_rate": 0.00020894771469484114, "logits/chosen": -17.715435028076172, "logits/rejected": -20.051753997802734, "logps/chosen": -2538.626708984375, "logps/rejected": -2600.698486328125, "loss": 23.3976, "rewards/accuracies": 0.5, "rewards/chosen": -142.5717010498047, "rewards/margins": -15.999178886413574, "rewards/rejected": -126.57252502441406, "step": 41030 }, { "epoch": 2.38, "grad_norm": 5.490786714190676e-10, "learning_rate": 0.00020875420875420876, "logits/chosen": -15.752459526062012, "logits/rejected": -16.936418533325195, "logps/chosen": -3139.62158203125, "logps/rejected": -2726.149658203125, "loss": 7.8734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.1690216064453, "rewards/margins": 1.3091957569122314, "rewards/rejected": -168.47821044921875, "step": 41040 }, { "epoch": 2.38, "grad_norm": 15.65599536895752, "learning_rate": 0.0002085607028135764, "logits/chosen": -16.992408752441406, "logits/rejected": -18.835561752319336, "logps/chosen": -3469.829345703125, "logps/rejected": -3149.30224609375, "loss": 1.0829, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -132.6728973388672, "rewards/margins": 9.017614364624023, "rewards/rejected": -141.6905059814453, "step": 41050 }, { "epoch": 2.38, "grad_norm": 65.09366607666016, "learning_rate": 0.000208367196872944, "logits/chosen": -17.73263168334961, "logits/rejected": -17.842647552490234, "logps/chosen": -2903.684326171875, "logps/rejected": -2481.53466796875, "loss": 9.1751, "rewards/accuracies": 0.5, "rewards/chosen": -156.50711059570312, "rewards/margins": -1.290148377418518, "rewards/rejected": -155.21697998046875, "step": 41060 }, { "epoch": 2.38, "grad_norm": 0.007634031120687723, "learning_rate": 0.0002081736909323116, "logits/chosen": -15.549140930175781, "logits/rejected": -16.224899291992188, "logps/chosen": -3302.58447265625, "logps/rejected": -3143.152099609375, "loss": 0.5563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -134.7926025390625, "rewards/margins": 11.935476303100586, "rewards/rejected": -146.72808837890625, "step": 41070 }, { "epoch": 2.38, "grad_norm": 0.00035112613113597035, "learning_rate": 0.00020798018499167925, "logits/chosen": -17.625431060791016, "logits/rejected": -18.79480743408203, "logps/chosen": -2709.920166015625, "logps/rejected": -2626.5595703125, "loss": 4.9204, "rewards/accuracies": 0.5, "rewards/chosen": -190.80067443847656, "rewards/margins": 2.2070515155792236, "rewards/rejected": -193.00772094726562, "step": 41080 }, { "epoch": 2.38, "grad_norm": 0.002791247796267271, "learning_rate": 0.00020778667905104687, "logits/chosen": -13.46385383605957, "logits/rejected": -14.204383850097656, "logps/chosen": -3123.77783203125, "logps/rejected": -3197.11669921875, "loss": 4.9399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -124.4988784790039, "rewards/margins": 8.459953308105469, "rewards/rejected": -132.95884704589844, "step": 41090 }, { "epoch": 2.38, "grad_norm": 1.0978252085180884e-08, "learning_rate": 0.00020759317311041451, "logits/chosen": -17.574668884277344, "logits/rejected": -18.21514129638672, "logps/chosen": -2836.75830078125, "logps/rejected": -2759.01806640625, "loss": 4.8431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.52346801757812, "rewards/margins": 4.3503007888793945, "rewards/rejected": -185.87380981445312, "step": 41100 }, { "epoch": 2.38, "grad_norm": 101.61045837402344, "learning_rate": 0.00020739966716978213, "logits/chosen": -17.055471420288086, "logits/rejected": -17.366573333740234, "logps/chosen": -3274.5078125, "logps/rejected": -3311.961669921875, "loss": 3.7531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -214.6298828125, "rewards/margins": 5.444626808166504, "rewards/rejected": -220.07449340820312, "step": 41110 }, { "epoch": 2.38, "grad_norm": 72.28263854980469, "learning_rate": 0.00020720616122914972, "logits/chosen": -15.279342651367188, "logits/rejected": -15.650858879089355, "logps/chosen": -3074.97802734375, "logps/rejected": -3134.05859375, "loss": 6.9634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.88226318359375, "rewards/margins": 5.047981262207031, "rewards/rejected": -171.9302520751953, "step": 41120 }, { "epoch": 2.38, "grad_norm": 0.7593338489532471, "learning_rate": 0.00020701265528851737, "logits/chosen": -13.779397964477539, "logits/rejected": -14.122465133666992, "logps/chosen": -3382.326171875, "logps/rejected": -3234.02001953125, "loss": 1.7166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -88.38614654541016, "rewards/margins": 4.632689476013184, "rewards/rejected": -93.01883697509766, "step": 41130 }, { "epoch": 2.38, "grad_norm": 0.003396110376343131, "learning_rate": 0.00020681914934788498, "logits/chosen": -15.68751335144043, "logits/rejected": -16.156173706054688, "logps/chosen": -2615.249267578125, "logps/rejected": -2606.85107421875, "loss": 1.8307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -141.8389129638672, "rewards/margins": 7.295281887054443, "rewards/rejected": -149.13421630859375, "step": 41140 }, { "epoch": 2.38, "grad_norm": 6.371807081961833e-09, "learning_rate": 0.0002066256434072526, "logits/chosen": -15.884866714477539, "logits/rejected": -16.714557647705078, "logps/chosen": -2933.12841796875, "logps/rejected": -2976.63134765625, "loss": 3.7384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.11207580566406, "rewards/margins": 4.363428115844727, "rewards/rejected": -201.4755096435547, "step": 41150 }, { "epoch": 2.38, "grad_norm": 0.05695919319987297, "learning_rate": 0.00020643213746662025, "logits/chosen": -17.70560646057129, "logits/rejected": -18.36660385131836, "logps/chosen": -3199.935302734375, "logps/rejected": -3009.301025390625, "loss": 2.3523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.91171264648438, "rewards/margins": 5.7110395431518555, "rewards/rejected": -180.6227569580078, "step": 41160 }, { "epoch": 2.38, "grad_norm": 2.7894463539123535, "learning_rate": 0.00020623863152598784, "logits/chosen": -16.64417266845703, "logits/rejected": -16.087467193603516, "logps/chosen": -2800.98095703125, "logps/rejected": -3109.309326171875, "loss": 4.2209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.56460571289062, "rewards/margins": 1.3313353061676025, "rewards/rejected": -166.89593505859375, "step": 41170 }, { "epoch": 2.38, "grad_norm": 15.958146095275879, "learning_rate": 0.00020604512558535548, "logits/chosen": -19.27469825744629, "logits/rejected": -17.83077621459961, "logps/chosen": -2821.240234375, "logps/rejected": -2625.90673828125, "loss": 2.1922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -213.3618927001953, "rewards/margins": 7.828772068023682, "rewards/rejected": -221.1906280517578, "step": 41180 }, { "epoch": 2.38, "grad_norm": 40.588809967041016, "learning_rate": 0.0002058516196447231, "logits/chosen": -18.914302825927734, "logits/rejected": -18.457015991210938, "logps/chosen": -2930.337158203125, "logps/rejected": -3233.955810546875, "loss": 1.2281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -209.57955932617188, "rewards/margins": 16.13300132751465, "rewards/rejected": -225.7125701904297, "step": 41190 }, { "epoch": 2.38, "grad_norm": 0.9081230163574219, "learning_rate": 0.00020565811370409071, "logits/chosen": -17.585725784301758, "logits/rejected": -16.557388305664062, "logps/chosen": -3281.61328125, "logps/rejected": -3198.229248046875, "loss": 4.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -178.06015014648438, "rewards/margins": 5.19590425491333, "rewards/rejected": -183.25604248046875, "step": 41200 }, { "epoch": 2.39, "grad_norm": 242.86636352539062, "learning_rate": 0.00020546460776345836, "logits/chosen": -17.289831161499023, "logits/rejected": -19.086854934692383, "logps/chosen": -3188.569580078125, "logps/rejected": -3015.168212890625, "loss": 15.0455, "rewards/accuracies": 0.5, "rewards/chosen": -240.81332397460938, "rewards/margins": -6.3182692527771, "rewards/rejected": -234.4950714111328, "step": 41210 }, { "epoch": 2.39, "grad_norm": 78.32653045654297, "learning_rate": 0.00020527110182282598, "logits/chosen": -17.69808006286621, "logits/rejected": -18.692949295043945, "logps/chosen": -2922.40380859375, "logps/rejected": -2942.173095703125, "loss": 3.2444, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -219.82723999023438, "rewards/margins": 2.820202589035034, "rewards/rejected": -222.6474609375, "step": 41220 }, { "epoch": 2.39, "grad_norm": 7.962201731270824e-11, "learning_rate": 0.00020507759588219357, "logits/chosen": -14.493545532226562, "logits/rejected": -15.367815971374512, "logps/chosen": -3887.48681640625, "logps/rejected": -3271.679931640625, "loss": 7.6169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.7082977294922, "rewards/margins": -0.9659566879272461, "rewards/rejected": -146.74234008789062, "step": 41230 }, { "epoch": 2.39, "grad_norm": 256.59185791015625, "learning_rate": 0.0002048840899415612, "logits/chosen": -17.572309494018555, "logits/rejected": -19.599658966064453, "logps/chosen": -3304.95361328125, "logps/rejected": -2469.32080078125, "loss": 11.2297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.06887817382812, "rewards/margins": 4.134660243988037, "rewards/rejected": -156.2035369873047, "step": 41240 }, { "epoch": 2.39, "grad_norm": 90.224365234375, "learning_rate": 0.00020469058400092883, "logits/chosen": -19.979162216186523, "logits/rejected": -20.315818786621094, "logps/chosen": -3091.2685546875, "logps/rejected": -3252.5146484375, "loss": 4.5058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -223.1669158935547, "rewards/margins": 6.084439277648926, "rewards/rejected": -229.25137329101562, "step": 41250 }, { "epoch": 2.39, "grad_norm": 1.1560076018213294e-05, "learning_rate": 0.00020449707806029647, "logits/chosen": -17.819089889526367, "logits/rejected": -19.22212791442871, "logps/chosen": -3227.71142578125, "logps/rejected": -3261.040283203125, "loss": 0.7941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.4349822998047, "rewards/margins": 7.937676429748535, "rewards/rejected": -234.37265014648438, "step": 41260 }, { "epoch": 2.39, "grad_norm": 41.67488098144531, "learning_rate": 0.0002043035721196641, "logits/chosen": -18.941387176513672, "logits/rejected": -19.414058685302734, "logps/chosen": -3335.39990234375, "logps/rejected": -3403.02099609375, "loss": 2.1028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.90756225585938, "rewards/margins": 14.031959533691406, "rewards/rejected": -231.9395294189453, "step": 41270 }, { "epoch": 2.39, "grad_norm": 6.891943485243246e-06, "learning_rate": 0.00020411006617903168, "logits/chosen": -19.734905242919922, "logits/rejected": -20.712308883666992, "logps/chosen": -3327.783203125, "logps/rejected": -3392.567138671875, "loss": 7.4915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -206.45492553710938, "rewards/margins": 0.042832184582948685, "rewards/rejected": -206.4977569580078, "step": 41280 }, { "epoch": 2.39, "grad_norm": 82.96320343017578, "learning_rate": 0.00020391656023839932, "logits/chosen": -20.58116340637207, "logits/rejected": -22.160165786743164, "logps/chosen": -3301.08349609375, "logps/rejected": -3369.35595703125, "loss": 1.864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -258.1309509277344, "rewards/margins": 7.847662925720215, "rewards/rejected": -265.9786682128906, "step": 41290 }, { "epoch": 2.39, "grad_norm": 104.5331039428711, "learning_rate": 0.00020372305429776694, "logits/chosen": -18.624187469482422, "logits/rejected": -20.196578979492188, "logps/chosen": -3423.25146484375, "logps/rejected": -3097.83154296875, "loss": 3.6174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -225.82742309570312, "rewards/margins": 2.8448777198791504, "rewards/rejected": -228.6723175048828, "step": 41300 }, { "epoch": 2.39, "grad_norm": 114.31489562988281, "learning_rate": 0.00020352954835713458, "logits/chosen": -20.899044036865234, "logits/rejected": -20.3399658203125, "logps/chosen": -3364.6015625, "logps/rejected": -3281.74609375, "loss": 6.6508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -231.88839721679688, "rewards/margins": -3.0980472564697266, "rewards/rejected": -228.7903594970703, "step": 41310 }, { "epoch": 2.39, "grad_norm": 0.3828510344028473, "learning_rate": 0.0002033360424165022, "logits/chosen": -17.088817596435547, "logits/rejected": -19.13919448852539, "logps/chosen": -3608.53125, "logps/rejected": -3393.55224609375, "loss": 5.0008, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -211.4207000732422, "rewards/margins": 2.1641457080841064, "rewards/rejected": -213.58486938476562, "step": 41320 }, { "epoch": 2.39, "grad_norm": 0.02775692380964756, "learning_rate": 0.0002031425364758698, "logits/chosen": -14.225851058959961, "logits/rejected": -14.521374702453613, "logps/chosen": -3657.850341796875, "logps/rejected": -3380.13623046875, "loss": 1.6613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -95.13398742675781, "rewards/margins": 8.72599983215332, "rewards/rejected": -103.8599853515625, "step": 41330 }, { "epoch": 2.39, "grad_norm": 7.73257215769263e-06, "learning_rate": 0.00020294903053523744, "logits/chosen": -17.63468360900879, "logits/rejected": -20.51694679260254, "logps/chosen": -3071.431640625, "logps/rejected": -2774.73193359375, "loss": 10.0292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.94528198242188, "rewards/margins": 0.4279020428657532, "rewards/rejected": -168.37319946289062, "step": 41340 }, { "epoch": 2.39, "grad_norm": 3.064570166249734e-17, "learning_rate": 0.00020275552459460505, "logits/chosen": -14.918070793151855, "logits/rejected": -14.935376167297363, "logps/chosen": -3299.627685546875, "logps/rejected": -2894.845947265625, "loss": 3.4507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -86.85437774658203, "rewards/margins": 10.5604887008667, "rewards/rejected": -97.41487121582031, "step": 41350 }, { "epoch": 2.39, "grad_norm": 68.49210357666016, "learning_rate": 0.00020256201865397267, "logits/chosen": -16.7236328125, "logits/rejected": -17.23868751525879, "logps/chosen": -2776.09033203125, "logps/rejected": -2881.16650390625, "loss": 0.6592, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -197.1918487548828, "rewards/margins": 9.669412612915039, "rewards/rejected": -206.86123657226562, "step": 41360 }, { "epoch": 2.39, "grad_norm": 1.2279702428508177e-11, "learning_rate": 0.00020236851271334031, "logits/chosen": -16.67629623413086, "logits/rejected": -18.873626708984375, "logps/chosen": -2937.42578125, "logps/rejected": -2510.56298828125, "loss": 19.9667, "rewards/accuracies": 0.5, "rewards/chosen": -128.94485473632812, "rewards/margins": -2.632640838623047, "rewards/rejected": -126.31221771240234, "step": 41370 }, { "epoch": 2.4, "grad_norm": 81.52333068847656, "learning_rate": 0.00020217500677270793, "logits/chosen": -15.057235717773438, "logits/rejected": -15.377891540527344, "logps/chosen": -3313.121826171875, "logps/rejected": -2887.833251953125, "loss": 2.9416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -94.84476470947266, "rewards/margins": 8.14759635925293, "rewards/rejected": -102.99235534667969, "step": 41380 }, { "epoch": 2.4, "grad_norm": 82.517822265625, "learning_rate": 0.00020198150083207555, "logits/chosen": -17.333593368530273, "logits/rejected": -17.73177146911621, "logps/chosen": -3248.7626953125, "logps/rejected": -3212.584228515625, "loss": 2.6171, "rewards/accuracies": 0.5, "rewards/chosen": -163.80189514160156, "rewards/margins": 5.106375217437744, "rewards/rejected": -168.9082794189453, "step": 41390 }, { "epoch": 2.4, "grad_norm": 196.63290405273438, "learning_rate": 0.00020178799489144317, "logits/chosen": -19.69286346435547, "logits/rejected": -23.768938064575195, "logps/chosen": -3072.92529296875, "logps/rejected": -3124.02392578125, "loss": 12.1779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.5152587890625, "rewards/margins": -1.679088830947876, "rewards/rejected": -188.83615112304688, "step": 41400 }, { "epoch": 2.4, "grad_norm": 3.1635801792144775, "learning_rate": 0.00020159448895081078, "logits/chosen": -18.34151268005371, "logits/rejected": -19.97756004333496, "logps/chosen": -2601.884765625, "logps/rejected": -2730.92236328125, "loss": 0.1366, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -138.78482055664062, "rewards/margins": 21.687326431274414, "rewards/rejected": -160.47215270996094, "step": 41410 }, { "epoch": 2.4, "grad_norm": 0.07218460738658905, "learning_rate": 0.00020140098301017843, "logits/chosen": -21.83416175842285, "logits/rejected": -24.423322677612305, "logps/chosen": -2728.15869140625, "logps/rejected": -2553.00244140625, "loss": 16.5125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.1128692626953, "rewards/margins": -9.730512619018555, "rewards/rejected": -195.38233947753906, "step": 41420 }, { "epoch": 2.4, "grad_norm": 2.1026411056518555, "learning_rate": 0.00020120747706954604, "logits/chosen": -19.65062713623047, "logits/rejected": -18.613807678222656, "logps/chosen": -2836.560302734375, "logps/rejected": -2877.3310546875, "loss": 2.8964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.2384796142578, "rewards/margins": 18.331748962402344, "rewards/rejected": -190.57022094726562, "step": 41430 }, { "epoch": 2.4, "grad_norm": 5.497257232666016, "learning_rate": 0.00020101397112891366, "logits/chosen": -16.10439109802246, "logits/rejected": -16.549087524414062, "logps/chosen": -3299.167236328125, "logps/rejected": -3104.39892578125, "loss": 1.0038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -107.77913665771484, "rewards/margins": 14.083078384399414, "rewards/rejected": -121.86222839355469, "step": 41440 }, { "epoch": 2.4, "grad_norm": 85.03849792480469, "learning_rate": 0.00020082046518828128, "logits/chosen": -17.024982452392578, "logits/rejected": -17.017208099365234, "logps/chosen": -2859.011962890625, "logps/rejected": -2960.33544921875, "loss": 4.7645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -180.47796630859375, "rewards/margins": 3.5021228790283203, "rewards/rejected": -183.98008728027344, "step": 41450 }, { "epoch": 2.4, "grad_norm": 8.05710076434707e-09, "learning_rate": 0.0002006269592476489, "logits/chosen": -16.883129119873047, "logits/rejected": -18.15303611755371, "logps/chosen": -2812.47119140625, "logps/rejected": -2549.22216796875, "loss": 3.7987, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.6988525390625, "rewards/margins": 9.929487228393555, "rewards/rejected": -183.62832641601562, "step": 41460 }, { "epoch": 2.4, "grad_norm": 20.753847122192383, "learning_rate": 0.00020043345330701654, "logits/chosen": -19.739574432373047, "logits/rejected": -21.322866439819336, "logps/chosen": -2535.958740234375, "logps/rejected": -2394.087646484375, "loss": 7.4338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.22018432617188, "rewards/margins": 3.071277141571045, "rewards/rejected": -196.29147338867188, "step": 41470 }, { "epoch": 2.4, "grad_norm": 0.0011975892120972276, "learning_rate": 0.00020023994736638416, "logits/chosen": -17.151418685913086, "logits/rejected": -19.58509635925293, "logps/chosen": -2993.18603515625, "logps/rejected": -3001.24169921875, "loss": 1.0206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -191.24203491210938, "rewards/margins": 16.847530364990234, "rewards/rejected": -208.089599609375, "step": 41480 }, { "epoch": 2.4, "grad_norm": 1.0468157597642858e-05, "learning_rate": 0.00020004644142575177, "logits/chosen": -19.020235061645508, "logits/rejected": -19.127914428710938, "logps/chosen": -2992.196533203125, "logps/rejected": -2928.704833984375, "loss": 13.6678, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -159.8497314453125, "rewards/margins": -2.8087615966796875, "rewards/rejected": -157.04098510742188, "step": 41490 }, { "epoch": 2.4, "grad_norm": 8.676113397382414e-19, "learning_rate": 0.0001998529354851194, "logits/chosen": -17.98476219177246, "logits/rejected": -18.974035263061523, "logps/chosen": -2780.24462890625, "logps/rejected": -2527.72509765625, "loss": 0.2851, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -172.79299926757812, "rewards/margins": 20.143692016601562, "rewards/rejected": -192.93667602539062, "step": 41500 }, { "epoch": 2.4, "grad_norm": 74.4274673461914, "learning_rate": 0.000199659429544487, "logits/chosen": -18.26897621154785, "logits/rejected": -18.02222442626953, "logps/chosen": -2896.301513671875, "logps/rejected": -2680.663818359375, "loss": 4.6819, "rewards/accuracies": 0.5, "rewards/chosen": -154.85195922851562, "rewards/margins": 1.2583038806915283, "rewards/rejected": -156.11026000976562, "step": 41510 }, { "epoch": 2.4, "grad_norm": 3.011760991390394e-19, "learning_rate": 0.00019946592360385465, "logits/chosen": -19.09772300720215, "logits/rejected": -20.376270294189453, "logps/chosen": -3018.11962890625, "logps/rejected": -3019.323486328125, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -191.26580810546875, "rewards/margins": 13.356101989746094, "rewards/rejected": -204.62191772460938, "step": 41520 }, { "epoch": 2.4, "grad_norm": 0.9112070798873901, "learning_rate": 0.00019927241766322227, "logits/chosen": -16.2712345123291, "logits/rejected": -18.704225540161133, "logps/chosen": -2732.566650390625, "logps/rejected": -2638.774169921875, "loss": 1.6518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -187.88528442382812, "rewards/margins": 10.038999557495117, "rewards/rejected": -197.92428588867188, "step": 41530 }, { "epoch": 2.4, "grad_norm": 0.00013551376468967646, "learning_rate": 0.0001990789117225899, "logits/chosen": -20.29886817932129, "logits/rejected": -21.73532485961914, "logps/chosen": -2911.32568359375, "logps/rejected": -2570.33935546875, "loss": 2.3804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.47213745117188, "rewards/margins": 23.336528778076172, "rewards/rejected": -217.8086700439453, "step": 41540 }, { "epoch": 2.41, "grad_norm": 0.011759895831346512, "learning_rate": 0.0001988854057819575, "logits/chosen": -17.323551177978516, "logits/rejected": -19.73136329650879, "logps/chosen": -3112.251953125, "logps/rejected": -2839.59765625, "loss": 1.9001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -126.32695007324219, "rewards/margins": 19.318368911743164, "rewards/rejected": -145.6453399658203, "step": 41550 }, { "epoch": 2.41, "grad_norm": 5.38653564453125, "learning_rate": 0.00019869189984132512, "logits/chosen": -17.192359924316406, "logits/rejected": -18.196462631225586, "logps/chosen": -2986.60546875, "logps/rejected": -3068.38916015625, "loss": 2.1625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -161.0772247314453, "rewards/margins": 6.741004943847656, "rewards/rejected": -167.81822204589844, "step": 41560 }, { "epoch": 2.41, "grad_norm": 23.465560913085938, "learning_rate": 0.00019849839390069274, "logits/chosen": -17.562475204467773, "logits/rejected": -17.314563751220703, "logps/chosen": -2700.100341796875, "logps/rejected": -2464.615966796875, "loss": 4.4501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -125.06483459472656, "rewards/margins": 4.294646263122559, "rewards/rejected": -129.35946655273438, "step": 41570 }, { "epoch": 2.41, "grad_norm": 0.010926111601293087, "learning_rate": 0.00019830488796006038, "logits/chosen": -16.159751892089844, "logits/rejected": -18.12539291381836, "logps/chosen": -3027.5673828125, "logps/rejected": -2822.539794921875, "loss": 4.139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.89993286132812, "rewards/margins": 4.5279860496521, "rewards/rejected": -199.4279327392578, "step": 41580 }, { "epoch": 2.41, "grad_norm": 0.40346765518188477, "learning_rate": 0.000198111382019428, "logits/chosen": -17.443370819091797, "logits/rejected": -17.601852416992188, "logps/chosen": -3026.866943359375, "logps/rejected": -2675.921630859375, "loss": 1.3434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.19798278808594, "rewards/margins": 11.109292984008789, "rewards/rejected": -170.30728149414062, "step": 41590 }, { "epoch": 2.41, "grad_norm": 0.0012450257781893015, "learning_rate": 0.00019791787607879565, "logits/chosen": -14.631312370300293, "logits/rejected": -14.317743301391602, "logps/chosen": -2801.39697265625, "logps/rejected": -2818.775146484375, "loss": 4.6349, "rewards/accuracies": 0.5, "rewards/chosen": -108.9129867553711, "rewards/margins": -0.33920592069625854, "rewards/rejected": -108.57377624511719, "step": 41600 }, { "epoch": 2.41, "grad_norm": 5.234984001845078e-09, "learning_rate": 0.00019772437013816324, "logits/chosen": -19.1795654296875, "logits/rejected": -21.432353973388672, "logps/chosen": -2802.704833984375, "logps/rejected": -2589.37646484375, "loss": 2.1254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.20970153808594, "rewards/margins": 10.202014923095703, "rewards/rejected": -187.41172790527344, "step": 41610 }, { "epoch": 2.41, "grad_norm": 3.843932867050171, "learning_rate": 0.00019753086419753085, "logits/chosen": -17.460657119750977, "logits/rejected": -18.662294387817383, "logps/chosen": -2431.72265625, "logps/rejected": -2754.06005859375, "loss": 5.04, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.8097686767578, "rewards/margins": 1.0331833362579346, "rewards/rejected": -192.84292602539062, "step": 41620 }, { "epoch": 2.41, "grad_norm": 65.45478057861328, "learning_rate": 0.0001973373582568985, "logits/chosen": -16.930654525756836, "logits/rejected": -17.157058715820312, "logps/chosen": -2980.17138671875, "logps/rejected": -3020.759765625, "loss": 0.9139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -169.1126708984375, "rewards/margins": 8.370950698852539, "rewards/rejected": -177.48361206054688, "step": 41630 }, { "epoch": 2.41, "grad_norm": 2.6463754693395458e-05, "learning_rate": 0.00019714385231626611, "logits/chosen": -16.16534423828125, "logits/rejected": -17.492759704589844, "logps/chosen": -3085.25927734375, "logps/rejected": -2822.589111328125, "loss": 1.5394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -201.59140014648438, "rewards/margins": 11.198860168457031, "rewards/rejected": -212.7902374267578, "step": 41640 }, { "epoch": 2.41, "grad_norm": 7.485356263714493e-07, "learning_rate": 0.00019695034637563376, "logits/chosen": -17.466999053955078, "logits/rejected": -19.045856475830078, "logps/chosen": -3024.989013671875, "logps/rejected": -3086.50390625, "loss": 1.6082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -176.46481323242188, "rewards/margins": 7.392777442932129, "rewards/rejected": -183.85757446289062, "step": 41650 }, { "epoch": 2.41, "grad_norm": 3.114113411584185e-10, "learning_rate": 0.00019675684043500135, "logits/chosen": -16.81585693359375, "logits/rejected": -17.216615676879883, "logps/chosen": -3186.400146484375, "logps/rejected": -3267.31005859375, "loss": 0.2582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -209.39950561523438, "rewards/margins": 14.824636459350586, "rewards/rejected": -224.22415161132812, "step": 41660 }, { "epoch": 2.41, "grad_norm": 302.2403259277344, "learning_rate": 0.00019656333449436897, "logits/chosen": -17.16801643371582, "logits/rejected": -16.457674026489258, "logps/chosen": -3321.90380859375, "logps/rejected": -2868.46728515625, "loss": 5.5976, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.76559448242188, "rewards/margins": 0.20586737990379333, "rewards/rejected": -131.97146606445312, "step": 41670 }, { "epoch": 2.41, "grad_norm": 5.4858386984522195e-14, "learning_rate": 0.0001963698285537366, "logits/chosen": -15.665201187133789, "logits/rejected": -16.08738899230957, "logps/chosen": -3030.191650390625, "logps/rejected": -2632.006591796875, "loss": 3.3402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -128.3853302001953, "rewards/margins": 25.70760726928711, "rewards/rejected": -154.09292602539062, "step": 41680 }, { "epoch": 2.41, "grad_norm": 4.932441234588623, "learning_rate": 0.00019617632261310423, "logits/chosen": -17.073165893554688, "logits/rejected": -17.151187896728516, "logps/chosen": -3228.468017578125, "logps/rejected": -2995.997802734375, "loss": 1.2055, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -134.29196166992188, "rewards/margins": 14.422683715820312, "rewards/rejected": -148.7146453857422, "step": 41690 }, { "epoch": 2.41, "grad_norm": 62.476924896240234, "learning_rate": 0.00019598281667247184, "logits/chosen": -17.801013946533203, "logits/rejected": -19.115150451660156, "logps/chosen": -2963.40966796875, "logps/rejected": -2622.415283203125, "loss": 1.9672, "rewards/accuracies": 0.5, "rewards/chosen": -189.8140411376953, "rewards/margins": 2.3765666484832764, "rewards/rejected": -192.1905975341797, "step": 41700 }, { "epoch": 2.41, "grad_norm": 5.191983222961426, "learning_rate": 0.0001957893107318395, "logits/chosen": -19.465713500976562, "logits/rejected": -20.41499137878418, "logps/chosen": -2803.427001953125, "logps/rejected": -2740.85009765625, "loss": 0.927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.4235076904297, "rewards/margins": 9.614511489868164, "rewards/rejected": -148.0380096435547, "step": 41710 }, { "epoch": 2.41, "grad_norm": 50.17478561401367, "learning_rate": 0.00019559580479120708, "logits/chosen": -15.9493408203125, "logits/rejected": -16.44232749938965, "logps/chosen": -3020.21533203125, "logps/rejected": -2966.566162109375, "loss": 1.4238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -150.4322052001953, "rewards/margins": 9.109598159790039, "rewards/rejected": -159.5417938232422, "step": 41720 }, { "epoch": 2.42, "grad_norm": 68.94549560546875, "learning_rate": 0.00019540229885057472, "logits/chosen": -18.15446662902832, "logits/rejected": -18.80550765991211, "logps/chosen": -2835.225830078125, "logps/rejected": -2797.66650390625, "loss": 2.2165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.79623413085938, "rewards/margins": 8.0555419921875, "rewards/rejected": -220.85177612304688, "step": 41730 }, { "epoch": 2.42, "grad_norm": 55.080570220947266, "learning_rate": 0.00019520879290994234, "logits/chosen": -20.222637176513672, "logits/rejected": -20.445842742919922, "logps/chosen": -2751.67236328125, "logps/rejected": -2745.745849609375, "loss": 3.0672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -237.6470184326172, "rewards/margins": 7.7730817794799805, "rewards/rejected": -245.4200897216797, "step": 41740 }, { "epoch": 2.42, "grad_norm": 0.0, "learning_rate": 0.00019501528696930996, "logits/chosen": -16.654298782348633, "logits/rejected": -16.960067749023438, "logps/chosen": -2893.665771484375, "logps/rejected": -3016.634765625, "loss": 0.0724, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -145.55325317382812, "rewards/margins": 19.38312530517578, "rewards/rejected": -164.93637084960938, "step": 41750 }, { "epoch": 2.42, "grad_norm": 2.0382648902028677e-09, "learning_rate": 0.0001948217810286776, "logits/chosen": -17.285247802734375, "logits/rejected": -16.82724952697754, "logps/chosen": -2873.67236328125, "logps/rejected": -3015.701904296875, "loss": 2.8935, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.49057006835938, "rewards/margins": 3.5344719886779785, "rewards/rejected": -162.02503967285156, "step": 41760 }, { "epoch": 2.42, "grad_norm": 77.48551177978516, "learning_rate": 0.0001946282750880452, "logits/chosen": -17.29623794555664, "logits/rejected": -17.746707916259766, "logps/chosen": -3029.732177734375, "logps/rejected": -2766.315673828125, "loss": 2.0983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.28799438476562, "rewards/margins": 5.071476936340332, "rewards/rejected": -231.35946655273438, "step": 41770 }, { "epoch": 2.42, "grad_norm": 87.52729034423828, "learning_rate": 0.00019443476914741284, "logits/chosen": -17.57163429260254, "logits/rejected": -20.25261116027832, "logps/chosen": -3020.734130859375, "logps/rejected": -2790.899658203125, "loss": 9.6651, "rewards/accuracies": 0.5, "rewards/chosen": -188.1287841796875, "rewards/margins": -3.337401866912842, "rewards/rejected": -184.7913360595703, "step": 41780 }, { "epoch": 2.42, "grad_norm": 0.0, "learning_rate": 0.00019424126320678045, "logits/chosen": -19.4802188873291, "logits/rejected": -19.08102035522461, "logps/chosen": -3437.06201171875, "logps/rejected": -3245.27783203125, "loss": 2.661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -266.26971435546875, "rewards/margins": 13.708398818969727, "rewards/rejected": -279.9781494140625, "step": 41790 }, { "epoch": 2.42, "grad_norm": 5.3152948709112025e-08, "learning_rate": 0.00019404775726614807, "logits/chosen": -17.542875289916992, "logits/rejected": -18.148950576782227, "logps/chosen": -3036.541015625, "logps/rejected": -2770.654052734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -171.80343627929688, "rewards/margins": 19.294443130493164, "rewards/rejected": -191.09786987304688, "step": 41800 }, { "epoch": 2.42, "grad_norm": 0.2626948058605194, "learning_rate": 0.00019385425132551571, "logits/chosen": -20.710376739501953, "logits/rejected": -23.583003997802734, "logps/chosen": -2519.728271484375, "logps/rejected": -2529.3212890625, "loss": 4.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.14865112304688, "rewards/margins": 4.094106197357178, "rewards/rejected": -213.24276733398438, "step": 41810 }, { "epoch": 2.42, "grad_norm": 69.6431884765625, "learning_rate": 0.00019366074538488333, "logits/chosen": -17.27628517150879, "logits/rejected": -20.550655364990234, "logps/chosen": -3204.50244140625, "logps/rejected": -3048.03466796875, "loss": 4.0787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.7762451171875, "rewards/margins": 8.761045455932617, "rewards/rejected": -179.53729248046875, "step": 41820 }, { "epoch": 2.42, "grad_norm": 3.603423692766228e-07, "learning_rate": 0.00019346723944425092, "logits/chosen": -17.173095703125, "logits/rejected": -17.40689468383789, "logps/chosen": -3107.85888671875, "logps/rejected": -2991.72119140625, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -189.93814086914062, "rewards/margins": 25.27119255065918, "rewards/rejected": -215.20932006835938, "step": 41830 }, { "epoch": 2.42, "grad_norm": 0.040876373648643494, "learning_rate": 0.00019327373350361857, "logits/chosen": -15.151168823242188, "logits/rejected": -16.324548721313477, "logps/chosen": -3089.664306640625, "logps/rejected": -2813.881103515625, "loss": 1.3851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -124.0045166015625, "rewards/margins": 9.079174995422363, "rewards/rejected": -133.08367919921875, "step": 41840 }, { "epoch": 2.42, "grad_norm": 0.01156886201351881, "learning_rate": 0.00019308022756298618, "logits/chosen": -16.895299911499023, "logits/rejected": -17.393184661865234, "logps/chosen": -2682.60302734375, "logps/rejected": -2606.809326171875, "loss": 5.9519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -160.1133575439453, "rewards/margins": 3.21722149848938, "rewards/rejected": -163.33056640625, "step": 41850 }, { "epoch": 2.42, "grad_norm": 2.0451431274414062, "learning_rate": 0.00019288672162235383, "logits/chosen": -16.29355812072754, "logits/rejected": -16.331954956054688, "logps/chosen": -2495.44482421875, "logps/rejected": -2344.10498046875, "loss": 3.4928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -111.93748474121094, "rewards/margins": 12.272177696228027, "rewards/rejected": -124.20967102050781, "step": 41860 }, { "epoch": 2.42, "grad_norm": 0.029869331046938896, "learning_rate": 0.00019269321568172144, "logits/chosen": -19.136016845703125, "logits/rejected": -20.39891815185547, "logps/chosen": -2746.235107421875, "logps/rejected": -2938.510986328125, "loss": 0.1348, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -192.140869140625, "rewards/margins": 30.734704971313477, "rewards/rejected": -222.8755645751953, "step": 41870 }, { "epoch": 2.42, "grad_norm": 2.9185276174552343e-15, "learning_rate": 0.00019249970974108903, "logits/chosen": -18.7891845703125, "logits/rejected": -19.040678024291992, "logps/chosen": -2948.92724609375, "logps/rejected": -2836.048828125, "loss": 4.5839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -192.07015991210938, "rewards/margins": 3.209716796875, "rewards/rejected": -195.2798614501953, "step": 41880 }, { "epoch": 2.42, "grad_norm": 120.15225982666016, "learning_rate": 0.00019230620380045668, "logits/chosen": -16.38127326965332, "logits/rejected": -16.469881057739258, "logps/chosen": -3135.6064453125, "logps/rejected": -3034.26611328125, "loss": 5.0194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.69723510742188, "rewards/margins": 3.171358585357666, "rewards/rejected": -144.8685760498047, "step": 41890 }, { "epoch": 2.43, "grad_norm": 52.86676025390625, "learning_rate": 0.0001921126978598243, "logits/chosen": -18.011518478393555, "logits/rejected": -17.95253562927246, "logps/chosen": -3251.880615234375, "logps/rejected": -3050.255126953125, "loss": 8.3032, "rewards/accuracies": 0.5, "rewards/chosen": -184.62716674804688, "rewards/margins": 5.276668548583984, "rewards/rejected": -189.9038543701172, "step": 41900 }, { "epoch": 2.43, "grad_norm": 0.004002428613603115, "learning_rate": 0.00019191919191919191, "logits/chosen": -16.108402252197266, "logits/rejected": -17.575702667236328, "logps/chosen": -3169.468994140625, "logps/rejected": -3045.234375, "loss": 18.1321, "rewards/accuracies": 0.5, "rewards/chosen": -125.13191223144531, "rewards/margins": -15.732772827148438, "rewards/rejected": -109.39912414550781, "step": 41910 }, { "epoch": 2.43, "grad_norm": 0.015129636973142624, "learning_rate": 0.00019172568597855956, "logits/chosen": -19.47820281982422, "logits/rejected": -21.21078872680664, "logps/chosen": -2766.53369140625, "logps/rejected": -2906.60009765625, "loss": 2.8988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -223.3129425048828, "rewards/margins": 4.50747013092041, "rewards/rejected": -227.8204345703125, "step": 41920 }, { "epoch": 2.43, "grad_norm": 0.0006964675267226994, "learning_rate": 0.00019153218003792718, "logits/chosen": -17.799144744873047, "logits/rejected": -19.844087600708008, "logps/chosen": -2893.309326171875, "logps/rejected": -2759.858154296875, "loss": 16.633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.04852294921875, "rewards/margins": -6.804129123687744, "rewards/rejected": -148.24440002441406, "step": 41930 }, { "epoch": 2.43, "grad_norm": 6.367259629769251e-05, "learning_rate": 0.0001913386740972948, "logits/chosen": -19.755172729492188, "logits/rejected": -19.670352935791016, "logps/chosen": -3034.88134765625, "logps/rejected": -3008.4541015625, "loss": 3.5946, "rewards/accuracies": 0.5, "rewards/chosen": -199.52804565429688, "rewards/margins": 8.056539535522461, "rewards/rejected": -207.5845947265625, "step": 41940 }, { "epoch": 2.43, "grad_norm": 0.0006779133691452444, "learning_rate": 0.0001911451681566624, "logits/chosen": -17.42286491394043, "logits/rejected": -18.0889949798584, "logps/chosen": -3206.18505859375, "logps/rejected": -2878.295654296875, "loss": 0.532, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -132.29598999023438, "rewards/margins": 17.98005485534668, "rewards/rejected": -150.27603149414062, "step": 41950 }, { "epoch": 2.43, "grad_norm": 0.031398847699165344, "learning_rate": 0.00019095166221603003, "logits/chosen": -18.308818817138672, "logits/rejected": -19.42542839050293, "logps/chosen": -2780.994384765625, "logps/rejected": -2397.556884765625, "loss": 3.0243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -138.7615203857422, "rewards/margins": 13.882661819458008, "rewards/rejected": -152.64418029785156, "step": 41960 }, { "epoch": 2.43, "grad_norm": 28.855960845947266, "learning_rate": 0.00019075815627539767, "logits/chosen": -18.242290496826172, "logits/rejected": -20.017684936523438, "logps/chosen": -2769.287109375, "logps/rejected": -2833.459228515625, "loss": 1.9148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.50186157226562, "rewards/margins": 10.770950317382812, "rewards/rejected": -190.27279663085938, "step": 41970 }, { "epoch": 2.43, "grad_norm": 0.0002411051536910236, "learning_rate": 0.0001905646503347653, "logits/chosen": -17.583433151245117, "logits/rejected": -17.638248443603516, "logps/chosen": -2784.92529296875, "logps/rejected": -2800.933349609375, "loss": 2.3119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -203.2570037841797, "rewards/margins": 1.7806065082550049, "rewards/rejected": -205.03762817382812, "step": 41980 }, { "epoch": 2.43, "grad_norm": 9.60485553741455, "learning_rate": 0.0001903711443941329, "logits/chosen": -15.632777214050293, "logits/rejected": -15.797431945800781, "logps/chosen": -3020.04296875, "logps/rejected": -2853.47607421875, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": -125.3928451538086, "rewards/margins": 16.67615509033203, "rewards/rejected": -142.06900024414062, "step": 41990 }, { "epoch": 2.43, "grad_norm": 3.8889753818511963, "learning_rate": 0.00019017763845350052, "logits/chosen": -17.371671676635742, "logits/rejected": -17.525936126708984, "logps/chosen": -2747.51416015625, "logps/rejected": -2804.925048828125, "loss": 1.6294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -202.91259765625, "rewards/margins": 7.965049743652344, "rewards/rejected": -210.87765502929688, "step": 42000 }, { "epoch": 2.43, "grad_norm": 0.0, "learning_rate": 0.00018998413251286814, "logits/chosen": -20.849720001220703, "logits/rejected": -22.773845672607422, "logps/chosen": -2533.508544921875, "logps/rejected": -2731.65625, "loss": 4.9564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.62054443359375, "rewards/margins": 25.983972549438477, "rewards/rejected": -215.6045379638672, "step": 42010 }, { "epoch": 2.43, "grad_norm": 26.959949493408203, "learning_rate": 0.00018979062657223578, "logits/chosen": -20.87216567993164, "logits/rejected": -21.50428581237793, "logps/chosen": -2739.99169921875, "logps/rejected": -2673.79833984375, "loss": 1.0089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.23037719726562, "rewards/margins": 4.916424751281738, "rewards/rejected": -180.14682006835938, "step": 42020 }, { "epoch": 2.43, "grad_norm": 0.0003289948799647391, "learning_rate": 0.0001895971206316034, "logits/chosen": -22.439599990844727, "logits/rejected": -25.470291137695312, "logps/chosen": -2698.677001953125, "logps/rejected": -2514.973876953125, "loss": 4.0816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -203.236572265625, "rewards/margins": 8.608331680297852, "rewards/rejected": -211.8448944091797, "step": 42030 }, { "epoch": 2.43, "grad_norm": 28.84722900390625, "learning_rate": 0.000189403614690971, "logits/chosen": -17.747791290283203, "logits/rejected": -18.470012664794922, "logps/chosen": -2935.71337890625, "logps/rejected": -3043.73681640625, "loss": 3.4337, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -185.58370971679688, "rewards/margins": 6.1946120262146, "rewards/rejected": -191.77835083007812, "step": 42040 }, { "epoch": 2.43, "grad_norm": 1.2699919693659467e-07, "learning_rate": 0.00018921010875033864, "logits/chosen": -17.34183120727539, "logits/rejected": -19.075366973876953, "logps/chosen": -2893.586181640625, "logps/rejected": -2908.810546875, "loss": 0.7149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -208.39859008789062, "rewards/margins": 9.741960525512695, "rewards/rejected": -218.14053344726562, "step": 42050 }, { "epoch": 2.43, "grad_norm": 97.3476791381836, "learning_rate": 0.00018901660280970625, "logits/chosen": -18.513267517089844, "logits/rejected": -19.479251861572266, "logps/chosen": -2962.87451171875, "logps/rejected": -3059.969970703125, "loss": 2.723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.3824691772461, "rewards/margins": 12.83619213104248, "rewards/rejected": -132.21865844726562, "step": 42060 }, { "epoch": 2.44, "grad_norm": 0.11374744772911072, "learning_rate": 0.0001888230968690739, "logits/chosen": -18.715713500976562, "logits/rejected": -20.164701461791992, "logps/chosen": -2838.224365234375, "logps/rejected": -2904.0732421875, "loss": 2.9595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.76382446289062, "rewards/margins": 6.568333625793457, "rewards/rejected": -224.3321533203125, "step": 42070 }, { "epoch": 2.44, "grad_norm": 0.17015331983566284, "learning_rate": 0.00018862959092844151, "logits/chosen": -16.24490737915039, "logits/rejected": -17.34419822692871, "logps/chosen": -3058.72900390625, "logps/rejected": -2767.43310546875, "loss": 1.8758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.61740112304688, "rewards/margins": 16.706466674804688, "rewards/rejected": -173.32388305664062, "step": 42080 }, { "epoch": 2.44, "grad_norm": 2.8648381444895676e-08, "learning_rate": 0.00018843608498780913, "logits/chosen": -20.378955841064453, "logits/rejected": -20.489492416381836, "logps/chosen": -3122.63037109375, "logps/rejected": -2972.19384765625, "loss": 4.7078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -237.9500732421875, "rewards/margins": 11.12535572052002, "rewards/rejected": -249.075439453125, "step": 42090 }, { "epoch": 2.44, "grad_norm": 87.0215835571289, "learning_rate": 0.00018824257904717675, "logits/chosen": -18.717458724975586, "logits/rejected": -19.062440872192383, "logps/chosen": -3281.858154296875, "logps/rejected": -3242.0771484375, "loss": 5.0693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.8057098388672, "rewards/margins": 3.3567168712615967, "rewards/rejected": -221.1624298095703, "step": 42100 }, { "epoch": 2.44, "grad_norm": 83.58832550048828, "learning_rate": 0.00018804907310654437, "logits/chosen": -21.157367706298828, "logits/rejected": -22.27511215209961, "logps/chosen": -2743.791015625, "logps/rejected": -2735.62353515625, "loss": 2.5051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -244.7637176513672, "rewards/margins": 6.037446022033691, "rewards/rejected": -250.8011474609375, "step": 42110 }, { "epoch": 2.44, "grad_norm": 0.19447751343250275, "learning_rate": 0.00018785556716591198, "logits/chosen": -14.613470077514648, "logits/rejected": -15.604341506958008, "logps/chosen": -3184.93115234375, "logps/rejected": -2666.73779296875, "loss": 1.5093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -104.2387924194336, "rewards/margins": 15.760539054870605, "rewards/rejected": -119.99934387207031, "step": 42120 }, { "epoch": 2.44, "grad_norm": 4.078400135040283, "learning_rate": 0.00018766206122527963, "logits/chosen": -17.640714645385742, "logits/rejected": -17.58744239807129, "logps/chosen": -3318.539794921875, "logps/rejected": -3017.19384765625, "loss": 2.3619, "rewards/accuracies": 0.5, "rewards/chosen": -182.27825927734375, "rewards/margins": 5.393714904785156, "rewards/rejected": -187.67196655273438, "step": 42130 }, { "epoch": 2.44, "grad_norm": 84.82939910888672, "learning_rate": 0.00018746855528464724, "logits/chosen": -18.733766555786133, "logits/rejected": -17.035472869873047, "logps/chosen": -3190.925048828125, "logps/rejected": -3068.095458984375, "loss": 5.6689, "rewards/accuracies": 0.5, "rewards/chosen": -176.10360717773438, "rewards/margins": 3.373300075531006, "rewards/rejected": -179.47691345214844, "step": 42140 }, { "epoch": 2.44, "grad_norm": 0.004530816804617643, "learning_rate": 0.00018727504934401486, "logits/chosen": -14.719825744628906, "logits/rejected": -15.15270709991455, "logps/chosen": -3136.08984375, "logps/rejected": -2860.81494140625, "loss": 2.5468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.71713256835938, "rewards/margins": 7.878035068511963, "rewards/rejected": -175.59518432617188, "step": 42150 }, { "epoch": 2.44, "grad_norm": 11.911471366882324, "learning_rate": 0.00018708154340338248, "logits/chosen": -19.685606002807617, "logits/rejected": -21.582799911499023, "logps/chosen": -2942.344482421875, "logps/rejected": -3010.069580078125, "loss": 2.0837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -203.41432189941406, "rewards/margins": 6.958959102630615, "rewards/rejected": -210.373291015625, "step": 42160 }, { "epoch": 2.44, "grad_norm": 0.005152073688805103, "learning_rate": 0.0001868880374627501, "logits/chosen": -18.534889221191406, "logits/rejected": -19.832340240478516, "logps/chosen": -3346.529296875, "logps/rejected": -3297.45458984375, "loss": 2.1107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -262.73724365234375, "rewards/margins": 17.553436279296875, "rewards/rejected": -280.2906799316406, "step": 42170 }, { "epoch": 2.44, "grad_norm": 7.297699539776659e-06, "learning_rate": 0.00018669453152211774, "logits/chosen": -18.603958129882812, "logits/rejected": -17.365575790405273, "logps/chosen": -2861.77099609375, "logps/rejected": -2744.35595703125, "loss": 11.9425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.79855346679688, "rewards/margins": 1.0491470098495483, "rewards/rejected": -133.84768676757812, "step": 42180 }, { "epoch": 2.44, "grad_norm": 0.00018355896463617682, "learning_rate": 0.00018650102558148536, "logits/chosen": -17.750089645385742, "logits/rejected": -18.443483352661133, "logps/chosen": -2693.223388671875, "logps/rejected": -2684.158203125, "loss": 8.5778, "rewards/accuracies": 0.5, "rewards/chosen": -178.1753387451172, "rewards/margins": -3.3350653648376465, "rewards/rejected": -174.84027099609375, "step": 42190 }, { "epoch": 2.44, "grad_norm": 5.332458030693488e-08, "learning_rate": 0.000186307519640853, "logits/chosen": -18.34819221496582, "logits/rejected": -20.180160522460938, "logps/chosen": -3182.65771484375, "logps/rejected": -2981.38037109375, "loss": 0.473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.07925415039062, "rewards/margins": 12.050837516784668, "rewards/rejected": -186.1300811767578, "step": 42200 }, { "epoch": 2.44, "grad_norm": 138.91030883789062, "learning_rate": 0.0001861140137002206, "logits/chosen": -15.779749870300293, "logits/rejected": -15.882789611816406, "logps/chosen": -2913.34521484375, "logps/rejected": -2465.5244140625, "loss": 6.4461, "rewards/accuracies": 0.5, "rewards/chosen": -177.59170532226562, "rewards/margins": -1.6807174682617188, "rewards/rejected": -175.91098022460938, "step": 42210 }, { "epoch": 2.44, "grad_norm": 0.001647271797992289, "learning_rate": 0.0001859205077595882, "logits/chosen": -14.269454956054688, "logits/rejected": -14.410931587219238, "logps/chosen": -3359.624267578125, "logps/rejected": -3149.24560546875, "loss": 0.9593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -111.09393310546875, "rewards/margins": 11.814931869506836, "rewards/rejected": -122.90885925292969, "step": 42220 }, { "epoch": 2.44, "grad_norm": 42.93711471557617, "learning_rate": 0.00018572700181895585, "logits/chosen": -16.912721633911133, "logits/rejected": -18.532093048095703, "logps/chosen": -2579.711181640625, "logps/rejected": -2640.26806640625, "loss": 1.3915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.73179626464844, "rewards/margins": 13.2648344039917, "rewards/rejected": -203.9966278076172, "step": 42230 }, { "epoch": 2.45, "grad_norm": 34.256710052490234, "learning_rate": 0.00018553349587832347, "logits/chosen": -25.011882781982422, "logits/rejected": -26.265239715576172, "logps/chosen": -2926.08642578125, "logps/rejected": -2724.526611328125, "loss": 17.3492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -265.41925048828125, "rewards/margins": -11.55322265625, "rewards/rejected": -253.86599731445312, "step": 42240 }, { "epoch": 2.45, "grad_norm": 7.634791687749498e-10, "learning_rate": 0.0001853399899376911, "logits/chosen": -16.76397705078125, "logits/rejected": -17.342239379882812, "logps/chosen": -3057.100830078125, "logps/rejected": -2901.66455078125, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -171.3552703857422, "rewards/margins": 14.256197929382324, "rewards/rejected": -185.61148071289062, "step": 42250 }, { "epoch": 2.45, "grad_norm": 314.77490234375, "learning_rate": 0.0001851464839970587, "logits/chosen": -18.309711456298828, "logits/rejected": -19.020410537719727, "logps/chosen": -2827.67431640625, "logps/rejected": -2886.19580078125, "loss": 3.789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -133.44708251953125, "rewards/margins": 9.727705001831055, "rewards/rejected": -143.17477416992188, "step": 42260 }, { "epoch": 2.45, "grad_norm": 2.284830791669672e-19, "learning_rate": 0.00018495297805642632, "logits/chosen": -15.947542190551758, "logits/rejected": -18.17328453063965, "logps/chosen": -3239.140625, "logps/rejected": -2704.5869140625, "loss": 4.4971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -139.68792724609375, "rewards/margins": 24.375707626342773, "rewards/rejected": -164.0636444091797, "step": 42270 }, { "epoch": 2.45, "grad_norm": 77.03668975830078, "learning_rate": 0.00018475947211579397, "logits/chosen": -17.84992027282715, "logits/rejected": -19.64361000061035, "logps/chosen": -3200.717041015625, "logps/rejected": -2863.697265625, "loss": 9.0314, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -176.5703125, "rewards/margins": -4.318160057067871, "rewards/rejected": -172.2521514892578, "step": 42280 }, { "epoch": 2.45, "grad_norm": 1.6254187357844785e-05, "learning_rate": 0.00018456596617516158, "logits/chosen": -20.193395614624023, "logits/rejected": -20.47111701965332, "logps/chosen": -2337.974365234375, "logps/rejected": -2422.197265625, "loss": 2.5086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -202.24038696289062, "rewards/margins": 12.141339302062988, "rewards/rejected": -214.38174438476562, "step": 42290 }, { "epoch": 2.45, "grad_norm": 1.7173429223819654e-17, "learning_rate": 0.0001843724602345292, "logits/chosen": -17.575849533081055, "logits/rejected": -19.769163131713867, "logps/chosen": -3237.64404296875, "logps/rejected": -3107.6064453125, "loss": 1.6776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.23196411132812, "rewards/margins": 20.71701431274414, "rewards/rejected": -166.94898986816406, "step": 42300 }, { "epoch": 2.45, "grad_norm": 0.0024006813764572144, "learning_rate": 0.00018417895429389684, "logits/chosen": -17.105037689208984, "logits/rejected": -18.24478530883789, "logps/chosen": -2745.897705078125, "logps/rejected": -2661.062255859375, "loss": 1.6516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -130.74331665039062, "rewards/margins": 2.2443900108337402, "rewards/rejected": -132.9877166748047, "step": 42310 }, { "epoch": 2.45, "grad_norm": 52.90739440917969, "learning_rate": 0.00018398544835326444, "logits/chosen": -17.011703491210938, "logits/rejected": -18.823450088500977, "logps/chosen": -3104.82666015625, "logps/rejected": -3055.530517578125, "loss": 2.7971, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.09048461914062, "rewards/margins": 7.042708396911621, "rewards/rejected": -196.13319396972656, "step": 42320 }, { "epoch": 2.45, "grad_norm": 88.79682159423828, "learning_rate": 0.00018379194241263208, "logits/chosen": -19.8996639251709, "logits/rejected": -18.569210052490234, "logps/chosen": -2711.142822265625, "logps/rejected": -2791.789794921875, "loss": 1.0318, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -236.56832885742188, "rewards/margins": 14.649627685546875, "rewards/rejected": -251.21798706054688, "step": 42330 }, { "epoch": 2.45, "grad_norm": 3.067662633247578e-14, "learning_rate": 0.0001835984364719997, "logits/chosen": -15.561088562011719, "logits/rejected": -16.89406967163086, "logps/chosen": -3244.46533203125, "logps/rejected": -2742.21533203125, "loss": 16.0953, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -139.9404296875, "rewards/margins": 3.3324432373046875, "rewards/rejected": -143.2728729248047, "step": 42340 }, { "epoch": 2.45, "grad_norm": 231.88558959960938, "learning_rate": 0.00018340493053136731, "logits/chosen": -17.4748592376709, "logits/rejected": -18.328266143798828, "logps/chosen": -2519.93994140625, "logps/rejected": -2539.045166015625, "loss": 8.5284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.34835815429688, "rewards/margins": 2.2618765830993652, "rewards/rejected": -165.6102294921875, "step": 42350 }, { "epoch": 2.45, "grad_norm": 115.41415405273438, "learning_rate": 0.00018321142459073496, "logits/chosen": -16.322702407836914, "logits/rejected": -17.445415496826172, "logps/chosen": -3186.628662109375, "logps/rejected": -3338.92431640625, "loss": 2.6411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -214.91531372070312, "rewards/margins": 8.955175399780273, "rewards/rejected": -223.8704833984375, "step": 42360 }, { "epoch": 2.45, "grad_norm": 73.55632781982422, "learning_rate": 0.00018301791865010255, "logits/chosen": -17.67812156677246, "logits/rejected": -19.32253646850586, "logps/chosen": -2757.972412109375, "logps/rejected": -2891.39111328125, "loss": 2.1835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -186.11842346191406, "rewards/margins": 10.910465240478516, "rewards/rejected": -197.02890014648438, "step": 42370 }, { "epoch": 2.45, "grad_norm": 48.55304718017578, "learning_rate": 0.00018282441270947017, "logits/chosen": -15.333503723144531, "logits/rejected": -15.896596908569336, "logps/chosen": -2967.469482421875, "logps/rejected": -2515.461181640625, "loss": 6.346, "rewards/accuracies": 0.5, "rewards/chosen": -184.16461181640625, "rewards/margins": 7.151447296142578, "rewards/rejected": -191.31605529785156, "step": 42380 }, { "epoch": 2.45, "grad_norm": 5.108112335205078, "learning_rate": 0.0001826309067688378, "logits/chosen": -17.306121826171875, "logits/rejected": -18.03057098388672, "logps/chosen": -2564.01123046875, "logps/rejected": -3029.426025390625, "loss": 5.4334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -145.9664764404297, "rewards/margins": 4.133579254150391, "rewards/rejected": -150.10006713867188, "step": 42390 }, { "epoch": 2.45, "grad_norm": 64.81767272949219, "learning_rate": 0.00018243740082820543, "logits/chosen": -17.6234073638916, "logits/rejected": -19.75461196899414, "logps/chosen": -2879.786376953125, "logps/rejected": -2678.927490234375, "loss": 14.5796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -210.16964721679688, "rewards/margins": -7.188348293304443, "rewards/rejected": -202.98130798339844, "step": 42400 }, { "epoch": 2.45, "grad_norm": 0.07730487734079361, "learning_rate": 0.00018224389488757307, "logits/chosen": -16.68627166748047, "logits/rejected": -18.831632614135742, "logps/chosen": -2753.14453125, "logps/rejected": -2830.600341796875, "loss": 0.0715, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -208.5504150390625, "rewards/margins": 20.17160415649414, "rewards/rejected": -228.72201538085938, "step": 42410 }, { "epoch": 2.46, "grad_norm": 28.151641845703125, "learning_rate": 0.0001820503889469407, "logits/chosen": -14.5228853225708, "logits/rejected": -14.848332405090332, "logps/chosen": -3462.90185546875, "logps/rejected": -3016.37060546875, "loss": 0.8251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -95.93962097167969, "rewards/margins": 12.244062423706055, "rewards/rejected": -108.1836929321289, "step": 42420 }, { "epoch": 2.46, "grad_norm": 0.036262836307287216, "learning_rate": 0.00018185688300630828, "logits/chosen": -19.414432525634766, "logits/rejected": -19.38848304748535, "logps/chosen": -2633.20361328125, "logps/rejected": -2737.323486328125, "loss": 1.231, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.686279296875, "rewards/margins": 12.16968059539795, "rewards/rejected": -210.85592651367188, "step": 42430 }, { "epoch": 2.46, "grad_norm": 1.6163933524993013e-09, "learning_rate": 0.00018166337706567592, "logits/chosen": -20.128971099853516, "logits/rejected": -20.76712989807129, "logps/chosen": -2995.85205078125, "logps/rejected": -2795.1328125, "loss": 13.2688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -228.52066040039062, "rewards/margins": -1.9817211627960205, "rewards/rejected": -226.5389404296875, "step": 42440 }, { "epoch": 2.46, "grad_norm": 1.2715109733107965e-05, "learning_rate": 0.00018146987112504354, "logits/chosen": -17.081464767456055, "logits/rejected": -17.712242126464844, "logps/chosen": -2736.43115234375, "logps/rejected": -2642.106201171875, "loss": 5.3799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.06674194335938, "rewards/margins": 5.325156211853027, "rewards/rejected": -171.3919219970703, "step": 42450 }, { "epoch": 2.46, "grad_norm": 7.243546651380939e-09, "learning_rate": 0.00018127636518441116, "logits/chosen": -17.26166343688965, "logits/rejected": -18.907625198364258, "logps/chosen": -3146.44287109375, "logps/rejected": -2865.15234375, "loss": 2.479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -125.41815185546875, "rewards/margins": 4.9105048179626465, "rewards/rejected": -130.3286590576172, "step": 42460 }, { "epoch": 2.46, "grad_norm": 0.00024769167066551745, "learning_rate": 0.0001810828592437788, "logits/chosen": -19.09114646911621, "logits/rejected": -20.651968002319336, "logps/chosen": -2894.73974609375, "logps/rejected": -2792.09326171875, "loss": 10.9744, "rewards/accuracies": 0.5, "rewards/chosen": -218.33340454101562, "rewards/margins": -5.976852893829346, "rewards/rejected": -212.35653686523438, "step": 42470 }, { "epoch": 2.46, "grad_norm": 0.0009255495388060808, "learning_rate": 0.0001808893533031464, "logits/chosen": -20.08916473388672, "logits/rejected": -22.757089614868164, "logps/chosen": -2741.488525390625, "logps/rejected": -2698.687255859375, "loss": 2.8273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -182.959716796875, "rewards/margins": 3.522325038909912, "rewards/rejected": -186.48202514648438, "step": 42480 }, { "epoch": 2.46, "grad_norm": 64.32713317871094, "learning_rate": 0.00018069584736251404, "logits/chosen": -17.90789794921875, "logits/rejected": -18.483013153076172, "logps/chosen": -2736.412841796875, "logps/rejected": -2596.332763671875, "loss": 1.9239, "rewards/accuracies": 0.5, "rewards/chosen": -171.80209350585938, "rewards/margins": 11.180971145629883, "rewards/rejected": -182.98306274414062, "step": 42490 }, { "epoch": 2.46, "grad_norm": 3.698122341688759e-08, "learning_rate": 0.00018050234142188165, "logits/chosen": -16.76340103149414, "logits/rejected": -17.189861297607422, "logps/chosen": -3109.08154296875, "logps/rejected": -2814.54248046875, "loss": 2.0004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.80638122558594, "rewards/margins": 12.479896545410156, "rewards/rejected": -139.28628540039062, "step": 42500 }, { "epoch": 2.46, "grad_norm": 0.013929558917880058, "learning_rate": 0.00018030883548124927, "logits/chosen": -19.38140106201172, "logits/rejected": -19.62091636657715, "logps/chosen": -3337.192138671875, "logps/rejected": -3353.1875, "loss": 3.4298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.47625732421875, "rewards/margins": 6.51584005355835, "rewards/rejected": -155.99209594726562, "step": 42510 }, { "epoch": 2.46, "grad_norm": 0.00018127659859601408, "learning_rate": 0.00018011532954061691, "logits/chosen": -16.20597267150879, "logits/rejected": -16.946815490722656, "logps/chosen": -3225.72509765625, "logps/rejected": -3133.68603515625, "loss": 4.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -116.6383056640625, "rewards/margins": 11.44217586517334, "rewards/rejected": -128.08047485351562, "step": 42520 }, { "epoch": 2.46, "grad_norm": 0.01178702898323536, "learning_rate": 0.00017992182359998453, "logits/chosen": -19.979001998901367, "logits/rejected": -21.297016143798828, "logps/chosen": -2950.31396484375, "logps/rejected": -2677.088623046875, "loss": 0.5032, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -188.4505615234375, "rewards/margins": 17.77643585205078, "rewards/rejected": -206.2270050048828, "step": 42530 }, { "epoch": 2.46, "grad_norm": 37.84653854370117, "learning_rate": 0.00017972831765935215, "logits/chosen": -23.678714752197266, "logits/rejected": -23.4332275390625, "logps/chosen": -2805.880859375, "logps/rejected": -2541.89990234375, "loss": 26.97, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -254.3143310546875, "rewards/margins": -23.41098403930664, "rewards/rejected": -230.90335083007812, "step": 42540 }, { "epoch": 2.46, "grad_norm": 0.01982017792761326, "learning_rate": 0.00017953481171871977, "logits/chosen": -17.849979400634766, "logits/rejected": -17.972185134887695, "logps/chosen": -2842.866943359375, "logps/rejected": -2959.62158203125, "loss": 3.1974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -168.71133422851562, "rewards/margins": 6.317445755004883, "rewards/rejected": -175.02879333496094, "step": 42550 }, { "epoch": 2.46, "grad_norm": 0.004658994264900684, "learning_rate": 0.00017934130577808738, "logits/chosen": -20.79867935180664, "logits/rejected": -23.4126033782959, "logps/chosen": -2880.6484375, "logps/rejected": -2878.067626953125, "loss": 3.7559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -253.9568634033203, "rewards/margins": 9.88415241241455, "rewards/rejected": -263.84100341796875, "step": 42560 }, { "epoch": 2.46, "grad_norm": 0.000257026229519397, "learning_rate": 0.00017914779983745503, "logits/chosen": -19.3836612701416, "logits/rejected": -22.004718780517578, "logps/chosen": -2876.9951171875, "logps/rejected": -3097.337646484375, "loss": 9.108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.6770477294922, "rewards/margins": 2.44206166267395, "rewards/rejected": -208.1190948486328, "step": 42570 }, { "epoch": 2.46, "grad_norm": 0.027810348197817802, "learning_rate": 0.00017895429389682264, "logits/chosen": -22.468061447143555, "logits/rejected": -22.357254028320312, "logps/chosen": -2315.91357421875, "logps/rejected": -2276.36767578125, "loss": 2.8663, "rewards/accuracies": 0.5, "rewards/chosen": -184.28817749023438, "rewards/margins": 5.957841396331787, "rewards/rejected": -190.2460174560547, "step": 42580 }, { "epoch": 2.47, "grad_norm": 0.06746724247932434, "learning_rate": 0.00017876078795619023, "logits/chosen": -20.512691497802734, "logits/rejected": -21.85544776916504, "logps/chosen": -2851.95361328125, "logps/rejected": -3174.703857421875, "loss": 3.9439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -207.2274169921875, "rewards/margins": 7.276156425476074, "rewards/rejected": -214.50357055664062, "step": 42590 }, { "epoch": 2.47, "grad_norm": 4.679600715637207, "learning_rate": 0.00017856728201555788, "logits/chosen": -16.976947784423828, "logits/rejected": -17.883459091186523, "logps/chosen": -3326.70068359375, "logps/rejected": -2986.1953125, "loss": 0.743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -125.62751770019531, "rewards/margins": 7.807530403137207, "rewards/rejected": -133.43502807617188, "step": 42600 }, { "epoch": 2.47, "grad_norm": 2.7490531806506624e-07, "learning_rate": 0.0001783737760749255, "logits/chosen": -19.17578125, "logits/rejected": -21.127485275268555, "logps/chosen": -2980.262451171875, "logps/rejected": -3069.410888671875, "loss": 0.102, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -227.62060546875, "rewards/margins": 15.779069900512695, "rewards/rejected": -243.3997039794922, "step": 42610 }, { "epoch": 2.47, "grad_norm": 7.549398723992278e-16, "learning_rate": 0.00017818027013429314, "logits/chosen": -15.292447090148926, "logits/rejected": -16.376155853271484, "logps/chosen": -2973.865478515625, "logps/rejected": -2993.921875, "loss": 1.5934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -146.26742553710938, "rewards/margins": 24.225662231445312, "rewards/rejected": -170.4930877685547, "step": 42620 }, { "epoch": 2.47, "grad_norm": 129.15884399414062, "learning_rate": 0.00017798676419366076, "logits/chosen": -14.360475540161133, "logits/rejected": -14.603830337524414, "logps/chosen": -3230.8173828125, "logps/rejected": -3150.953125, "loss": 3.0538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -114.43348693847656, "rewards/margins": 15.29393196105957, "rewards/rejected": -129.7274169921875, "step": 42630 }, { "epoch": 2.47, "grad_norm": 3.874082088470459, "learning_rate": 0.00017779325825302837, "logits/chosen": -16.608034133911133, "logits/rejected": -17.455678939819336, "logps/chosen": -3123.526123046875, "logps/rejected": -2956.18212890625, "loss": 16.5775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -140.015869140625, "rewards/margins": -10.010251998901367, "rewards/rejected": -130.005615234375, "step": 42640 }, { "epoch": 2.47, "grad_norm": 0.0, "learning_rate": 0.000177599752312396, "logits/chosen": -18.228496551513672, "logits/rejected": -21.467729568481445, "logps/chosen": -2959.277099609375, "logps/rejected": -2744.10986328125, "loss": 4.6839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.21498107910156, "rewards/margins": 26.989465713500977, "rewards/rejected": -183.20443725585938, "step": 42650 }, { "epoch": 2.47, "grad_norm": 1.6165793567779474e-05, "learning_rate": 0.0001774062463717636, "logits/chosen": -16.631267547607422, "logits/rejected": -18.860321044921875, "logps/chosen": -3464.93359375, "logps/rejected": -3293.57666015625, "loss": 3.1488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -131.83511352539062, "rewards/margins": 9.934102058410645, "rewards/rejected": -141.7692108154297, "step": 42660 }, { "epoch": 2.47, "grad_norm": 152.17225646972656, "learning_rate": 0.00017721274043113125, "logits/chosen": -15.38850212097168, "logits/rejected": -16.221044540405273, "logps/chosen": -3175.97705078125, "logps/rejected": -2856.752197265625, "loss": 9.2221, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.9784393310547, "rewards/margins": -3.006547451019287, "rewards/rejected": -163.97190856933594, "step": 42670 }, { "epoch": 2.47, "grad_norm": 41.71542739868164, "learning_rate": 0.00017701923449049887, "logits/chosen": -16.01058006286621, "logits/rejected": -16.21502113342285, "logps/chosen": -3450.638671875, "logps/rejected": -3055.5546875, "loss": 6.1499, "rewards/accuracies": 0.5, "rewards/chosen": -170.20729064941406, "rewards/margins": 7.279118537902832, "rewards/rejected": -177.4863739013672, "step": 42680 }, { "epoch": 2.47, "grad_norm": 3.381209899089299e-05, "learning_rate": 0.0001768257285498665, "logits/chosen": -17.43413734436035, "logits/rejected": -20.09465789794922, "logps/chosen": -3138.735107421875, "logps/rejected": -2748.184326171875, "loss": 1.539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.20550537109375, "rewards/margins": 12.69550895690918, "rewards/rejected": -174.9010009765625, "step": 42690 }, { "epoch": 2.47, "grad_norm": 242.25840759277344, "learning_rate": 0.0001766322226092341, "logits/chosen": -16.254323959350586, "logits/rejected": -16.994924545288086, "logps/chosen": -2711.524169921875, "logps/rejected": -3129.018798828125, "loss": 9.7919, "rewards/accuracies": 0.5, "rewards/chosen": -178.01336669921875, "rewards/margins": -1.0030285120010376, "rewards/rejected": -177.01034545898438, "step": 42700 }, { "epoch": 2.47, "grad_norm": 0.00022668363817501813, "learning_rate": 0.00017643871666860172, "logits/chosen": -17.176780700683594, "logits/rejected": -19.652141571044922, "logps/chosen": -2600.404541015625, "logps/rejected": -2828.53955078125, "loss": 2.4186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -152.7906494140625, "rewards/margins": 25.4027156829834, "rewards/rejected": -178.19334411621094, "step": 42710 }, { "epoch": 2.47, "grad_norm": 2.7051916229587204e-17, "learning_rate": 0.00017624521072796934, "logits/chosen": -16.24562644958496, "logits/rejected": -17.689067840576172, "logps/chosen": -3535.125732421875, "logps/rejected": -2711.549072265625, "loss": 1.0549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -94.15242004394531, "rewards/margins": 15.310842514038086, "rewards/rejected": -109.46327209472656, "step": 42720 }, { "epoch": 2.47, "grad_norm": 84.95547485351562, "learning_rate": 0.00017605170478733698, "logits/chosen": -19.303312301635742, "logits/rejected": -25.494243621826172, "logps/chosen": -3040.045166015625, "logps/rejected": -2701.80029296875, "loss": 5.6835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.18795776367188, "rewards/margins": 14.1705961227417, "rewards/rejected": -177.35855102539062, "step": 42730 }, { "epoch": 2.47, "grad_norm": 82.76956939697266, "learning_rate": 0.0001758581988467046, "logits/chosen": -15.719064712524414, "logits/rejected": -16.191152572631836, "logps/chosen": -2581.567138671875, "logps/rejected": -2622.697998046875, "loss": 12.7831, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -169.249267578125, "rewards/margins": -8.958415985107422, "rewards/rejected": -160.29086303710938, "step": 42740 }, { "epoch": 2.47, "grad_norm": 620.7531127929688, "learning_rate": 0.00017566469290607225, "logits/chosen": -18.782604217529297, "logits/rejected": -22.819717407226562, "logps/chosen": -3198.234375, "logps/rejected": -2803.94140625, "loss": 13.7007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -211.6453094482422, "rewards/margins": 0.9967783093452454, "rewards/rejected": -212.64212036132812, "step": 42750 }, { "epoch": 2.48, "grad_norm": 53.31612777709961, "learning_rate": 0.00017547118696543984, "logits/chosen": -21.26735496520996, "logits/rejected": -23.776939392089844, "logps/chosen": -2932.2236328125, "logps/rejected": -2902.690185546875, "loss": 11.6815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -227.4684295654297, "rewards/margins": 5.465161323547363, "rewards/rejected": -232.93359375, "step": 42760 }, { "epoch": 2.48, "grad_norm": 121.16119384765625, "learning_rate": 0.00017527768102480745, "logits/chosen": -15.287147521972656, "logits/rejected": -17.21701431274414, "logps/chosen": -3254.52197265625, "logps/rejected": -3213.316162109375, "loss": 5.3827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.47320556640625, "rewards/margins": 11.063709259033203, "rewards/rejected": -188.53692626953125, "step": 42770 }, { "epoch": 2.48, "grad_norm": 6.290286000409928e-12, "learning_rate": 0.0001750841750841751, "logits/chosen": -16.283649444580078, "logits/rejected": -17.545520782470703, "logps/chosen": -3111.598388671875, "logps/rejected": -2877.09228515625, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -169.49093627929688, "rewards/margins": 15.852561950683594, "rewards/rejected": -185.34349060058594, "step": 42780 }, { "epoch": 2.48, "grad_norm": 1.3544426864797732e-11, "learning_rate": 0.00017489066914354271, "logits/chosen": -20.560373306274414, "logits/rejected": -21.813167572021484, "logps/chosen": -3003.72021484375, "logps/rejected": -2981.45654296875, "loss": 4.582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -265.6902770996094, "rewards/margins": 5.110093593597412, "rewards/rejected": -270.80035400390625, "step": 42790 }, { "epoch": 2.48, "grad_norm": 89.09568786621094, "learning_rate": 0.00017469716320291033, "logits/chosen": -16.897377014160156, "logits/rejected": -17.011066436767578, "logps/chosen": -2737.92041015625, "logps/rejected": -2740.04931640625, "loss": 5.1645, "rewards/accuracies": 0.5, "rewards/chosen": -235.3613739013672, "rewards/margins": 2.6025681495666504, "rewards/rejected": -237.9639434814453, "step": 42800 }, { "epoch": 2.48, "grad_norm": 57.86320877075195, "learning_rate": 0.00017450365726227795, "logits/chosen": -21.308494567871094, "logits/rejected": -20.99524688720703, "logps/chosen": -2981.324951171875, "logps/rejected": -3124.44580078125, "loss": 3.1392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -197.1134490966797, "rewards/margins": 11.936044692993164, "rewards/rejected": -209.0495147705078, "step": 42810 }, { "epoch": 2.48, "grad_norm": 0.0004901019856333733, "learning_rate": 0.00017431015132164557, "logits/chosen": -17.721214294433594, "logits/rejected": -18.24996566772461, "logps/chosen": -3024.849365234375, "logps/rejected": -3038.755859375, "loss": 0.537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -232.25308227539062, "rewards/margins": 11.533853530883789, "rewards/rejected": -243.7869415283203, "step": 42820 }, { "epoch": 2.48, "grad_norm": 6.067275570842412e-10, "learning_rate": 0.0001741166453810132, "logits/chosen": -15.080232620239258, "logits/rejected": -16.379608154296875, "logps/chosen": -3077.583251953125, "logps/rejected": -3249.959716796875, "loss": 1.883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -170.09841918945312, "rewards/margins": 16.167856216430664, "rewards/rejected": -186.26626586914062, "step": 42830 }, { "epoch": 2.48, "grad_norm": 3.633807699188196e-10, "learning_rate": 0.00017392313944038083, "logits/chosen": -16.478193283081055, "logits/rejected": -17.2816162109375, "logps/chosen": -3050.958251953125, "logps/rejected": -3049.802001953125, "loss": 1.7313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.50283813476562, "rewards/margins": 8.098226547241211, "rewards/rejected": -232.60104370117188, "step": 42840 }, { "epoch": 2.48, "grad_norm": 9.602271165931597e-05, "learning_rate": 0.00017372963349974844, "logits/chosen": -16.31873321533203, "logits/rejected": -16.571348190307617, "logps/chosen": -2944.58154296875, "logps/rejected": -2926.65625, "loss": 5.4187, "rewards/accuracies": 0.5, "rewards/chosen": -135.71511840820312, "rewards/margins": -0.13457803428173065, "rewards/rejected": -135.58055114746094, "step": 42850 }, { "epoch": 2.48, "grad_norm": 0.00023957114899531007, "learning_rate": 0.0001735361275591161, "logits/chosen": -16.386287689208984, "logits/rejected": -17.700664520263672, "logps/chosen": -3152.76708984375, "logps/rejected": -2922.482421875, "loss": 2.2086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.6710662841797, "rewards/margins": 13.223367691040039, "rewards/rejected": -183.89442443847656, "step": 42860 }, { "epoch": 2.48, "grad_norm": 0.007598118390887976, "learning_rate": 0.00017334262161848368, "logits/chosen": -16.387561798095703, "logits/rejected": -16.93915367126465, "logps/chosen": -3014.375732421875, "logps/rejected": -3009.637939453125, "loss": 4.8765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -193.655029296875, "rewards/margins": 9.729637145996094, "rewards/rejected": -203.38467407226562, "step": 42870 }, { "epoch": 2.48, "grad_norm": 0.04384815692901611, "learning_rate": 0.00017314911567785132, "logits/chosen": -14.32014274597168, "logits/rejected": -15.424084663391113, "logps/chosen": -3161.4677734375, "logps/rejected": -2724.662109375, "loss": 2.2876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.6255340576172, "rewards/margins": 18.950008392333984, "rewards/rejected": -181.5755157470703, "step": 42880 }, { "epoch": 2.48, "grad_norm": 2.8403280794009333e-07, "learning_rate": 0.00017295560973721894, "logits/chosen": -17.460580825805664, "logits/rejected": -19.59352684020996, "logps/chosen": -3175.640869140625, "logps/rejected": -2804.53857421875, "loss": 14.9978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.52822875976562, "rewards/margins": -5.503540992736816, "rewards/rejected": -204.02467346191406, "step": 42890 }, { "epoch": 2.48, "grad_norm": 1.0801479710664807e-09, "learning_rate": 0.00017276210379658656, "logits/chosen": -18.816904067993164, "logits/rejected": -19.421245574951172, "logps/chosen": -3187.015625, "logps/rejected": -3251.09521484375, "loss": 6.3525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -276.36309814453125, "rewards/margins": 8.105070114135742, "rewards/rejected": -284.4681701660156, "step": 42900 }, { "epoch": 2.48, "grad_norm": 1.900050605740944e-08, "learning_rate": 0.0001725685978559542, "logits/chosen": -18.642091751098633, "logits/rejected": -21.153329849243164, "logps/chosen": -2887.502685546875, "logps/rejected": -2803.93896484375, "loss": 7.1231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -232.99252319335938, "rewards/margins": 5.918937683105469, "rewards/rejected": -238.9114532470703, "step": 42910 }, { "epoch": 2.48, "grad_norm": 65.91297912597656, "learning_rate": 0.0001723750919153218, "logits/chosen": -18.24953269958496, "logits/rejected": -19.436851501464844, "logps/chosen": -2685.593017578125, "logps/rejected": -2696.3896484375, "loss": 3.6907, "rewards/accuracies": 0.5, "rewards/chosen": -205.92800903320312, "rewards/margins": 1.7182515859603882, "rewards/rejected": -207.64627075195312, "step": 42920 }, { "epoch": 2.48, "grad_norm": 6.478127478892759e-10, "learning_rate": 0.0001721815859746894, "logits/chosen": -16.769329071044922, "logits/rejected": -16.80379867553711, "logps/chosen": -3092.629150390625, "logps/rejected": -3207.48828125, "loss": 7.4251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -107.2898941040039, "rewards/margins": 1.1616466045379639, "rewards/rejected": -108.45152282714844, "step": 42930 }, { "epoch": 2.49, "grad_norm": 0.0006753662601113319, "learning_rate": 0.00017198808003405705, "logits/chosen": -16.136289596557617, "logits/rejected": -15.866386413574219, "logps/chosen": -3175.37060546875, "logps/rejected": -2357.663330078125, "loss": 0.452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -100.17562103271484, "rewards/margins": 11.6159029006958, "rewards/rejected": -111.79151916503906, "step": 42940 }, { "epoch": 2.49, "grad_norm": 151.38072204589844, "learning_rate": 0.00017179457409342467, "logits/chosen": -20.153608322143555, "logits/rejected": -22.158058166503906, "logps/chosen": -3119.74609375, "logps/rejected": -3179.28076171875, "loss": 1.8302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -239.92428588867188, "rewards/margins": 5.784821510314941, "rewards/rejected": -245.70913696289062, "step": 42950 }, { "epoch": 2.49, "grad_norm": 5.142383575439453, "learning_rate": 0.00017160106815279231, "logits/chosen": -15.06959342956543, "logits/rejected": -15.397555351257324, "logps/chosen": -3196.458251953125, "logps/rejected": -3291.89306640625, "loss": 2.0163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.9069061279297, "rewards/margins": 11.415445327758789, "rewards/rejected": -146.32235717773438, "step": 42960 }, { "epoch": 2.49, "grad_norm": 0.028821397572755814, "learning_rate": 0.0001714075622121599, "logits/chosen": -15.624265670776367, "logits/rejected": -16.686275482177734, "logps/chosen": -2902.0771484375, "logps/rejected": -2509.868896484375, "loss": 3.145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.26589965820312, "rewards/margins": 1.8759771585464478, "rewards/rejected": -183.14187622070312, "step": 42970 }, { "epoch": 2.49, "grad_norm": 0.0, "learning_rate": 0.00017121405627152752, "logits/chosen": -15.567631721496582, "logits/rejected": -15.752721786499023, "logps/chosen": -3187.41064453125, "logps/rejected": -2962.30859375, "loss": 3.012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.04684448242188, "rewards/margins": 16.744178771972656, "rewards/rejected": -157.791015625, "step": 42980 }, { "epoch": 2.49, "grad_norm": 1.499059765208699e-09, "learning_rate": 0.00017102055033089517, "logits/chosen": -20.43406105041504, "logits/rejected": -21.746179580688477, "logps/chosen": -2929.356201171875, "logps/rejected": -2990.66357421875, "loss": 1.4991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -224.97607421875, "rewards/margins": 21.134342193603516, "rewards/rejected": -246.1104278564453, "step": 42990 }, { "epoch": 2.49, "grad_norm": 0.01471120584756136, "learning_rate": 0.00017082704439026278, "logits/chosen": -19.01131248474121, "logits/rejected": -18.33707046508789, "logps/chosen": -2884.89306640625, "logps/rejected": -2959.879638671875, "loss": 9.9838, "rewards/accuracies": 0.5, "rewards/chosen": -169.40768432617188, "rewards/margins": -4.867661476135254, "rewards/rejected": -164.54000854492188, "step": 43000 }, { "epoch": 2.49, "grad_norm": 4.5812761584329564e-08, "learning_rate": 0.00017063353844963043, "logits/chosen": -15.204485893249512, "logits/rejected": -16.35725975036621, "logps/chosen": -3168.869384765625, "logps/rejected": -3470.45458984375, "loss": 6.2995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -142.19049072265625, "rewards/margins": 2.6683993339538574, "rewards/rejected": -144.85890197753906, "step": 43010 }, { "epoch": 2.49, "grad_norm": 0.0038924990221858025, "learning_rate": 0.00017044003250899804, "logits/chosen": -18.60020637512207, "logits/rejected": -19.235082626342773, "logps/chosen": -3126.958984375, "logps/rejected": -3073.45361328125, "loss": 3.7763, "rewards/accuracies": 0.5, "rewards/chosen": -206.498046875, "rewards/margins": 4.326127052307129, "rewards/rejected": -210.82418823242188, "step": 43020 }, { "epoch": 2.49, "grad_norm": 2.117584705352783, "learning_rate": 0.00017024652656836563, "logits/chosen": -16.83297348022461, "logits/rejected": -18.43716812133789, "logps/chosen": -3125.2138671875, "logps/rejected": -2850.25390625, "loss": 2.7966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -202.5199432373047, "rewards/margins": 12.322223663330078, "rewards/rejected": -214.8421630859375, "step": 43030 }, { "epoch": 2.49, "grad_norm": 1.2134273188636334e-15, "learning_rate": 0.00017005302062773328, "logits/chosen": -18.232803344726562, "logits/rejected": -17.939809799194336, "logps/chosen": -2658.770263671875, "logps/rejected": -2772.41796875, "loss": 3.9796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -231.56704711914062, "rewards/margins": 6.095056056976318, "rewards/rejected": -237.662109375, "step": 43040 }, { "epoch": 2.49, "grad_norm": 0.00033037876710295677, "learning_rate": 0.0001698595146871009, "logits/chosen": -15.010188102722168, "logits/rejected": -17.492324829101562, "logps/chosen": -3240.267578125, "logps/rejected": -2966.2216796875, "loss": 0.5139, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -171.9571990966797, "rewards/margins": 14.929403305053711, "rewards/rejected": -186.88662719726562, "step": 43050 }, { "epoch": 2.49, "grad_norm": 126.13904571533203, "learning_rate": 0.0001696660087464685, "logits/chosen": -16.021453857421875, "logits/rejected": -16.27444076538086, "logps/chosen": -2759.15283203125, "logps/rejected": -2690.77001953125, "loss": 4.6514, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -150.05291748046875, "rewards/margins": -1.8966070413589478, "rewards/rejected": -148.15631103515625, "step": 43060 }, { "epoch": 2.49, "grad_norm": 56.1917610168457, "learning_rate": 0.00016947250280583616, "logits/chosen": -15.030923843383789, "logits/rejected": -15.397926330566406, "logps/chosen": -3348.25146484375, "logps/rejected": -3406.5703125, "loss": 2.4495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -190.3219757080078, "rewards/margins": 6.713732719421387, "rewards/rejected": -197.0356903076172, "step": 43070 }, { "epoch": 2.49, "grad_norm": 0.0002685684885364026, "learning_rate": 0.00016927899686520375, "logits/chosen": -18.27498435974121, "logits/rejected": -18.419790267944336, "logps/chosen": -3168.148193359375, "logps/rejected": -3169.394287109375, "loss": 2.505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -208.64804077148438, "rewards/margins": 3.3423011302948, "rewards/rejected": -211.99032592773438, "step": 43080 }, { "epoch": 2.49, "grad_norm": 2.09224009513855, "learning_rate": 0.0001690854909245714, "logits/chosen": -16.894926071166992, "logits/rejected": -16.531423568725586, "logps/chosen": -3078.9521484375, "logps/rejected": -2776.46826171875, "loss": 13.6042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.33255004882812, "rewards/margins": -2.489121675491333, "rewards/rejected": -199.84341430664062, "step": 43090 }, { "epoch": 2.49, "grad_norm": 46.92225646972656, "learning_rate": 0.000168891984983939, "logits/chosen": -18.00339126586914, "logits/rejected": -17.930803298950195, "logps/chosen": -3120.7470703125, "logps/rejected": -3093.929931640625, "loss": 5.9996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -177.7881317138672, "rewards/margins": -0.99370938539505, "rewards/rejected": -176.79441833496094, "step": 43100 }, { "epoch": 2.5, "grad_norm": 1.9681429862976074, "learning_rate": 0.00016869847904330663, "logits/chosen": -17.581418991088867, "logits/rejected": -19.85361671447754, "logps/chosen": -3011.154541015625, "logps/rejected": -3078.083984375, "loss": 2.6153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -195.5808868408203, "rewards/margins": 7.033902645111084, "rewards/rejected": -202.6147918701172, "step": 43110 }, { "epoch": 2.5, "grad_norm": 8.154598236083984, "learning_rate": 0.00016850497310267427, "logits/chosen": -15.824116706848145, "logits/rejected": -15.99260425567627, "logps/chosen": -2997.591064453125, "logps/rejected": -2660.414794921875, "loss": 2.6852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.21490478515625, "rewards/margins": 2.1927542686462402, "rewards/rejected": -196.40765380859375, "step": 43120 }, { "epoch": 2.5, "grad_norm": 155.9016876220703, "learning_rate": 0.0001683114671620419, "logits/chosen": -14.133196830749512, "logits/rejected": -14.703938484191895, "logps/chosen": -3233.449462890625, "logps/rejected": -2967.154541015625, "loss": 4.9239, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -117.70343017578125, "rewards/margins": 2.6459338665008545, "rewards/rejected": -120.349365234375, "step": 43130 }, { "epoch": 2.5, "grad_norm": 3.687032013033331e-09, "learning_rate": 0.00016811796122140948, "logits/chosen": -16.860750198364258, "logits/rejected": -20.182872772216797, "logps/chosen": -3058.55517578125, "logps/rejected": -2866.615234375, "loss": 7.134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.68478393554688, "rewards/margins": -0.04702568054199219, "rewards/rejected": -166.6377716064453, "step": 43140 }, { "epoch": 2.5, "grad_norm": 0.36275243759155273, "learning_rate": 0.00016792445528077712, "logits/chosen": -17.08779525756836, "logits/rejected": -15.396410942077637, "logps/chosen": -2904.349365234375, "logps/rejected": -2627.531005859375, "loss": 1.155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -124.89735412597656, "rewards/margins": 14.166536331176758, "rewards/rejected": -139.06390380859375, "step": 43150 }, { "epoch": 2.5, "grad_norm": 6.160214979900047e-05, "learning_rate": 0.00016773094934014474, "logits/chosen": -18.388710021972656, "logits/rejected": -19.914121627807617, "logps/chosen": -3127.1572265625, "logps/rejected": -2912.718017578125, "loss": 1.6829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.91018676757812, "rewards/margins": 7.856850624084473, "rewards/rejected": -180.7670135498047, "step": 43160 }, { "epoch": 2.5, "grad_norm": 2.278974875480344e-08, "learning_rate": 0.00016753744339951238, "logits/chosen": -15.938291549682617, "logits/rejected": -17.661264419555664, "logps/chosen": -2924.1796875, "logps/rejected": -2881.143798828125, "loss": 3.4967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.0268096923828, "rewards/margins": 7.220580101013184, "rewards/rejected": -145.24737548828125, "step": 43170 }, { "epoch": 2.5, "grad_norm": 2.492189884185791, "learning_rate": 0.00016734393745888, "logits/chosen": -18.21421241760254, "logits/rejected": -18.410261154174805, "logps/chosen": -2800.789794921875, "logps/rejected": -2575.4208984375, "loss": 4.5658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.08346557617188, "rewards/margins": 7.965911865234375, "rewards/rejected": -159.04940795898438, "step": 43180 }, { "epoch": 2.5, "grad_norm": 0.0006050682277418673, "learning_rate": 0.0001671504315182476, "logits/chosen": -14.720995903015137, "logits/rejected": -15.191741943359375, "logps/chosen": -3229.008056640625, "logps/rejected": -3135.5517578125, "loss": 12.659, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -174.48153686523438, "rewards/margins": 2.0526161193847656, "rewards/rejected": -176.53414916992188, "step": 43190 }, { "epoch": 2.5, "grad_norm": 0.00024696579203009605, "learning_rate": 0.00016695692557761524, "logits/chosen": -17.199974060058594, "logits/rejected": -17.19051742553711, "logps/chosen": -3356.92578125, "logps/rejected": -3040.968017578125, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": -185.81814575195312, "rewards/margins": 14.155309677124023, "rewards/rejected": -199.97344970703125, "step": 43200 }, { "epoch": 2.5, "grad_norm": 0.00011287787492619827, "learning_rate": 0.00016676341963698285, "logits/chosen": -16.78044891357422, "logits/rejected": -17.721988677978516, "logps/chosen": -2944.43896484375, "logps/rejected": -3003.279541015625, "loss": 5.9611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -127.7202377319336, "rewards/margins": 5.860079288482666, "rewards/rejected": -133.580322265625, "step": 43210 }, { "epoch": 2.5, "grad_norm": 1.2589363841076348e-17, "learning_rate": 0.0001665699136963505, "logits/chosen": -20.148765563964844, "logits/rejected": -21.2442684173584, "logps/chosen": -2711.21142578125, "logps/rejected": -2906.830810546875, "loss": 0.2975, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -205.792724609375, "rewards/margins": 33.28642654418945, "rewards/rejected": -239.0791473388672, "step": 43220 }, { "epoch": 2.5, "grad_norm": 0.007638377137482166, "learning_rate": 0.00016637640775571811, "logits/chosen": -16.726037979125977, "logits/rejected": -17.22429847717285, "logps/chosen": -3020.7568359375, "logps/rejected": -2838.8828125, "loss": 2.3691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.89724731445312, "rewards/margins": 7.653894901275635, "rewards/rejected": -193.5511474609375, "step": 43230 }, { "epoch": 2.5, "grad_norm": 0.00277704163454473, "learning_rate": 0.00016618290181508573, "logits/chosen": -19.93037986755371, "logits/rejected": -22.534793853759766, "logps/chosen": -2695.145263671875, "logps/rejected": -2776.364013671875, "loss": 6.9977, "rewards/accuracies": 0.5, "rewards/chosen": -202.67828369140625, "rewards/margins": 12.34733772277832, "rewards/rejected": -215.02560424804688, "step": 43240 }, { "epoch": 2.5, "grad_norm": 45.22943115234375, "learning_rate": 0.00016598939587445335, "logits/chosen": -17.489978790283203, "logits/rejected": -17.877262115478516, "logps/chosen": -2779.83349609375, "logps/rejected": -2853.37109375, "loss": 0.5898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.94349670410156, "rewards/margins": 13.476556777954102, "rewards/rejected": -188.4200439453125, "step": 43250 }, { "epoch": 2.5, "grad_norm": 7.602705665021858e-08, "learning_rate": 0.00016579588993382097, "logits/chosen": -16.494415283203125, "logits/rejected": -19.03366470336914, "logps/chosen": -3275.180908203125, "logps/rejected": -3337.547607421875, "loss": 4.4619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.58450317382812, "rewards/margins": 10.382867813110352, "rewards/rejected": -224.9673614501953, "step": 43260 }, { "epoch": 2.5, "grad_norm": 0.0007927108090370893, "learning_rate": 0.00016560238399318858, "logits/chosen": -17.39481544494629, "logits/rejected": -17.694894790649414, "logps/chosen": -2615.114501953125, "logps/rejected": -2230.38720703125, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": -84.47466278076172, "rewards/margins": 13.54780387878418, "rewards/rejected": -98.02247619628906, "step": 43270 }, { "epoch": 2.51, "grad_norm": 0.0013076909817755222, "learning_rate": 0.00016540887805255623, "logits/chosen": -16.46657943725586, "logits/rejected": -16.61802864074707, "logps/chosen": -2929.84033203125, "logps/rejected": -2832.80322265625, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -115.2376480102539, "rewards/margins": 28.4069881439209, "rewards/rejected": -143.64463806152344, "step": 43280 }, { "epoch": 2.51, "grad_norm": 0.4813029170036316, "learning_rate": 0.00016521537211192384, "logits/chosen": -20.972270965576172, "logits/rejected": -23.52747917175293, "logps/chosen": -2972.846923828125, "logps/rejected": -3005.40478515625, "loss": 2.3732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -264.25506591796875, "rewards/margins": 7.996177673339844, "rewards/rejected": -272.2512512207031, "step": 43290 }, { "epoch": 2.51, "grad_norm": 0.41722026467323303, "learning_rate": 0.00016502186617129146, "logits/chosen": -18.17368507385254, "logits/rejected": -17.49001693725586, "logps/chosen": -3046.0869140625, "logps/rejected": -3196.38818359375, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -229.3745880126953, "rewards/margins": 14.893056869506836, "rewards/rejected": -244.2676544189453, "step": 43300 }, { "epoch": 2.51, "grad_norm": 2.2232730388641357, "learning_rate": 0.00016482836023065908, "logits/chosen": -15.528329849243164, "logits/rejected": -16.872861862182617, "logps/chosen": -3301.186767578125, "logps/rejected": -2897.565673828125, "loss": 3.7025, "rewards/accuracies": 0.5, "rewards/chosen": -135.04019165039062, "rewards/margins": 2.421006441116333, "rewards/rejected": -137.461181640625, "step": 43310 }, { "epoch": 2.51, "grad_norm": 0.0005600790027529001, "learning_rate": 0.0001646348542900267, "logits/chosen": -15.720563888549805, "logits/rejected": -15.632914543151855, "logps/chosen": -2771.428466796875, "logps/rejected": -2488.99755859375, "loss": 6.5299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -132.1074981689453, "rewards/margins": 1.818433403968811, "rewards/rejected": -133.92593383789062, "step": 43320 }, { "epoch": 2.51, "grad_norm": 0.2565760016441345, "learning_rate": 0.00016444134834939434, "logits/chosen": -19.011333465576172, "logits/rejected": -19.595691680908203, "logps/chosen": -2691.840087890625, "logps/rejected": -2835.573974609375, "loss": 4.5504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -178.26382446289062, "rewards/margins": 4.2604475021362305, "rewards/rejected": -182.52427673339844, "step": 43330 }, { "epoch": 2.51, "grad_norm": 7.292068004608154, "learning_rate": 0.00016424784240876196, "logits/chosen": -19.504709243774414, "logits/rejected": -19.49819564819336, "logps/chosen": -2795.160888671875, "logps/rejected": -2818.897705078125, "loss": 1.9974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -245.42343139648438, "rewards/margins": 7.70089864730835, "rewards/rejected": -253.1243438720703, "step": 43340 }, { "epoch": 2.51, "grad_norm": 5.2567397410916363e-17, "learning_rate": 0.00016405433646812957, "logits/chosen": -16.759716033935547, "logits/rejected": -17.637481689453125, "logps/chosen": -2813.205078125, "logps/rejected": -2756.271240234375, "loss": 9.261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.89431762695312, "rewards/margins": 2.244194507598877, "rewards/rejected": -207.1385040283203, "step": 43350 }, { "epoch": 2.51, "grad_norm": 35.22857666015625, "learning_rate": 0.0001638608305274972, "logits/chosen": -17.717187881469727, "logits/rejected": -18.034915924072266, "logps/chosen": -3155.4794921875, "logps/rejected": -2920.45849609375, "loss": 1.6798, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.7581329345703, "rewards/margins": 15.797945022583008, "rewards/rejected": -178.55606079101562, "step": 43360 }, { "epoch": 2.51, "grad_norm": 55.728389739990234, "learning_rate": 0.0001636673245868648, "logits/chosen": -16.598011016845703, "logits/rejected": -17.765655517578125, "logps/chosen": -3082.62939453125, "logps/rejected": -2715.25537109375, "loss": 1.8415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.01319885253906, "rewards/margins": 10.17562484741211, "rewards/rejected": -167.18882751464844, "step": 43370 }, { "epoch": 2.51, "grad_norm": 0.030054714530706406, "learning_rate": 0.00016347381864623245, "logits/chosen": -18.137544631958008, "logits/rejected": -17.464069366455078, "logps/chosen": -3299.748779296875, "logps/rejected": -3121.062255859375, "loss": 0.9098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -185.6743927001953, "rewards/margins": 21.124982833862305, "rewards/rejected": -206.7993621826172, "step": 43380 }, { "epoch": 2.51, "grad_norm": 0.0013992105377838016, "learning_rate": 0.00016328031270560007, "logits/chosen": -19.029094696044922, "logits/rejected": -19.1942138671875, "logps/chosen": -2778.0244140625, "logps/rejected": -2877.805908203125, "loss": 2.1285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -211.1975860595703, "rewards/margins": 7.7813005447387695, "rewards/rejected": -218.97891235351562, "step": 43390 }, { "epoch": 2.51, "grad_norm": 6.293430805206299, "learning_rate": 0.0001630868067649677, "logits/chosen": -18.134761810302734, "logits/rejected": -20.46139144897461, "logps/chosen": -3097.716796875, "logps/rejected": -3026.27734375, "loss": 5.0549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -171.26678466796875, "rewards/margins": 4.3829240798950195, "rewards/rejected": -175.6497039794922, "step": 43400 }, { "epoch": 2.51, "grad_norm": 68.74604797363281, "learning_rate": 0.0001628933008243353, "logits/chosen": -18.26889419555664, "logits/rejected": -18.752216339111328, "logps/chosen": -2683.037109375, "logps/rejected": -2921.05224609375, "loss": 6.758, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -219.02767944335938, "rewards/margins": 2.2277183532714844, "rewards/rejected": -221.25540161132812, "step": 43410 }, { "epoch": 2.51, "grad_norm": 157.8799285888672, "learning_rate": 0.00016269979488370292, "logits/chosen": -15.573946952819824, "logits/rejected": -15.980157852172852, "logps/chosen": -2615.831787109375, "logps/rejected": -2726.64013671875, "loss": 4.5483, "rewards/accuracies": 0.5, "rewards/chosen": -105.96065521240234, "rewards/margins": 0.9190391302108765, "rewards/rejected": -106.87969970703125, "step": 43420 }, { "epoch": 2.51, "grad_norm": 0.491695374250412, "learning_rate": 0.00016250628894307057, "logits/chosen": -17.388553619384766, "logits/rejected": -17.78142738342285, "logps/chosen": -2651.89111328125, "logps/rejected": -2370.194091796875, "loss": 5.5118, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -153.4952850341797, "rewards/margins": 4.315972328186035, "rewards/rejected": -157.81124877929688, "step": 43430 }, { "epoch": 2.51, "grad_norm": 7.58230235078372e-05, "learning_rate": 0.00016231278300243818, "logits/chosen": -17.56061553955078, "logits/rejected": -17.97182273864746, "logps/chosen": -2735.54931640625, "logps/rejected": -2630.6767578125, "loss": 1.175, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -211.6669158935547, "rewards/margins": 10.165852546691895, "rewards/rejected": -221.83279418945312, "step": 43440 }, { "epoch": 2.52, "grad_norm": 1.554634877720673e-06, "learning_rate": 0.0001621192770618058, "logits/chosen": -16.428783416748047, "logits/rejected": -16.526687622070312, "logps/chosen": -3292.038330078125, "logps/rejected": -3039.103515625, "loss": 2.2161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.14903259277344, "rewards/margins": 9.013218879699707, "rewards/rejected": -207.1622772216797, "step": 43450 }, { "epoch": 2.52, "grad_norm": 0.420359343290329, "learning_rate": 0.00016192577112117344, "logits/chosen": -16.347272872924805, "logits/rejected": -16.569976806640625, "logps/chosen": -2512.39990234375, "logps/rejected": -2526.197021484375, "loss": 2.0896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -172.82464599609375, "rewards/margins": 4.884628772735596, "rewards/rejected": -177.7092742919922, "step": 43460 }, { "epoch": 2.52, "grad_norm": 0.24368366599082947, "learning_rate": 0.00016173226518054103, "logits/chosen": -18.566652297973633, "logits/rejected": -18.825998306274414, "logps/chosen": -2943.176513671875, "logps/rejected": -2873.81884765625, "loss": 4.0093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -219.6911163330078, "rewards/margins": 2.4355053901672363, "rewards/rejected": -222.1266326904297, "step": 43470 }, { "epoch": 2.52, "grad_norm": 7.850150723243132e-05, "learning_rate": 0.00016153875923990865, "logits/chosen": -16.272327423095703, "logits/rejected": -16.98483657836914, "logps/chosen": -2559.27685546875, "logps/rejected": -2503.22900390625, "loss": 0.8984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -123.46604919433594, "rewards/margins": 10.324612617492676, "rewards/rejected": -133.79066467285156, "step": 43480 }, { "epoch": 2.52, "grad_norm": 4.359353047815595e-13, "learning_rate": 0.0001613452532992763, "logits/chosen": -19.869604110717773, "logits/rejected": -19.987985610961914, "logps/chosen": -2868.083984375, "logps/rejected": -2912.85693359375, "loss": 10.2222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.98719787597656, "rewards/margins": 2.1224288940429688, "rewards/rejected": -178.10963439941406, "step": 43490 }, { "epoch": 2.52, "grad_norm": 0.6916388273239136, "learning_rate": 0.0001611517473586439, "logits/chosen": -17.282529830932617, "logits/rejected": -18.037540435791016, "logps/chosen": -3014.494384765625, "logps/rejected": -3212.90771484375, "loss": 7.6486, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -216.94509887695312, "rewards/margins": 0.040467072278261185, "rewards/rejected": -216.98556518554688, "step": 43500 }, { "epoch": 2.52, "grad_norm": 0.0746716633439064, "learning_rate": 0.00016095824141801156, "logits/chosen": -20.21917152404785, "logits/rejected": -19.236785888671875, "logps/chosen": -2812.18798828125, "logps/rejected": -3139.66455078125, "loss": 2.4553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.07168579101562, "rewards/margins": 8.7558012008667, "rewards/rejected": -171.82748413085938, "step": 43510 }, { "epoch": 2.52, "grad_norm": 0.00872254278510809, "learning_rate": 0.00016076473547737915, "logits/chosen": -16.263151168823242, "logits/rejected": -15.527493476867676, "logps/chosen": -3268.6171875, "logps/rejected": -2738.54638671875, "loss": 4.8281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -135.81105041503906, "rewards/margins": 21.070104598999023, "rewards/rejected": -156.8811492919922, "step": 43520 }, { "epoch": 2.52, "grad_norm": 1.118254931498086e-05, "learning_rate": 0.00016057122953674677, "logits/chosen": -14.662936210632324, "logits/rejected": -15.229037284851074, "logps/chosen": -2936.83447265625, "logps/rejected": -2403.918212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -110.85673522949219, "rewards/margins": 22.63263702392578, "rewards/rejected": -133.48936462402344, "step": 43530 }, { "epoch": 2.52, "grad_norm": 24.777631759643555, "learning_rate": 0.0001603777235961144, "logits/chosen": -14.500836372375488, "logits/rejected": -14.520637512207031, "logps/chosen": -3033.41015625, "logps/rejected": -2739.72509765625, "loss": 0.6195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -150.30667114257812, "rewards/margins": 10.742834091186523, "rewards/rejected": -161.0495147705078, "step": 43540 }, { "epoch": 2.52, "grad_norm": 7.951778883789895e-13, "learning_rate": 0.00016018421765548203, "logits/chosen": -16.891935348510742, "logits/rejected": -16.986352920532227, "logps/chosen": -2557.430419921875, "logps/rejected": -2683.05126953125, "loss": 0.3845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -186.00279235839844, "rewards/margins": 18.514812469482422, "rewards/rejected": -204.51759338378906, "step": 43550 }, { "epoch": 2.52, "grad_norm": 0.019435660913586617, "learning_rate": 0.00015999071171484967, "logits/chosen": -16.6214542388916, "logits/rejected": -17.85711097717285, "logps/chosen": -2779.10400390625, "logps/rejected": -3192.92236328125, "loss": 2.1632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -173.69000244140625, "rewards/margins": 8.811273574829102, "rewards/rejected": -182.50125122070312, "step": 43560 }, { "epoch": 2.52, "grad_norm": 0.03347652032971382, "learning_rate": 0.0001597972057742173, "logits/chosen": -15.364712715148926, "logits/rejected": -15.3374662399292, "logps/chosen": -2979.7021484375, "logps/rejected": -2954.168701171875, "loss": 5.324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.51832580566406, "rewards/margins": 5.399893283843994, "rewards/rejected": -159.91822814941406, "step": 43570 }, { "epoch": 2.52, "grad_norm": 94.8865966796875, "learning_rate": 0.00015960369983358488, "logits/chosen": -19.142818450927734, "logits/rejected": -21.56853675842285, "logps/chosen": -2862.79931640625, "logps/rejected": -3081.368408203125, "loss": 2.2312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -216.1649169921875, "rewards/margins": 26.600284576416016, "rewards/rejected": -242.7651824951172, "step": 43580 }, { "epoch": 2.52, "grad_norm": 76.16474914550781, "learning_rate": 0.00015941019389295252, "logits/chosen": -15.13586711883545, "logits/rejected": -15.085931777954102, "logps/chosen": -2953.869873046875, "logps/rejected": -2661.904052734375, "loss": 14.2541, "rewards/accuracies": 0.5, "rewards/chosen": -149.58023071289062, "rewards/margins": -6.68133544921875, "rewards/rejected": -142.89891052246094, "step": 43590 }, { "epoch": 2.52, "grad_norm": 9.291247367858887, "learning_rate": 0.00015921668795232014, "logits/chosen": -17.113842010498047, "logits/rejected": -17.47720718383789, "logps/chosen": -2886.282958984375, "logps/rejected": -2520.546875, "loss": 2.0521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.70468139648438, "rewards/margins": 6.487693786621094, "rewards/rejected": -217.19235229492188, "step": 43600 }, { "epoch": 2.52, "grad_norm": 5.723037611460313e-05, "learning_rate": 0.00015902318201168776, "logits/chosen": -16.844730377197266, "logits/rejected": -17.056194305419922, "logps/chosen": -2757.487548828125, "logps/rejected": -2852.21875, "loss": 1.0765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -208.6065673828125, "rewards/margins": 8.974321365356445, "rewards/rejected": -217.5808868408203, "step": 43610 }, { "epoch": 2.52, "grad_norm": 119.94293212890625, "learning_rate": 0.0001588296760710554, "logits/chosen": -14.842371940612793, "logits/rejected": -15.680437088012695, "logps/chosen": -2820.65380859375, "logps/rejected": -2700.369140625, "loss": 16.4828, "rewards/accuracies": 0.5, "rewards/chosen": -169.62130737304688, "rewards/margins": -9.70527458190918, "rewards/rejected": -159.91604614257812, "step": 43620 }, { "epoch": 2.53, "grad_norm": 0.47694405913352966, "learning_rate": 0.000158636170130423, "logits/chosen": -18.98089599609375, "logits/rejected": -20.700721740722656, "logps/chosen": -3095.287109375, "logps/rejected": -2872.72314453125, "loss": 4.8501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -198.31863403320312, "rewards/margins": 3.5955398082733154, "rewards/rejected": -201.9141845703125, "step": 43630 }, { "epoch": 2.53, "grad_norm": 72.35736846923828, "learning_rate": 0.00015844266418979064, "logits/chosen": -16.98579978942871, "logits/rejected": -21.300643920898438, "logps/chosen": -2999.052978515625, "logps/rejected": -2861.15576171875, "loss": 5.5401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -173.96481323242188, "rewards/margins": 9.539121627807617, "rewards/rejected": -183.50393676757812, "step": 43640 }, { "epoch": 2.53, "grad_norm": 1.1055528237671752e-12, "learning_rate": 0.00015824915824915825, "logits/chosen": -15.972883224487305, "logits/rejected": -17.135990142822266, "logps/chosen": -3031.96240234375, "logps/rejected": -3120.288818359375, "loss": 2.6683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.6458282470703, "rewards/margins": 16.577848434448242, "rewards/rejected": -183.22366333007812, "step": 43650 }, { "epoch": 2.53, "grad_norm": 131.3008270263672, "learning_rate": 0.00015805565230852587, "logits/chosen": -16.498489379882812, "logits/rejected": -17.669696807861328, "logps/chosen": -3207.714111328125, "logps/rejected": -3202.623291015625, "loss": 0.7919, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -139.50042724609375, "rewards/margins": 12.53210163116455, "rewards/rejected": -152.03253173828125, "step": 43660 }, { "epoch": 2.53, "grad_norm": 56.38093948364258, "learning_rate": 0.00015786214636789351, "logits/chosen": -17.80392074584961, "logits/rejected": -19.2365665435791, "logps/chosen": -2660.52978515625, "logps/rejected": -2686.469482421875, "loss": 0.8506, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -198.33847045898438, "rewards/margins": 12.626582145690918, "rewards/rejected": -210.9650421142578, "step": 43670 }, { "epoch": 2.53, "grad_norm": 0.01076948270201683, "learning_rate": 0.0001576686404272611, "logits/chosen": -18.309228897094727, "logits/rejected": -17.93661117553711, "logps/chosen": -2644.25439453125, "logps/rejected": -2613.51708984375, "loss": 5.8199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -233.7408905029297, "rewards/margins": 2.356524705886841, "rewards/rejected": -236.09744262695312, "step": 43680 }, { "epoch": 2.53, "grad_norm": 0.0004221149138174951, "learning_rate": 0.00015747513448662872, "logits/chosen": -18.328449249267578, "logits/rejected": -20.183984756469727, "logps/chosen": -2433.275146484375, "logps/rejected": -2591.890625, "loss": 0.7826, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -205.6558380126953, "rewards/margins": 12.77563762664795, "rewards/rejected": -218.43148803710938, "step": 43690 }, { "epoch": 2.53, "grad_norm": 19.098575592041016, "learning_rate": 0.00015728162854599637, "logits/chosen": -16.082202911376953, "logits/rejected": -16.634410858154297, "logps/chosen": -2453.38818359375, "logps/rejected": -2456.803955078125, "loss": 2.5261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.83370971679688, "rewards/margins": 2.8719680309295654, "rewards/rejected": -179.70565795898438, "step": 43700 }, { "epoch": 2.53, "grad_norm": 39.68260192871094, "learning_rate": 0.00015708812260536398, "logits/chosen": -18.646045684814453, "logits/rejected": -18.79660415649414, "logps/chosen": -2375.412353515625, "logps/rejected": -2574.60400390625, "loss": 1.2161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -193.74415588378906, "rewards/margins": 6.644598484039307, "rewards/rejected": -200.3887481689453, "step": 43710 }, { "epoch": 2.53, "grad_norm": 40.577518463134766, "learning_rate": 0.00015689461666473163, "logits/chosen": -17.5244197845459, "logits/rejected": -17.35927391052246, "logps/chosen": -2797.548095703125, "logps/rejected": -2589.259033203125, "loss": 22.2644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -158.37451171875, "rewards/margins": -18.467384338378906, "rewards/rejected": -139.90713500976562, "step": 43720 }, { "epoch": 2.53, "grad_norm": 278.1980895996094, "learning_rate": 0.00015670111072409924, "logits/chosen": -16.69061279296875, "logits/rejected": -19.601848602294922, "logps/chosen": -3063.60498046875, "logps/rejected": -2661.300048828125, "loss": 24.0746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.5783233642578, "rewards/margins": -10.723481178283691, "rewards/rejected": -183.85482788085938, "step": 43730 }, { "epoch": 2.53, "grad_norm": 165.57467651367188, "learning_rate": 0.00015650760478346683, "logits/chosen": -18.766035079956055, "logits/rejected": -19.431676864624023, "logps/chosen": -2615.01708984375, "logps/rejected": -2647.61181640625, "loss": 3.7039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -195.51048278808594, "rewards/margins": 3.237893581390381, "rewards/rejected": -198.7483673095703, "step": 43740 }, { "epoch": 2.53, "grad_norm": 0.08668682724237442, "learning_rate": 0.00015631409884283448, "logits/chosen": -14.367372512817383, "logits/rejected": -14.547538757324219, "logps/chosen": -2926.0537109375, "logps/rejected": -2942.314697265625, "loss": 3.9833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.28085327148438, "rewards/margins": 6.809147834777832, "rewards/rejected": -180.08999633789062, "step": 43750 }, { "epoch": 2.53, "grad_norm": 18.829456329345703, "learning_rate": 0.0001561205929022021, "logits/chosen": -14.362350463867188, "logits/rejected": -14.35425090789795, "logps/chosen": -2954.509765625, "logps/rejected": -2934.73291015625, "loss": 2.6914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -139.76866149902344, "rewards/margins": 4.740586280822754, "rewards/rejected": -144.50926208496094, "step": 43760 }, { "epoch": 2.53, "grad_norm": 203.16380310058594, "learning_rate": 0.00015592708696156974, "logits/chosen": -15.76483154296875, "logits/rejected": -15.858926773071289, "logps/chosen": -2965.154541015625, "logps/rejected": -2957.1689453125, "loss": 2.9601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.17706298828125, "rewards/margins": 9.13406753540039, "rewards/rejected": -211.31112670898438, "step": 43770 }, { "epoch": 2.53, "grad_norm": 7.836431980133057, "learning_rate": 0.00015573358102093736, "logits/chosen": -17.8089542388916, "logits/rejected": -17.853683471679688, "logps/chosen": -2884.9931640625, "logps/rejected": -2840.56396484375, "loss": 2.0424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -215.64053344726562, "rewards/margins": 6.386122226715088, "rewards/rejected": -222.0266876220703, "step": 43780 }, { "epoch": 2.53, "grad_norm": 93.89768981933594, "learning_rate": 0.00015554007508030495, "logits/chosen": -15.622055053710938, "logits/rejected": -15.781881332397461, "logps/chosen": -2788.9482421875, "logps/rejected": -3022.69580078125, "loss": 3.8841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -154.5865936279297, "rewards/margins": 5.0958075523376465, "rewards/rejected": -159.68240356445312, "step": 43790 }, { "epoch": 2.54, "grad_norm": 478.9013366699219, "learning_rate": 0.0001553465691396726, "logits/chosen": -17.14430809020996, "logits/rejected": -17.40169906616211, "logps/chosen": -2640.278076171875, "logps/rejected": -2772.22900390625, "loss": 10.731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -213.25668334960938, "rewards/margins": -5.132508277893066, "rewards/rejected": -208.12417602539062, "step": 43800 }, { "epoch": 2.54, "grad_norm": 4.60801707902192e-08, "learning_rate": 0.0001551530631990402, "logits/chosen": -15.703536987304688, "logits/rejected": -15.631095886230469, "logps/chosen": -3278.08154296875, "logps/rejected": -3276.228759765625, "loss": 1.1596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -143.77499389648438, "rewards/margins": 12.792940139770508, "rewards/rejected": -156.56793212890625, "step": 43810 }, { "epoch": 2.54, "grad_norm": 56.453548431396484, "learning_rate": 0.00015495955725840783, "logits/chosen": -17.459197998046875, "logits/rejected": -18.401025772094727, "logps/chosen": -2891.4560546875, "logps/rejected": -2786.48583984375, "loss": 2.2225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.97608947753906, "rewards/margins": 6.535317897796631, "rewards/rejected": -196.5113983154297, "step": 43820 }, { "epoch": 2.54, "grad_norm": 65.81172180175781, "learning_rate": 0.00015476605131777547, "logits/chosen": -13.857386589050293, "logits/rejected": -14.329582214355469, "logps/chosen": -2989.510498046875, "logps/rejected": -2443.831787109375, "loss": 2.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -137.90359497070312, "rewards/margins": 8.785574913024902, "rewards/rejected": -146.68917846679688, "step": 43830 }, { "epoch": 2.54, "grad_norm": 51.86097717285156, "learning_rate": 0.0001545725453771431, "logits/chosen": -19.7508602142334, "logits/rejected": -20.831043243408203, "logps/chosen": -3003.874755859375, "logps/rejected": -3017.97021484375, "loss": 1.999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.3672332763672, "rewards/margins": 7.990459442138672, "rewards/rejected": -206.3577117919922, "step": 43840 }, { "epoch": 2.54, "grad_norm": 77.58016204833984, "learning_rate": 0.0001543790394365107, "logits/chosen": -12.634710311889648, "logits/rejected": -12.397489547729492, "logps/chosen": -3298.126220703125, "logps/rejected": -3225.514404296875, "loss": 1.498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -95.83494567871094, "rewards/margins": 10.073509216308594, "rewards/rejected": -105.90846252441406, "step": 43850 }, { "epoch": 2.54, "grad_norm": 0.0, "learning_rate": 0.00015418553349587832, "logits/chosen": -14.699131965637207, "logits/rejected": -14.06678295135498, "logps/chosen": -3053.716064453125, "logps/rejected": -2950.570556640625, "loss": 6.7181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.33779907226562, "rewards/margins": 4.499754905700684, "rewards/rejected": -159.83755493164062, "step": 43860 }, { "epoch": 2.54, "grad_norm": 0.0, "learning_rate": 0.00015399202755524594, "logits/chosen": -13.785038948059082, "logits/rejected": -13.824389457702637, "logps/chosen": -3051.404296875, "logps/rejected": -2964.048828125, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -132.08905029296875, "rewards/margins": 31.132671356201172, "rewards/rejected": -163.22171020507812, "step": 43870 }, { "epoch": 2.54, "grad_norm": 4.768486542161554e-05, "learning_rate": 0.00015379852161461358, "logits/chosen": -17.27236557006836, "logits/rejected": -16.95067024230957, "logps/chosen": -2381.30517578125, "logps/rejected": -2365.65625, "loss": 5.7899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.4449920654297, "rewards/margins": -0.6014953851699829, "rewards/rejected": -208.843505859375, "step": 43880 }, { "epoch": 2.54, "grad_norm": 61.62126541137695, "learning_rate": 0.0001536050156739812, "logits/chosen": -16.65121078491211, "logits/rejected": -18.89198875427246, "logps/chosen": -3178.27734375, "logps/rejected": -3035.77587890625, "loss": 3.9332, "rewards/accuracies": 0.5, "rewards/chosen": -163.36038208007812, "rewards/margins": 7.386086463928223, "rewards/rejected": -170.74647521972656, "step": 43890 }, { "epoch": 2.54, "grad_norm": 70.2710189819336, "learning_rate": 0.00015341150973334882, "logits/chosen": -15.9527587890625, "logits/rejected": -16.819278717041016, "logps/chosen": -2922.79052734375, "logps/rejected": -2841.742431640625, "loss": 2.4576, "rewards/accuracies": 0.5, "rewards/chosen": -213.6322479248047, "rewards/margins": 5.339605331420898, "rewards/rejected": -218.9718780517578, "step": 43900 }, { "epoch": 2.54, "grad_norm": 127.67740631103516, "learning_rate": 0.00015321800379271643, "logits/chosen": -18.095603942871094, "logits/rejected": -18.22565269470215, "logps/chosen": -2991.1875, "logps/rejected": -2979.010986328125, "loss": 1.8523, "rewards/accuracies": 0.5, "rewards/chosen": -233.89395141601562, "rewards/margins": 4.5377326011657715, "rewards/rejected": -238.4316864013672, "step": 43910 }, { "epoch": 2.54, "grad_norm": 0.0012640219647437334, "learning_rate": 0.00015302449785208405, "logits/chosen": -17.777652740478516, "logits/rejected": -18.201234817504883, "logps/chosen": -2491.17041015625, "logps/rejected": -2648.211181640625, "loss": 4.8633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.3374786376953, "rewards/margins": 2.8787741661071777, "rewards/rejected": -208.2162322998047, "step": 43920 }, { "epoch": 2.54, "grad_norm": 3.896173383921564e-10, "learning_rate": 0.0001528309919114517, "logits/chosen": -19.89682388305664, "logits/rejected": -20.330081939697266, "logps/chosen": -2841.8466796875, "logps/rejected": -2874.459716796875, "loss": 0.2394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -255.97146606445312, "rewards/margins": 8.737600326538086, "rewards/rejected": -264.7090759277344, "step": 43930 }, { "epoch": 2.54, "grad_norm": 1.3118610706036149e-16, "learning_rate": 0.00015263748597081931, "logits/chosen": -17.609506607055664, "logits/rejected": -17.8779296875, "logps/chosen": -2512.4453125, "logps/rejected": -2598.695068359375, "loss": 1.7802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -212.94192504882812, "rewards/margins": 14.562894821166992, "rewards/rejected": -227.5048065185547, "step": 43940 }, { "epoch": 2.54, "grad_norm": 56.629329681396484, "learning_rate": 0.00015244398003018693, "logits/chosen": -17.452638626098633, "logits/rejected": -19.366291046142578, "logps/chosen": -3104.27001953125, "logps/rejected": -3085.24267578125, "loss": 3.4974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -202.01834106445312, "rewards/margins": 6.081540107727051, "rewards/rejected": -208.09988403320312, "step": 43950 }, { "epoch": 2.54, "grad_norm": 85.22631072998047, "learning_rate": 0.00015225047408955455, "logits/chosen": -19.53298568725586, "logits/rejected": -19.84769058227539, "logps/chosen": -2891.05419921875, "logps/rejected": -2887.89208984375, "loss": 4.4091, "rewards/accuracies": 0.5, "rewards/chosen": -252.53866577148438, "rewards/margins": 0.4906783998012543, "rewards/rejected": -253.0293426513672, "step": 43960 }, { "epoch": 2.55, "grad_norm": 34.25877380371094, "learning_rate": 0.00015205696814892217, "logits/chosen": -16.64484214782715, "logits/rejected": -17.998926162719727, "logps/chosen": -2451.204345703125, "logps/rejected": -2847.237548828125, "loss": 3.7595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -146.7447967529297, "rewards/margins": 20.16001319885254, "rewards/rejected": -166.90481567382812, "step": 43970 }, { "epoch": 2.55, "grad_norm": 1.6264788200714975e-06, "learning_rate": 0.0001518634622082898, "logits/chosen": -16.9411678314209, "logits/rejected": -18.018625259399414, "logps/chosen": -3097.263671875, "logps/rejected": -2770.428466796875, "loss": 7.8917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -172.16171264648438, "rewards/margins": 13.914340019226074, "rewards/rejected": -186.0760498046875, "step": 43980 }, { "epoch": 2.55, "grad_norm": 2.3509583473205566, "learning_rate": 0.00015166995626765743, "logits/chosen": -16.765361785888672, "logits/rejected": -16.49669075012207, "logps/chosen": -2982.048583984375, "logps/rejected": -3014.72119140625, "loss": 12.1562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.92138671875, "rewards/margins": -8.699060440063477, "rewards/rejected": -183.22232055664062, "step": 43990 }, { "epoch": 2.55, "grad_norm": 0.2728678286075592, "learning_rate": 0.00015147645032702504, "logits/chosen": -17.14298439025879, "logits/rejected": -17.345947265625, "logps/chosen": -2874.75146484375, "logps/rejected": -2790.365234375, "loss": 1.7515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.45968627929688, "rewards/margins": 4.104211330413818, "rewards/rejected": -193.5638885498047, "step": 44000 }, { "epoch": 2.55, "grad_norm": 64.45189666748047, "learning_rate": 0.00015128294438639266, "logits/chosen": -16.34703254699707, "logits/rejected": -18.52835464477539, "logps/chosen": -3339.76708984375, "logps/rejected": -2905.474609375, "loss": 4.2318, "rewards/accuracies": 0.5, "rewards/chosen": -178.35482788085938, "rewards/margins": 7.823282718658447, "rewards/rejected": -186.1781005859375, "step": 44010 }, { "epoch": 2.55, "grad_norm": 89.8117446899414, "learning_rate": 0.00015108943844576028, "logits/chosen": -18.037681579589844, "logits/rejected": -18.568523406982422, "logps/chosen": -3181.098388671875, "logps/rejected": -3206.888916015625, "loss": 4.6835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -207.05154418945312, "rewards/margins": 1.2041622400283813, "rewards/rejected": -208.2556915283203, "step": 44020 }, { "epoch": 2.55, "grad_norm": 5.023074150085449, "learning_rate": 0.0001508959325051279, "logits/chosen": -20.09713363647461, "logits/rejected": -20.519102096557617, "logps/chosen": -2948.346435546875, "logps/rejected": -2731.495361328125, "loss": 2.8529, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -242.23910522460938, "rewards/margins": -0.1955406218767166, "rewards/rejected": -242.04354858398438, "step": 44030 }, { "epoch": 2.55, "grad_norm": 78.3518295288086, "learning_rate": 0.00015070242656449554, "logits/chosen": -17.7911376953125, "logits/rejected": -17.95328712463379, "logps/chosen": -2742.84912109375, "logps/rejected": -2341.52783203125, "loss": 7.0594, "rewards/accuracies": 0.5, "rewards/chosen": -185.81402587890625, "rewards/margins": 2.8420746326446533, "rewards/rejected": -188.65609741210938, "step": 44040 }, { "epoch": 2.55, "grad_norm": 0.0004260841815266758, "learning_rate": 0.00015050892062386316, "logits/chosen": -15.883615493774414, "logits/rejected": -16.23422622680664, "logps/chosen": -2727.2626953125, "logps/rejected": -3085.4599609375, "loss": 5.1314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.1343231201172, "rewards/margins": -0.04688377305865288, "rewards/rejected": -205.0874481201172, "step": 44050 }, { "epoch": 2.55, "grad_norm": 0.0021909899078309536, "learning_rate": 0.0001503154146832308, "logits/chosen": -18.69464111328125, "logits/rejected": -19.746183395385742, "logps/chosen": -2797.610107421875, "logps/rejected": -2796.4677734375, "loss": 1.9274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.8298797607422, "rewards/margins": 9.700949668884277, "rewards/rejected": -191.5308380126953, "step": 44060 }, { "epoch": 2.55, "grad_norm": 0.33424457907676697, "learning_rate": 0.0001501219087425984, "logits/chosen": -20.6337890625, "logits/rejected": -22.100168228149414, "logps/chosen": -2914.4345703125, "logps/rejected": -2918.99267578125, "loss": 2.9515, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -229.8031463623047, "rewards/margins": 0.33117160201072693, "rewards/rejected": -230.1343231201172, "step": 44070 }, { "epoch": 2.55, "grad_norm": 0.2412145584821701, "learning_rate": 0.000149928402801966, "logits/chosen": -16.6557674407959, "logits/rejected": -18.807077407836914, "logps/chosen": -3048.294189453125, "logps/rejected": -3021.30322265625, "loss": 2.2064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -193.65487670898438, "rewards/margins": 8.730911254882812, "rewards/rejected": -202.38577270507812, "step": 44080 }, { "epoch": 2.55, "grad_norm": 1.385895848274231, "learning_rate": 0.00014973489686133365, "logits/chosen": -17.138874053955078, "logits/rejected": -19.153465270996094, "logps/chosen": -2835.9755859375, "logps/rejected": -2742.333984375, "loss": 1.2168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.19119262695312, "rewards/margins": 8.287053108215332, "rewards/rejected": -187.47824096679688, "step": 44090 }, { "epoch": 2.55, "grad_norm": 382.2491149902344, "learning_rate": 0.00014954139092070127, "logits/chosen": -15.518458366394043, "logits/rejected": -16.087745666503906, "logps/chosen": -3232.29150390625, "logps/rejected": -3197.48779296875, "loss": 6.055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -175.05731201171875, "rewards/margins": 5.69661808013916, "rewards/rejected": -180.75393676757812, "step": 44100 }, { "epoch": 2.55, "grad_norm": 2.339707612991333, "learning_rate": 0.00014934788498006891, "logits/chosen": -15.265698432922363, "logits/rejected": -15.089350700378418, "logps/chosen": -2958.5634765625, "logps/rejected": -3075.93505859375, "loss": 1.0109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.79544067382812, "rewards/margins": 6.820077419281006, "rewards/rejected": -192.6155242919922, "step": 44110 }, { "epoch": 2.55, "grad_norm": 71.33674621582031, "learning_rate": 0.0001491543790394365, "logits/chosen": -16.676807403564453, "logits/rejected": -16.904359817504883, "logps/chosen": -2905.33544921875, "logps/rejected": -2935.473876953125, "loss": 2.0997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -187.2427215576172, "rewards/margins": 6.6747727394104, "rewards/rejected": -193.91746520996094, "step": 44120 }, { "epoch": 2.55, "grad_norm": 3.8974334870545135e-07, "learning_rate": 0.00014896087309880412, "logits/chosen": -15.953852653503418, "logits/rejected": -16.589542388916016, "logps/chosen": -3030.159423828125, "logps/rejected": -3095.41943359375, "loss": 0.6266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -253.461669921875, "rewards/margins": 14.128252983093262, "rewards/rejected": -267.58990478515625, "step": 44130 }, { "epoch": 2.55, "grad_norm": 7.227275091281626e-06, "learning_rate": 0.00014876736715817177, "logits/chosen": -15.413820266723633, "logits/rejected": -16.192453384399414, "logps/chosen": -2564.194091796875, "logps/rejected": -2757.489013671875, "loss": 0.9849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -158.120849609375, "rewards/margins": 6.084254264831543, "rewards/rejected": -164.20510864257812, "step": 44140 }, { "epoch": 2.56, "grad_norm": 2.3569633657988254e-12, "learning_rate": 0.00014857386121753938, "logits/chosen": -17.856788635253906, "logits/rejected": -17.83946418762207, "logps/chosen": -2463.2021484375, "logps/rejected": -2491.6474609375, "loss": 7.4579, "rewards/accuracies": 0.5, "rewards/chosen": -210.4133758544922, "rewards/margins": 9.099299430847168, "rewards/rejected": -219.5126953125, "step": 44150 }, { "epoch": 2.56, "grad_norm": 8.270913124084473, "learning_rate": 0.000148380355276907, "logits/chosen": -16.292098999023438, "logits/rejected": -16.274660110473633, "logps/chosen": -2929.673583984375, "logps/rejected": -2720.87353515625, "loss": 2.2844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.05625915527344, "rewards/margins": 11.018943786621094, "rewards/rejected": -201.0751953125, "step": 44160 }, { "epoch": 2.56, "grad_norm": 297.9817199707031, "learning_rate": 0.00014818684933627464, "logits/chosen": -16.681182861328125, "logits/rejected": -17.070175170898438, "logps/chosen": -3347.93408203125, "logps/rejected": -3022.293212890625, "loss": 7.8132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -156.88551330566406, "rewards/margins": -0.12491035461425781, "rewards/rejected": -156.76058959960938, "step": 44170 }, { "epoch": 2.56, "grad_norm": 1.8360226672342606e-09, "learning_rate": 0.00014799334339564223, "logits/chosen": -16.566932678222656, "logits/rejected": -16.756227493286133, "logps/chosen": -3168.889892578125, "logps/rejected": -3041.776611328125, "loss": 0.9918, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -205.2144775390625, "rewards/margins": 9.584176063537598, "rewards/rejected": -214.79867553710938, "step": 44180 }, { "epoch": 2.56, "grad_norm": 214.01315307617188, "learning_rate": 0.00014779983745500988, "logits/chosen": -18.439937591552734, "logits/rejected": -20.124691009521484, "logps/chosen": -2910.110107421875, "logps/rejected": -2469.742919921875, "loss": 31.4855, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -189.56680297851562, "rewards/margins": -26.989049911499023, "rewards/rejected": -162.5777587890625, "step": 44190 }, { "epoch": 2.56, "grad_norm": 0.01193490531295538, "learning_rate": 0.0001476063315143775, "logits/chosen": -19.949966430664062, "logits/rejected": -19.519573211669922, "logps/chosen": -3121.72998046875, "logps/rejected": -3049.982421875, "loss": 10.3988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.35751342773438, "rewards/margins": -5.115218639373779, "rewards/rejected": -184.24227905273438, "step": 44200 }, { "epoch": 2.56, "grad_norm": 1.172253353415309e-11, "learning_rate": 0.0001474128255737451, "logits/chosen": -16.174453735351562, "logits/rejected": -17.564176559448242, "logps/chosen": -3456.37158203125, "logps/rejected": -2949.940185546875, "loss": 2.0648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -192.24951171875, "rewards/margins": 9.982038497924805, "rewards/rejected": -202.2315673828125, "step": 44210 }, { "epoch": 2.56, "grad_norm": 3.149786789435893e-05, "learning_rate": 0.00014721931963311276, "logits/chosen": -16.188703536987305, "logits/rejected": -17.95463752746582, "logps/chosen": -2671.34423828125, "logps/rejected": -2549.357421875, "loss": 8.8207, "rewards/accuracies": 0.5, "rewards/chosen": -170.30270385742188, "rewards/margins": 11.487865447998047, "rewards/rejected": -181.79055786132812, "step": 44220 }, { "epoch": 2.56, "grad_norm": 0.0017402090597897768, "learning_rate": 0.00014702581369248035, "logits/chosen": -18.57582664489746, "logits/rejected": -20.485685348510742, "logps/chosen": -3139.47119140625, "logps/rejected": -2664.422607421875, "loss": 1.4232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.1541290283203, "rewards/margins": 23.2928409576416, "rewards/rejected": -179.4469757080078, "step": 44230 }, { "epoch": 2.56, "grad_norm": 152.2362060546875, "learning_rate": 0.00014683230775184796, "logits/chosen": -18.166940689086914, "logits/rejected": -18.64493179321289, "logps/chosen": -2889.764892578125, "logps/rejected": -2843.43310546875, "loss": 5.7793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -216.5116729736328, "rewards/margins": 2.914607524871826, "rewards/rejected": -219.42626953125, "step": 44240 }, { "epoch": 2.56, "grad_norm": 27.731037139892578, "learning_rate": 0.0001466388018112156, "logits/chosen": -16.13125228881836, "logits/rejected": -16.193017959594727, "logps/chosen": -2843.37646484375, "logps/rejected": -2829.390625, "loss": 0.8215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -129.56210327148438, "rewards/margins": 9.365320205688477, "rewards/rejected": -138.9274139404297, "step": 44250 }, { "epoch": 2.56, "grad_norm": 8.013555309033097e-16, "learning_rate": 0.00014644529587058323, "logits/chosen": -16.413944244384766, "logits/rejected": -17.366947174072266, "logps/chosen": -3123.86083984375, "logps/rejected": -2663.884033203125, "loss": 1.4028, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -166.59130859375, "rewards/margins": 12.05767822265625, "rewards/rejected": -178.64898681640625, "step": 44260 }, { "epoch": 2.56, "grad_norm": 0.005614951252937317, "learning_rate": 0.00014625178992995087, "logits/chosen": -15.454983711242676, "logits/rejected": -16.437549591064453, "logps/chosen": -3331.85498046875, "logps/rejected": -3311.846923828125, "loss": 4.5025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -86.42704010009766, "rewards/margins": 9.032512664794922, "rewards/rejected": -95.45954895019531, "step": 44270 }, { "epoch": 2.56, "grad_norm": 0.009298568591475487, "learning_rate": 0.0001460582839893185, "logits/chosen": -16.367122650146484, "logits/rejected": -17.597667694091797, "logps/chosen": -2897.24365234375, "logps/rejected": -2939.30810546875, "loss": 0.6432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -223.40185546875, "rewards/margins": 10.645547866821289, "rewards/rejected": -234.0474090576172, "step": 44280 }, { "epoch": 2.56, "grad_norm": 5.696553707122803, "learning_rate": 0.00014586477804868608, "logits/chosen": -17.99252700805664, "logits/rejected": -18.599323272705078, "logps/chosen": -2496.734619140625, "logps/rejected": -2363.601318359375, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": -178.4376678466797, "rewards/margins": 8.16087532043457, "rewards/rejected": -186.59854125976562, "step": 44290 }, { "epoch": 2.56, "grad_norm": 0.1930457055568695, "learning_rate": 0.00014567127210805372, "logits/chosen": -18.712848663330078, "logits/rejected": -18.46395492553711, "logps/chosen": -2811.54345703125, "logps/rejected": -2961.37841796875, "loss": 11.9522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -166.3733673095703, "rewards/margins": 0.550952136516571, "rewards/rejected": -166.92431640625, "step": 44300 }, { "epoch": 2.56, "grad_norm": 61.75602340698242, "learning_rate": 0.00014547776616742134, "logits/chosen": -17.724712371826172, "logits/rejected": -17.56941032409668, "logps/chosen": -3267.433349609375, "logps/rejected": -3280.276611328125, "loss": 2.6548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -178.25540161132812, "rewards/margins": 6.139512538909912, "rewards/rejected": -184.39491271972656, "step": 44310 }, { "epoch": 2.57, "grad_norm": 5.293955920339377e-23, "learning_rate": 0.00014528426022678898, "logits/chosen": -16.31802749633789, "logits/rejected": -18.680208206176758, "logps/chosen": -3429.51708984375, "logps/rejected": -2967.60546875, "loss": 1.6392, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -182.7320556640625, "rewards/margins": 12.142975807189941, "rewards/rejected": -194.87503051757812, "step": 44320 }, { "epoch": 2.57, "grad_norm": 0.009612591937184334, "learning_rate": 0.0001450907542861566, "logits/chosen": -17.205005645751953, "logits/rejected": -18.28622055053711, "logps/chosen": -3114.990966796875, "logps/rejected": -3119.6787109375, "loss": 6.7796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -173.94410705566406, "rewards/margins": 5.957309722900391, "rewards/rejected": -179.9014129638672, "step": 44330 }, { "epoch": 2.57, "grad_norm": 0.015318972058594227, "learning_rate": 0.0001448972483455242, "logits/chosen": -16.496835708618164, "logits/rejected": -17.930950164794922, "logps/chosen": -2857.753662109375, "logps/rejected": -2644.68017578125, "loss": 3.7714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.0572052001953, "rewards/margins": -0.5246533751487732, "rewards/rejected": -167.53253173828125, "step": 44340 }, { "epoch": 2.57, "grad_norm": 0.0, "learning_rate": 0.00014470374240489184, "logits/chosen": -16.37795066833496, "logits/rejected": -17.579029083251953, "logps/chosen": -2965.03564453125, "logps/rejected": -2571.6201171875, "loss": 4.1257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -106.04052734375, "rewards/margins": 31.389251708984375, "rewards/rejected": -137.42977905273438, "step": 44350 }, { "epoch": 2.57, "grad_norm": 43.46563720703125, "learning_rate": 0.00014451023646425945, "logits/chosen": -16.219465255737305, "logits/rejected": -16.484851837158203, "logps/chosen": -3151.454345703125, "logps/rejected": -2885.77587890625, "loss": 1.7888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.9842529296875, "rewards/margins": 6.1068291664123535, "rewards/rejected": -171.09109497070312, "step": 44360 }, { "epoch": 2.57, "grad_norm": 1.4464616775512695, "learning_rate": 0.00014431673052362707, "logits/chosen": -19.00873374938965, "logits/rejected": -20.048809051513672, "logps/chosen": -2978.792236328125, "logps/rejected": -2502.52880859375, "loss": 34.0963, "rewards/accuracies": 0.5, "rewards/chosen": -226.8808135986328, "rewards/margins": -27.556249618530273, "rewards/rejected": -199.32455444335938, "step": 44370 }, { "epoch": 2.57, "grad_norm": 32.40945053100586, "learning_rate": 0.00014412322458299471, "logits/chosen": -16.432659149169922, "logits/rejected": -19.059720993041992, "logps/chosen": -2991.299560546875, "logps/rejected": -2830.655517578125, "loss": 5.1404, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -142.3897247314453, "rewards/margins": 2.0762600898742676, "rewards/rejected": -144.46597290039062, "step": 44380 }, { "epoch": 2.57, "grad_norm": 1.8221551179885864, "learning_rate": 0.0001439297186423623, "logits/chosen": -20.861305236816406, "logits/rejected": -20.747644424438477, "logps/chosen": -3074.70068359375, "logps/rejected": -3079.24755859375, "loss": 2.1168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.55136108398438, "rewards/margins": 5.882866859436035, "rewards/rejected": -210.4342041015625, "step": 44390 }, { "epoch": 2.57, "grad_norm": 41.35147476196289, "learning_rate": 0.00014373621270172995, "logits/chosen": -17.90115737915039, "logits/rejected": -18.973478317260742, "logps/chosen": -2737.08251953125, "logps/rejected": -2666.77783203125, "loss": 3.3257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -169.35157775878906, "rewards/margins": 14.951687812805176, "rewards/rejected": -184.3032684326172, "step": 44400 }, { "epoch": 2.57, "grad_norm": 1.9413446352700703e-05, "learning_rate": 0.00014354270676109757, "logits/chosen": -15.147122383117676, "logits/rejected": -16.192358016967773, "logps/chosen": -2973.87841796875, "logps/rejected": -2401.564697265625, "loss": 0.4396, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -121.77287292480469, "rewards/margins": 16.378286361694336, "rewards/rejected": -138.15115356445312, "step": 44410 }, { "epoch": 2.57, "grad_norm": 2.8280270099639893, "learning_rate": 0.00014334920082046518, "logits/chosen": -13.752652168273926, "logits/rejected": -13.480015754699707, "logps/chosen": -3151.7919921875, "logps/rejected": -3004.2841796875, "loss": 1.1882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.37507629394531, "rewards/margins": 10.993154525756836, "rewards/rejected": -115.36822509765625, "step": 44420 }, { "epoch": 2.57, "grad_norm": 33.25735092163086, "learning_rate": 0.00014315569487983283, "logits/chosen": -15.647809028625488, "logits/rejected": -15.994059562683105, "logps/chosen": -2732.364501953125, "logps/rejected": -2780.823974609375, "loss": 0.7405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -113.64241790771484, "rewards/margins": 7.803288459777832, "rewards/rejected": -121.4457015991211, "step": 44430 }, { "epoch": 2.57, "grad_norm": 2.3925245841383003e-05, "learning_rate": 0.00014296218893920044, "logits/chosen": -15.160150527954102, "logits/rejected": -15.337547302246094, "logps/chosen": -2724.186767578125, "logps/rejected": -2767.341064453125, "loss": 5.0046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -107.5665054321289, "rewards/margins": 7.669136047363281, "rewards/rejected": -115.23563385009766, "step": 44440 }, { "epoch": 2.57, "grad_norm": 24.03046226501465, "learning_rate": 0.00014276868299856806, "logits/chosen": -17.27022361755371, "logits/rejected": -17.288692474365234, "logps/chosen": -2945.45068359375, "logps/rejected": -2936.163330078125, "loss": 1.7954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -190.64292907714844, "rewards/margins": 5.084983825683594, "rewards/rejected": -195.7279052734375, "step": 44450 }, { "epoch": 2.57, "grad_norm": 0.6297392845153809, "learning_rate": 0.00014257517705793568, "logits/chosen": -13.497884750366211, "logits/rejected": -13.49903392791748, "logps/chosen": -3148.090576171875, "logps/rejected": -2488.161376953125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -100.22916412353516, "rewards/margins": 22.7512264251709, "rewards/rejected": -122.98039245605469, "step": 44460 }, { "epoch": 2.57, "grad_norm": 7.126929255085734e-12, "learning_rate": 0.0001423816711173033, "logits/chosen": -16.716541290283203, "logits/rejected": -15.225934028625488, "logps/chosen": -3155.42626953125, "logps/rejected": -3095.900390625, "loss": 2.6841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.5135040283203, "rewards/margins": 7.453403472900391, "rewards/rejected": -158.96690368652344, "step": 44470 }, { "epoch": 2.57, "grad_norm": 77.83734893798828, "learning_rate": 0.00014218816517667094, "logits/chosen": -19.425212860107422, "logits/rejected": -20.077836990356445, "logps/chosen": -2611.173095703125, "logps/rejected": -2789.70654296875, "loss": 8.9099, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -218.2777099609375, "rewards/margins": 1.701359510421753, "rewards/rejected": -219.9790496826172, "step": 44480 }, { "epoch": 2.58, "grad_norm": 9.324923515319824, "learning_rate": 0.00014199465923603856, "logits/chosen": -17.379806518554688, "logits/rejected": -16.75356101989746, "logps/chosen": -3008.241455078125, "logps/rejected": -2838.7841796875, "loss": 1.4188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -195.32736206054688, "rewards/margins": 10.779492378234863, "rewards/rejected": -206.1068878173828, "step": 44490 }, { "epoch": 2.58, "grad_norm": 353.51190185546875, "learning_rate": 0.00014180115329540615, "logits/chosen": -16.777652740478516, "logits/rejected": -16.94232940673828, "logps/chosen": -2823.62646484375, "logps/rejected": -3100.14013671875, "loss": 5.9113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.45355224609375, "rewards/margins": 10.347627639770508, "rewards/rejected": -207.80117797851562, "step": 44500 }, { "epoch": 2.58, "grad_norm": 38.863704681396484, "learning_rate": 0.0001416076473547738, "logits/chosen": -18.480722427368164, "logits/rejected": -19.535728454589844, "logps/chosen": -2635.168701171875, "logps/rejected": -2380.17041015625, "loss": 20.4699, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -211.55209350585938, "rewards/margins": -13.31805419921875, "rewards/rejected": -198.23403930664062, "step": 44510 }, { "epoch": 2.58, "grad_norm": 2.6921875644590754e-13, "learning_rate": 0.0001414141414141414, "logits/chosen": -15.065889358520508, "logits/rejected": -15.80485725402832, "logps/chosen": -3233.601318359375, "logps/rejected": -3553.672607421875, "loss": 4.0223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -96.04075622558594, "rewards/margins": 10.597684860229492, "rewards/rejected": -106.63844299316406, "step": 44520 }, { "epoch": 2.58, "grad_norm": 0.0035096369683742523, "learning_rate": 0.00014122063547350905, "logits/chosen": -17.971603393554688, "logits/rejected": -19.632568359375, "logps/chosen": -2785.2607421875, "logps/rejected": -2995.0966796875, "loss": 7.1676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.4827117919922, "rewards/margins": -2.0242886543273926, "rewards/rejected": -195.45846557617188, "step": 44530 }, { "epoch": 2.58, "grad_norm": 2.6869051456451416, "learning_rate": 0.00014102712953287667, "logits/chosen": -16.234371185302734, "logits/rejected": -16.242107391357422, "logps/chosen": -2793.772216796875, "logps/rejected": -2370.802978515625, "loss": 0.4051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.59381103515625, "rewards/margins": 14.039573669433594, "rewards/rejected": -181.63339233398438, "step": 44540 }, { "epoch": 2.58, "grad_norm": 49.4187126159668, "learning_rate": 0.0001408336235922443, "logits/chosen": -17.75936508178711, "logits/rejected": -19.10025405883789, "logps/chosen": -2927.894287109375, "logps/rejected": -3009.890625, "loss": 0.6533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -216.98092651367188, "rewards/margins": 22.851125717163086, "rewards/rejected": -239.8320770263672, "step": 44550 }, { "epoch": 2.58, "grad_norm": 0.1547410935163498, "learning_rate": 0.0001406401176516119, "logits/chosen": -16.12039566040039, "logits/rejected": -17.92940902709961, "logps/chosen": -3138.969482421875, "logps/rejected": -2774.428466796875, "loss": 16.8643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.5470428466797, "rewards/margins": -1.965932846069336, "rewards/rejected": -172.5811004638672, "step": 44560 }, { "epoch": 2.58, "grad_norm": 0.002723113400861621, "learning_rate": 0.00014044661171097952, "logits/chosen": -17.601764678955078, "logits/rejected": -18.224971771240234, "logps/chosen": -2718.656982421875, "logps/rejected": -2785.41650390625, "loss": 0.285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -198.59762573242188, "rewards/margins": 14.809057235717773, "rewards/rejected": -213.4066925048828, "step": 44570 }, { "epoch": 2.58, "grad_norm": 23.571186065673828, "learning_rate": 0.00014025310577034714, "logits/chosen": -14.750927925109863, "logits/rejected": -14.7919282913208, "logps/chosen": -3028.288330078125, "logps/rejected": -2988.019775390625, "loss": 1.5131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -113.7218017578125, "rewards/margins": 3.326533555984497, "rewards/rejected": -117.04830169677734, "step": 44580 }, { "epoch": 2.58, "grad_norm": 2.1728596687316895, "learning_rate": 0.00014005959982971478, "logits/chosen": -14.146804809570312, "logits/rejected": -13.768819808959961, "logps/chosen": -2674.30419921875, "logps/rejected": -2799.465576171875, "loss": 4.3965, "rewards/accuracies": 0.5, "rewards/chosen": -69.81591796875, "rewards/margins": 12.109664916992188, "rewards/rejected": -81.92557525634766, "step": 44590 }, { "epoch": 2.58, "grad_norm": 3.0634177505817206e-07, "learning_rate": 0.0001398660938890824, "logits/chosen": -17.924339294433594, "logits/rejected": -17.649608612060547, "logps/chosen": -2788.71044921875, "logps/rejected": -2966.18798828125, "loss": 1.5896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -146.90725708007812, "rewards/margins": 17.490955352783203, "rewards/rejected": -164.39822387695312, "step": 44600 }, { "epoch": 2.58, "grad_norm": 12.228687286376953, "learning_rate": 0.00013967258794845002, "logits/chosen": -18.165546417236328, "logits/rejected": -20.417354583740234, "logps/chosen": -3492.534423828125, "logps/rejected": -3393.80419921875, "loss": 14.1985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.04148864746094, "rewards/margins": -6.884586334228516, "rewards/rejected": -158.1569061279297, "step": 44610 }, { "epoch": 2.58, "grad_norm": 1.8140696056434535e-07, "learning_rate": 0.00013947908200781763, "logits/chosen": -20.8675594329834, "logits/rejected": -23.370731353759766, "logps/chosen": -3002.00439453125, "logps/rejected": -3134.056396484375, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -197.30538940429688, "rewards/margins": 16.081806182861328, "rewards/rejected": -213.38720703125, "step": 44620 }, { "epoch": 2.58, "grad_norm": 5.41701078414917, "learning_rate": 0.00013928557606718525, "logits/chosen": -19.506498336791992, "logits/rejected": -18.912038803100586, "logps/chosen": -2803.729736328125, "logps/rejected": -2862.156494140625, "loss": 7.5476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -243.75625610351562, "rewards/margins": 1.3390487432479858, "rewards/rejected": -245.0952911376953, "step": 44630 }, { "epoch": 2.58, "grad_norm": 5.625452104141004e-09, "learning_rate": 0.0001390920701265529, "logits/chosen": -20.44717025756836, "logits/rejected": -21.959829330444336, "logps/chosen": -3023.91455078125, "logps/rejected": -2993.24365234375, "loss": 6.6206, "rewards/accuracies": 0.5, "rewards/chosen": -230.40029907226562, "rewards/margins": 2.5195422172546387, "rewards/rejected": -232.91983032226562, "step": 44640 }, { "epoch": 2.58, "grad_norm": 3.490638333936147e-10, "learning_rate": 0.0001388985641859205, "logits/chosen": -18.35860252380371, "logits/rejected": -18.20172691345215, "logps/chosen": -2037.718017578125, "logps/rejected": -2389.6083984375, "loss": 2.2776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -169.35377502441406, "rewards/margins": 9.102311134338379, "rewards/rejected": -178.45608520507812, "step": 44650 }, { "epoch": 2.59, "grad_norm": 51.5148811340332, "learning_rate": 0.00013870505824528816, "logits/chosen": -19.864606857299805, "logits/rejected": -22.187904357910156, "logps/chosen": -2921.0205078125, "logps/rejected": -2807.474853515625, "loss": 17.2851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -219.9871368408203, "rewards/margins": 6.060159206390381, "rewards/rejected": -226.0472869873047, "step": 44660 }, { "epoch": 2.59, "grad_norm": 3.122371197150642e-07, "learning_rate": 0.00013851155230465575, "logits/chosen": -18.78577995300293, "logits/rejected": -19.561023712158203, "logps/chosen": -3077.12451171875, "logps/rejected": -2948.12255859375, "loss": 3.2366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.73005676269531, "rewards/margins": 25.67119789123535, "rewards/rejected": -149.40126037597656, "step": 44670 }, { "epoch": 2.59, "grad_norm": 121.61823272705078, "learning_rate": 0.00013831804636402336, "logits/chosen": -17.816211700439453, "logits/rejected": -18.999391555786133, "logps/chosen": -3098.50341796875, "logps/rejected": -3008.825927734375, "loss": 3.8769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -232.87655639648438, "rewards/margins": 5.57871150970459, "rewards/rejected": -238.45529174804688, "step": 44680 }, { "epoch": 2.59, "grad_norm": 2.0416295528411865, "learning_rate": 0.000138124540423391, "logits/chosen": -17.73343276977539, "logits/rejected": -17.686023712158203, "logps/chosen": -2875.9033203125, "logps/rejected": -2708.83154296875, "loss": 6.4644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.24246215820312, "rewards/margins": 2.5381157398223877, "rewards/rejected": -190.7805633544922, "step": 44690 }, { "epoch": 2.59, "grad_norm": 2.8346996307373047, "learning_rate": 0.00013793103448275863, "logits/chosen": -16.400920867919922, "logits/rejected": -17.13607406616211, "logps/chosen": -2951.67236328125, "logps/rejected": -2988.78369140625, "loss": 2.9198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.21627807617188, "rewards/margins": 9.057416915893555, "rewards/rejected": -190.27371215820312, "step": 44700 }, { "epoch": 2.59, "grad_norm": 270.02703857421875, "learning_rate": 0.00013773752854212624, "logits/chosen": -16.591318130493164, "logits/rejected": -18.187267303466797, "logps/chosen": -3035.6962890625, "logps/rejected": -2880.554443359375, "loss": 7.1606, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -176.1472625732422, "rewards/margins": 0.9325809478759766, "rewards/rejected": -177.079833984375, "step": 44710 }, { "epoch": 2.59, "grad_norm": 64.15083312988281, "learning_rate": 0.00013754402260149386, "logits/chosen": -17.44197654724121, "logits/rejected": -18.088531494140625, "logps/chosen": -2767.211669921875, "logps/rejected": -2615.956298828125, "loss": 2.7268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -203.88540649414062, "rewards/margins": 8.009854316711426, "rewards/rejected": -211.895263671875, "step": 44720 }, { "epoch": 2.59, "grad_norm": 0.04972858354449272, "learning_rate": 0.00013735051666086148, "logits/chosen": -16.689796447753906, "logits/rejected": -17.636356353759766, "logps/chosen": -3322.11328125, "logps/rejected": -3092.932373046875, "loss": 3.423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.80328369140625, "rewards/margins": 2.1777138710021973, "rewards/rejected": -134.9810028076172, "step": 44730 }, { "epoch": 2.59, "grad_norm": 0.01071309857070446, "learning_rate": 0.00013715701072022912, "logits/chosen": -14.63517951965332, "logits/rejected": -14.859138488769531, "logps/chosen": -3289.987548828125, "logps/rejected": -3196.163330078125, "loss": 4.1213, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -178.29763793945312, "rewards/margins": -0.3961414396762848, "rewards/rejected": -177.90151977539062, "step": 44740 }, { "epoch": 2.59, "grad_norm": 0.0001574292400619015, "learning_rate": 0.00013696350477959674, "logits/chosen": -16.429903030395508, "logits/rejected": -19.3804931640625, "logps/chosen": -3067.172119140625, "logps/rejected": -3266.231201171875, "loss": 2.5959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -161.75816345214844, "rewards/margins": 17.79895782470703, "rewards/rejected": -179.55714416503906, "step": 44750 }, { "epoch": 2.59, "grad_norm": 0.00393647188320756, "learning_rate": 0.00013676999883896436, "logits/chosen": -15.677408218383789, "logits/rejected": -18.137039184570312, "logps/chosen": -3564.77734375, "logps/rejected": -3294.623779296875, "loss": 27.1179, "rewards/accuracies": 0.5, "rewards/chosen": -200.51966857910156, "rewards/margins": -22.696033477783203, "rewards/rejected": -177.82363891601562, "step": 44760 }, { "epoch": 2.59, "grad_norm": 0.0017296798760071397, "learning_rate": 0.000136576492898332, "logits/chosen": -19.709150314331055, "logits/rejected": -20.018447875976562, "logps/chosen": -2557.232421875, "logps/rejected": -2848.832275390625, "loss": 1.9962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.78366088867188, "rewards/margins": 23.81341552734375, "rewards/rejected": -221.59707641601562, "step": 44770 }, { "epoch": 2.59, "grad_norm": 5.482067584991455, "learning_rate": 0.0001363829869576996, "logits/chosen": -18.772979736328125, "logits/rejected": -22.830127716064453, "logps/chosen": -2659.508056640625, "logps/rejected": -2533.742919921875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -215.08407592773438, "rewards/margins": 11.651426315307617, "rewards/rejected": -226.7355194091797, "step": 44780 }, { "epoch": 2.59, "grad_norm": 12.947394371032715, "learning_rate": 0.00013618948101706724, "logits/chosen": -17.833473205566406, "logits/rejected": -18.560304641723633, "logps/chosen": -2906.0927734375, "logps/rejected": -2975.31005859375, "loss": 2.1392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -218.67446899414062, "rewards/margins": 6.291945457458496, "rewards/rejected": -224.96640014648438, "step": 44790 }, { "epoch": 2.59, "grad_norm": 0.040835533291101456, "learning_rate": 0.00013599597507643485, "logits/chosen": -17.881547927856445, "logits/rejected": -17.923742294311523, "logps/chosen": -2982.52978515625, "logps/rejected": -3043.29443359375, "loss": 1.2073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -155.53842163085938, "rewards/margins": 8.674242973327637, "rewards/rejected": -164.21263122558594, "step": 44800 }, { "epoch": 2.59, "grad_norm": 656.11328125, "learning_rate": 0.00013580246913580247, "logits/chosen": -17.663558959960938, "logits/rejected": -18.23033905029297, "logps/chosen": -2898.85302734375, "logps/rejected": -2804.0244140625, "loss": 7.9794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -183.56301879882812, "rewards/margins": -0.9546686410903931, "rewards/rejected": -182.6083221435547, "step": 44810 }, { "epoch": 2.59, "grad_norm": 0.002841006498783827, "learning_rate": 0.00013560896319517011, "logits/chosen": -18.625673294067383, "logits/rejected": -17.780231475830078, "logps/chosen": -2749.345458984375, "logps/rejected": -2625.10107421875, "loss": 2.0271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -211.4658966064453, "rewards/margins": 7.6904754638671875, "rewards/rejected": -219.15634155273438, "step": 44820 }, { "epoch": 2.59, "grad_norm": 38.747230529785156, "learning_rate": 0.0001354154572545377, "logits/chosen": -17.44405746459961, "logits/rejected": -19.712276458740234, "logps/chosen": -3064.232666015625, "logps/rejected": -2934.16015625, "loss": 5.64, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.2872314453125, "rewards/margins": 10.710525512695312, "rewards/rejected": -215.9977264404297, "step": 44830 }, { "epoch": 2.6, "grad_norm": 0.0012871139915660024, "learning_rate": 0.00013522195131390532, "logits/chosen": -17.248306274414062, "logits/rejected": -17.44149398803711, "logps/chosen": -2853.895263671875, "logps/rejected": -2555.53662109375, "loss": 3.0392, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.5576171875, "rewards/margins": 5.399465084075928, "rewards/rejected": -181.95709228515625, "step": 44840 }, { "epoch": 2.6, "grad_norm": 8.720500773051754e-06, "learning_rate": 0.00013502844537327297, "logits/chosen": -16.571453094482422, "logits/rejected": -15.988334655761719, "logps/chosen": -2833.954833984375, "logps/rejected": -2572.712646484375, "loss": 1.7111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.51504516601562, "rewards/margins": 6.644125461578369, "rewards/rejected": -197.15914916992188, "step": 44850 }, { "epoch": 2.6, "grad_norm": 0.00022933149011805654, "learning_rate": 0.00013483493943264058, "logits/chosen": -18.4139347076416, "logits/rejected": -19.876564025878906, "logps/chosen": -2813.5576171875, "logps/rejected": -2534.54638671875, "loss": 0.7032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.85748291015625, "rewards/margins": 9.360373497009277, "rewards/rejected": -191.2178497314453, "step": 44860 }, { "epoch": 2.6, "grad_norm": 53.18843078613281, "learning_rate": 0.00013464143349200823, "logits/chosen": -16.498310089111328, "logits/rejected": -18.214956283569336, "logps/chosen": -3144.027099609375, "logps/rejected": -3088.78955078125, "loss": 6.1371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -204.39453125, "rewards/margins": 6.281029224395752, "rewards/rejected": -210.6755828857422, "step": 44870 }, { "epoch": 2.6, "grad_norm": 0.0347098708152771, "learning_rate": 0.00013444792755137584, "logits/chosen": -15.967874526977539, "logits/rejected": -15.572607040405273, "logps/chosen": -2840.314453125, "logps/rejected": -2865.407470703125, "loss": 1.5005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.72259521484375, "rewards/margins": 3.125777006149292, "rewards/rejected": -174.84837341308594, "step": 44880 }, { "epoch": 2.6, "grad_norm": 62.01921081542969, "learning_rate": 0.00013425442161074343, "logits/chosen": -14.592846870422363, "logits/rejected": -14.255317687988281, "logps/chosen": -3178.35888671875, "logps/rejected": -2532.76123046875, "loss": 6.0341, "rewards/accuracies": 0.5, "rewards/chosen": -162.0535888671875, "rewards/margins": 3.702667236328125, "rewards/rejected": -165.75624084472656, "step": 44890 }, { "epoch": 2.6, "grad_norm": 0.0002740852360147983, "learning_rate": 0.00013406091567011108, "logits/chosen": -18.597925186157227, "logits/rejected": -18.898204803466797, "logps/chosen": -2870.837158203125, "logps/rejected": -3045.499267578125, "loss": 2.3729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -257.1164855957031, "rewards/margins": 11.254003524780273, "rewards/rejected": -268.3704833984375, "step": 44900 }, { "epoch": 2.6, "grad_norm": 3.0382673373713365e-13, "learning_rate": 0.0001338674097294787, "logits/chosen": -14.79261302947998, "logits/rejected": -15.135826110839844, "logps/chosen": -3033.397705078125, "logps/rejected": -2690.76220703125, "loss": 3.0556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.99978637695312, "rewards/margins": 8.924159049987793, "rewards/rejected": -163.92393493652344, "step": 44910 }, { "epoch": 2.6, "grad_norm": 3.990847582957535e-11, "learning_rate": 0.0001336739037888463, "logits/chosen": -19.170730590820312, "logits/rejected": -21.798463821411133, "logps/chosen": -2782.70361328125, "logps/rejected": -2651.510986328125, "loss": 0.8526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.5557403564453, "rewards/margins": 12.931846618652344, "rewards/rejected": -167.48757934570312, "step": 44920 }, { "epoch": 2.6, "grad_norm": 38.69377136230469, "learning_rate": 0.00013348039784821396, "logits/chosen": -18.093975067138672, "logits/rejected": -17.83950424194336, "logps/chosen": -3094.461669921875, "logps/rejected": -2894.442138671875, "loss": 7.0966, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -181.38046264648438, "rewards/margins": -3.0033347606658936, "rewards/rejected": -178.37713623046875, "step": 44930 }, { "epoch": 2.6, "grad_norm": 2.858651839687809e-07, "learning_rate": 0.00013328689190758155, "logits/chosen": -17.069406509399414, "logits/rejected": -19.485219955444336, "logps/chosen": -2929.584228515625, "logps/rejected": -2864.59130859375, "loss": 2.1583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -220.8673553466797, "rewards/margins": 8.692533493041992, "rewards/rejected": -229.5598907470703, "step": 44940 }, { "epoch": 2.6, "grad_norm": 69.51779174804688, "learning_rate": 0.0001330933859669492, "logits/chosen": -17.430843353271484, "logits/rejected": -17.94037628173828, "logps/chosen": -2492.47021484375, "logps/rejected": -2623.00537109375, "loss": 5.8602, "rewards/accuracies": 0.5, "rewards/chosen": -144.47268676757812, "rewards/margins": 1.9121805429458618, "rewards/rejected": -146.38485717773438, "step": 44950 }, { "epoch": 2.6, "grad_norm": 0.0005500839906744659, "learning_rate": 0.0001328998800263168, "logits/chosen": -16.942487716674805, "logits/rejected": -18.023395538330078, "logps/chosen": -2993.348388671875, "logps/rejected": -3008.20751953125, "loss": 1.5609, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -186.8707275390625, "rewards/margins": 6.797316074371338, "rewards/rejected": -193.66806030273438, "step": 44960 }, { "epoch": 2.6, "grad_norm": 3.5702171911390374e-15, "learning_rate": 0.00013270637408568443, "logits/chosen": -17.95224380493164, "logits/rejected": -19.103181838989258, "logps/chosen": -2937.094970703125, "logps/rejected": -2891.787353515625, "loss": 4.5089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -110.47920227050781, "rewards/margins": 13.377168655395508, "rewards/rejected": -123.85638427734375, "step": 44970 }, { "epoch": 2.6, "grad_norm": 90.1292495727539, "learning_rate": 0.00013251286814505207, "logits/chosen": -14.998196601867676, "logits/rejected": -14.928136825561523, "logps/chosen": -3021.3349609375, "logps/rejected": -3024.460205078125, "loss": 1.8077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.80264282226562, "rewards/margins": 7.324930667877197, "rewards/rejected": -159.1275634765625, "step": 44980 }, { "epoch": 2.6, "grad_norm": 0.00032255426049232483, "learning_rate": 0.0001323193622044197, "logits/chosen": -16.415454864501953, "logits/rejected": -15.367387771606445, "logps/chosen": -3011.379150390625, "logps/rejected": -2861.93017578125, "loss": 5.9036, "rewards/accuracies": 0.5, "rewards/chosen": -103.45440673828125, "rewards/margins": 1.0293976068496704, "rewards/rejected": -104.48380279541016, "step": 44990 }, { "epoch": 2.6, "grad_norm": 6.692032814025879, "learning_rate": 0.0001321258562637873, "logits/chosen": -17.294261932373047, "logits/rejected": -17.27100372314453, "logps/chosen": -2469.820556640625, "logps/rejected": -3116.25439453125, "loss": 4.1608, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -188.4612274169922, "rewards/margins": 12.817797660827637, "rewards/rejected": -201.27902221679688, "step": 45000 }, { "epoch": 2.61, "grad_norm": 11.778629302978516, "learning_rate": 0.00013193235032315492, "logits/chosen": -18.900455474853516, "logits/rejected": -20.09762191772461, "logps/chosen": -2397.211181640625, "logps/rejected": -2363.90087890625, "loss": 3.7241, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -214.7603759765625, "rewards/margins": -0.8917228579521179, "rewards/rejected": -213.86865234375, "step": 45010 }, { "epoch": 2.61, "grad_norm": 65.5177993774414, "learning_rate": 0.00013173884438252254, "logits/chosen": -17.0584774017334, "logits/rejected": -18.64912986755371, "logps/chosen": -2962.130126953125, "logps/rejected": -2700.572509765625, "loss": 0.2937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -218.28573608398438, "rewards/margins": 11.61815071105957, "rewards/rejected": -229.90390014648438, "step": 45020 }, { "epoch": 2.61, "grad_norm": 147.9893035888672, "learning_rate": 0.00013154533844189018, "logits/chosen": -16.89753532409668, "logits/rejected": -16.66330337524414, "logps/chosen": -2962.489013671875, "logps/rejected": -2870.645751953125, "loss": 8.617, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -177.95094299316406, "rewards/margins": -0.9063301086425781, "rewards/rejected": -177.0446319580078, "step": 45030 }, { "epoch": 2.61, "grad_norm": 0.0013682227581739426, "learning_rate": 0.0001313518325012578, "logits/chosen": -16.701221466064453, "logits/rejected": -17.952600479125977, "logps/chosen": -2745.282470703125, "logps/rejected": -2897.551025390625, "loss": 0.903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.9308624267578, "rewards/margins": 18.640438079833984, "rewards/rejected": -155.57130432128906, "step": 45040 }, { "epoch": 2.61, "grad_norm": 0.011730210855603218, "learning_rate": 0.0001311583265606254, "logits/chosen": -13.769554138183594, "logits/rejected": -13.886919021606445, "logps/chosen": -2956.700927734375, "logps/rejected": -3220.68994140625, "loss": 8.669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -181.0659637451172, "rewards/margins": -3.891162157058716, "rewards/rejected": -177.1748046875, "step": 45050 }, { "epoch": 2.61, "grad_norm": 92.11688232421875, "learning_rate": 0.00013096482061999303, "logits/chosen": -19.823528289794922, "logits/rejected": -23.513198852539062, "logps/chosen": -3459.46728515625, "logps/rejected": -2999.43408203125, "loss": 14.6455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -236.37496948242188, "rewards/margins": -3.6018295288085938, "rewards/rejected": -232.7731475830078, "step": 45060 }, { "epoch": 2.61, "grad_norm": 82.95674133300781, "learning_rate": 0.00013077131467936065, "logits/chosen": -18.658065795898438, "logits/rejected": -21.247785568237305, "logps/chosen": -2702.462158203125, "logps/rejected": -3073.025390625, "loss": 3.4854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -190.19931030273438, "rewards/margins": 13.119470596313477, "rewards/rejected": -203.3187713623047, "step": 45070 }, { "epoch": 2.61, "grad_norm": 3.4440888612152776e-06, "learning_rate": 0.0001305778087387283, "logits/chosen": -17.463882446289062, "logits/rejected": -19.219213485717773, "logps/chosen": -2982.88232421875, "logps/rejected": -2673.87548828125, "loss": 2.0733, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -222.0325469970703, "rewards/margins": 19.705303192138672, "rewards/rejected": -241.7378692626953, "step": 45080 }, { "epoch": 2.61, "grad_norm": 9.484188922215253e-06, "learning_rate": 0.0001303843027980959, "logits/chosen": -20.71733856201172, "logits/rejected": -19.32408905029297, "logps/chosen": -3109.888671875, "logps/rejected": -3161.502685546875, "loss": 2.2932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -245.4076385498047, "rewards/margins": 9.011493682861328, "rewards/rejected": -254.4191131591797, "step": 45090 }, { "epoch": 2.61, "grad_norm": 8.763112418819219e-06, "learning_rate": 0.0001301907968574635, "logits/chosen": -21.311246871948242, "logits/rejected": -20.65773582458496, "logps/chosen": -2342.080078125, "logps/rejected": -2561.8671875, "loss": 2.9376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.3358612060547, "rewards/margins": 24.491962432861328, "rewards/rejected": -192.82781982421875, "step": 45100 }, { "epoch": 2.61, "grad_norm": 0.1617920696735382, "learning_rate": 0.00012999729091683115, "logits/chosen": -20.189958572387695, "logits/rejected": -21.222087860107422, "logps/chosen": -2575.433837890625, "logps/rejected": -2903.3916015625, "loss": 0.17, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -223.12832641601562, "rewards/margins": 9.052682876586914, "rewards/rejected": -232.18099975585938, "step": 45110 }, { "epoch": 2.61, "grad_norm": 1.9815845489501953, "learning_rate": 0.00012980378497619877, "logits/chosen": -15.12324333190918, "logits/rejected": -15.060627937316895, "logps/chosen": -3095.7890625, "logps/rejected": -3030.077392578125, "loss": 4.686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -152.99598693847656, "rewards/margins": 5.010659217834473, "rewards/rejected": -158.00665283203125, "step": 45120 }, { "epoch": 2.61, "grad_norm": 0.18510758876800537, "learning_rate": 0.00012961027903556638, "logits/chosen": -19.620405197143555, "logits/rejected": -23.211824417114258, "logps/chosen": -3215.869873046875, "logps/rejected": -2939.758544921875, "loss": 2.0321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -210.2102508544922, "rewards/margins": 12.556729316711426, "rewards/rejected": -222.76699829101562, "step": 45130 }, { "epoch": 2.61, "grad_norm": 1.7083316167888127e-17, "learning_rate": 0.00012941677309493403, "logits/chosen": -16.699888229370117, "logits/rejected": -18.558277130126953, "logps/chosen": -3220.80810546875, "logps/rejected": -2917.65380859375, "loss": 3.392, "rewards/accuracies": 0.5, "rewards/chosen": -141.08847045898438, "rewards/margins": 12.178861618041992, "rewards/rejected": -153.26731872558594, "step": 45140 }, { "epoch": 2.61, "grad_norm": 0.0, "learning_rate": 0.00012922326715430164, "logits/chosen": -16.51581573486328, "logits/rejected": -16.498519897460938, "logps/chosen": -2548.049072265625, "logps/rejected": -2364.839111328125, "loss": 5.3863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -146.75738525390625, "rewards/margins": 4.737697124481201, "rewards/rejected": -151.49508666992188, "step": 45150 }, { "epoch": 2.61, "grad_norm": 4.181526597191413e-20, "learning_rate": 0.00012902976121366926, "logits/chosen": -19.805110931396484, "logits/rejected": -20.648235321044922, "logps/chosen": -2786.92822265625, "logps/rejected": -2593.97900390625, "loss": 20.5987, "rewards/accuracies": 0.5, "rewards/chosen": -243.52749633789062, "rewards/margins": -11.505929946899414, "rewards/rejected": -232.02157592773438, "step": 45160 }, { "epoch": 2.61, "grad_norm": 4.481819776414042e-11, "learning_rate": 0.00012883625527303688, "logits/chosen": -13.397789001464844, "logits/rejected": -13.360328674316406, "logps/chosen": -3790.526611328125, "logps/rejected": -3162.70947265625, "loss": 2.0307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -65.83684539794922, "rewards/margins": 9.982114791870117, "rewards/rejected": -75.81895446777344, "step": 45170 }, { "epoch": 2.62, "grad_norm": 0.017127856612205505, "learning_rate": 0.0001286427493324045, "logits/chosen": -15.29633617401123, "logits/rejected": -15.464157104492188, "logps/chosen": -2919.69580078125, "logps/rejected": -3213.899169921875, "loss": 0.9679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -157.68409729003906, "rewards/margins": 7.390380859375, "rewards/rejected": -165.07447814941406, "step": 45180 }, { "epoch": 2.62, "grad_norm": 0.0009466262417845428, "learning_rate": 0.00012844924339177214, "logits/chosen": -17.211517333984375, "logits/rejected": -17.40561294555664, "logps/chosen": -2939.69091796875, "logps/rejected": -2858.0390625, "loss": 4.2223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.1245880126953, "rewards/margins": 5.747262477874756, "rewards/rejected": -231.871826171875, "step": 45190 }, { "epoch": 2.62, "grad_norm": 306.1008605957031, "learning_rate": 0.00012825573745113976, "logits/chosen": -16.80646514892578, "logits/rejected": -17.543550491333008, "logps/chosen": -2713.8994140625, "logps/rejected": -2591.734375, "loss": 5.9719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -174.13949584960938, "rewards/margins": 4.5228590965271, "rewards/rejected": -178.662353515625, "step": 45200 }, { "epoch": 2.62, "grad_norm": 82.95870971679688, "learning_rate": 0.0001280622315105074, "logits/chosen": -16.085002899169922, "logits/rejected": -16.230913162231445, "logps/chosen": -3085.794189453125, "logps/rejected": -2898.965576171875, "loss": 14.5206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -159.5064239501953, "rewards/margins": -3.982651472091675, "rewards/rejected": -155.52377319335938, "step": 45210 }, { "epoch": 2.62, "grad_norm": 0.0027074592653661966, "learning_rate": 0.000127868725569875, "logits/chosen": -16.220905303955078, "logits/rejected": -16.657352447509766, "logps/chosen": -2914.42822265625, "logps/rejected": -2841.1865234375, "loss": 0.6587, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -184.19747924804688, "rewards/margins": 18.049291610717773, "rewards/rejected": -202.2467803955078, "step": 45220 }, { "epoch": 2.62, "grad_norm": 0.10982603579759598, "learning_rate": 0.0001276752196292426, "logits/chosen": -15.169013977050781, "logits/rejected": -15.905563354492188, "logps/chosen": -3158.41455078125, "logps/rejected": -3180.274658203125, "loss": 2.6682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -157.49429321289062, "rewards/margins": 6.165982246398926, "rewards/rejected": -163.6602783203125, "step": 45230 }, { "epoch": 2.62, "grad_norm": 2.0672416212619282e-05, "learning_rate": 0.00012748171368861025, "logits/chosen": -17.545948028564453, "logits/rejected": -18.27825164794922, "logps/chosen": -2690.840576171875, "logps/rejected": -2616.288330078125, "loss": 1.3426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -170.2144775390625, "rewards/margins": 12.045936584472656, "rewards/rejected": -182.26040649414062, "step": 45240 }, { "epoch": 2.62, "grad_norm": 5.140998304148381e-13, "learning_rate": 0.00012728820774797787, "logits/chosen": -19.28480339050293, "logits/rejected": -21.136951446533203, "logps/chosen": -2827.007568359375, "logps/rejected": -2707.899658203125, "loss": 19.4415, "rewards/accuracies": 0.5, "rewards/chosen": -243.7369384765625, "rewards/margins": -6.72012186050415, "rewards/rejected": -237.0167999267578, "step": 45250 }, { "epoch": 2.62, "grad_norm": 76.22181701660156, "learning_rate": 0.0001270947018073455, "logits/chosen": -18.850181579589844, "logits/rejected": -18.027149200439453, "logps/chosen": -3026.27587890625, "logps/rejected": -2813.826171875, "loss": 2.8344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.71131896972656, "rewards/margins": 11.480501174926758, "rewards/rejected": -213.1918487548828, "step": 45260 }, { "epoch": 2.62, "grad_norm": 91.3864974975586, "learning_rate": 0.0001269011958667131, "logits/chosen": -17.273426055908203, "logits/rejected": -18.560861587524414, "logps/chosen": -2843.006103515625, "logps/rejected": -2861.064697265625, "loss": 1.7485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -176.02203369140625, "rewards/margins": 11.844383239746094, "rewards/rejected": -187.86642456054688, "step": 45270 }, { "epoch": 2.62, "grad_norm": 135.90309143066406, "learning_rate": 0.00012670768992608072, "logits/chosen": -19.12411880493164, "logits/rejected": -20.783390045166016, "logps/chosen": -2992.71923828125, "logps/rejected": -2881.06640625, "loss": 18.2779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -203.70626831054688, "rewards/margins": -12.707513809204102, "rewards/rejected": -190.99874877929688, "step": 45280 }, { "epoch": 2.62, "grad_norm": 32.879302978515625, "learning_rate": 0.00012651418398544837, "logits/chosen": -17.431493759155273, "logits/rejected": -17.66953468322754, "logps/chosen": -2643.074951171875, "logps/rejected": -2669.78955078125, "loss": 1.3795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -188.0088348388672, "rewards/margins": 7.668679714202881, "rewards/rejected": -195.677490234375, "step": 45290 }, { "epoch": 2.62, "grad_norm": 2.7617903519683296e-09, "learning_rate": 0.00012632067804481598, "logits/chosen": -15.263232231140137, "logits/rejected": -15.145172119140625, "logps/chosen": -2856.04833984375, "logps/rejected": -2851.969482421875, "loss": 2.8881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.86700439453125, "rewards/margins": 10.159414291381836, "rewards/rejected": -182.0264129638672, "step": 45300 }, { "epoch": 2.62, "grad_norm": 0.017981089651584625, "learning_rate": 0.0001261271721041836, "logits/chosen": -16.642757415771484, "logits/rejected": -17.983631134033203, "logps/chosen": -3130.822021484375, "logps/rejected": -3085.98046875, "loss": 3.0894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -186.07327270507812, "rewards/margins": 5.704540252685547, "rewards/rejected": -191.77780151367188, "step": 45310 }, { "epoch": 2.62, "grad_norm": 0.0008612382807768881, "learning_rate": 0.00012593366616355122, "logits/chosen": -17.415119171142578, "logits/rejected": -19.138023376464844, "logps/chosen": -3036.891357421875, "logps/rejected": -2731.117431640625, "loss": 1.0594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -168.92861938476562, "rewards/margins": 11.250764846801758, "rewards/rejected": -180.1793670654297, "step": 45320 }, { "epoch": 2.62, "grad_norm": 1.444272756576538, "learning_rate": 0.00012574016022291883, "logits/chosen": -13.528963088989258, "logits/rejected": -13.517598152160645, "logps/chosen": -3375.61474609375, "logps/rejected": -2695.206298828125, "loss": 0.2565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -98.21839904785156, "rewards/margins": 14.197413444519043, "rewards/rejected": -112.41580963134766, "step": 45330 }, { "epoch": 2.62, "grad_norm": 15.307496070861816, "learning_rate": 0.00012554665428228648, "logits/chosen": -17.79730987548828, "logits/rejected": -18.85289764404297, "logps/chosen": -2815.03125, "logps/rejected": -2586.21728515625, "loss": 0.3232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.58663940429688, "rewards/margins": 12.26953411102295, "rewards/rejected": -183.8561553955078, "step": 45340 }, { "epoch": 2.63, "grad_norm": 0.007266778964549303, "learning_rate": 0.0001253531483416541, "logits/chosen": -15.4501371383667, "logits/rejected": -15.365107536315918, "logps/chosen": -2885.210205078125, "logps/rejected": -2732.48681640625, "loss": 1.6323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -103.32341003417969, "rewards/margins": 5.115061283111572, "rewards/rejected": -108.4384765625, "step": 45350 }, { "epoch": 2.63, "grad_norm": 0.6580430269241333, "learning_rate": 0.0001251596424010217, "logits/chosen": -16.381803512573242, "logits/rejected": -16.39959144592285, "logps/chosen": -2715.136962890625, "logps/rejected": -2741.139404296875, "loss": 1.0838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.58303833007812, "rewards/margins": 7.0293169021606445, "rewards/rejected": -141.61236572265625, "step": 45360 }, { "epoch": 2.63, "grad_norm": 0.03532774746417999, "learning_rate": 0.00012496613646038933, "logits/chosen": -15.604253768920898, "logits/rejected": -16.213647842407227, "logps/chosen": -3229.02392578125, "logps/rejected": -2897.833251953125, "loss": 2.9728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -147.88693237304688, "rewards/margins": 5.093113899230957, "rewards/rejected": -152.9800567626953, "step": 45370 }, { "epoch": 2.63, "grad_norm": 0.08148425072431564, "learning_rate": 0.00012477263051975695, "logits/chosen": -17.05118751525879, "logits/rejected": -17.715171813964844, "logps/chosen": -3034.604248046875, "logps/rejected": -3048.099609375, "loss": 17.9693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -171.41293334960938, "rewards/margins": -13.615972518920898, "rewards/rejected": -157.79696655273438, "step": 45380 }, { "epoch": 2.63, "grad_norm": 0.15148282051086426, "learning_rate": 0.0001245791245791246, "logits/chosen": -19.216846466064453, "logits/rejected": -20.04242515563965, "logps/chosen": -2765.95068359375, "logps/rejected": -2647.489501953125, "loss": 9.0059, "rewards/accuracies": 0.5, "rewards/chosen": -244.5430908203125, "rewards/margins": -3.185220241546631, "rewards/rejected": -241.35787963867188, "step": 45390 }, { "epoch": 2.63, "grad_norm": 0.20919083058834076, "learning_rate": 0.0001243856186384922, "logits/chosen": -15.874940872192383, "logits/rejected": -15.5685453414917, "logps/chosen": -2657.468505859375, "logps/rejected": -2725.7529296875, "loss": 15.2104, "rewards/accuracies": 0.5, "rewards/chosen": -159.54901123046875, "rewards/margins": -9.79541301727295, "rewards/rejected": -149.75360107421875, "step": 45400 }, { "epoch": 2.63, "grad_norm": 1.3030157089233398, "learning_rate": 0.00012419211269785983, "logits/chosen": -17.988506317138672, "logits/rejected": -19.05315589904785, "logps/chosen": -2917.76025390625, "logps/rejected": -3004.508544921875, "loss": 0.7453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -217.9678497314453, "rewards/margins": 12.263504028320312, "rewards/rejected": -230.2313232421875, "step": 45410 }, { "epoch": 2.63, "grad_norm": 2.0684826374053955, "learning_rate": 0.00012399860675722744, "logits/chosen": -13.637434005737305, "logits/rejected": -13.905415534973145, "logps/chosen": -3129.92041015625, "logps/rejected": -3117.53271484375, "loss": 4.2515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -117.79020690917969, "rewards/margins": 10.682645797729492, "rewards/rejected": -128.4728546142578, "step": 45420 }, { "epoch": 2.63, "grad_norm": 0.03818189352750778, "learning_rate": 0.00012380510081659506, "logits/chosen": -18.247112274169922, "logits/rejected": -20.671632766723633, "logps/chosen": -3089.729248046875, "logps/rejected": -2824.666748046875, "loss": 1.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -233.84329223632812, "rewards/margins": 6.991396903991699, "rewards/rejected": -240.8346710205078, "step": 45430 }, { "epoch": 2.63, "grad_norm": 1.2245135307312012, "learning_rate": 0.0001236115948759627, "logits/chosen": -18.31026268005371, "logits/rejected": -18.082544326782227, "logps/chosen": -2830.975341796875, "logps/rejected": -2542.262451171875, "loss": 1.1186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.29782104492188, "rewards/margins": 3.9926936626434326, "rewards/rejected": -218.29052734375, "step": 45440 }, { "epoch": 2.63, "grad_norm": 3.961300080845831e-06, "learning_rate": 0.00012341808893533032, "logits/chosen": -16.8262939453125, "logits/rejected": -16.667387008666992, "logps/chosen": -2886.242919921875, "logps/rejected": -2664.739990234375, "loss": 3.0264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -152.96588134765625, "rewards/margins": 3.2153220176696777, "rewards/rejected": -156.1811981201172, "step": 45450 }, { "epoch": 2.63, "grad_norm": 0.000339982274454087, "learning_rate": 0.00012322458299469794, "logits/chosen": -17.5198974609375, "logits/rejected": -16.516799926757812, "logps/chosen": -2993.45556640625, "logps/rejected": -2548.85205078125, "loss": 4.8991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -148.14129638671875, "rewards/margins": -0.28863397240638733, "rewards/rejected": -147.8526611328125, "step": 45460 }, { "epoch": 2.63, "grad_norm": 81.20372772216797, "learning_rate": 0.00012303107705406556, "logits/chosen": -18.752059936523438, "logits/rejected": -19.212736129760742, "logps/chosen": -2739.37353515625, "logps/rejected": -2864.288330078125, "loss": 1.2795, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -245.0765838623047, "rewards/margins": 6.8972368240356445, "rewards/rejected": -251.9738006591797, "step": 45470 }, { "epoch": 2.63, "grad_norm": 0.5576460361480713, "learning_rate": 0.0001228375711134332, "logits/chosen": -16.380224227905273, "logits/rejected": -17.678394317626953, "logps/chosen": -2641.80126953125, "logps/rejected": -2499.8740234375, "loss": 25.1022, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -190.40093994140625, "rewards/margins": -21.34210205078125, "rewards/rejected": -169.058837890625, "step": 45480 }, { "epoch": 2.63, "grad_norm": 36.21331024169922, "learning_rate": 0.00012264406517280082, "logits/chosen": -18.012025833129883, "logits/rejected": -17.01126480102539, "logps/chosen": -2393.57373046875, "logps/rejected": -2651.138916015625, "loss": 5.0384, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -146.59445190429688, "rewards/margins": 23.800600051879883, "rewards/rejected": -170.3950653076172, "step": 45490 }, { "epoch": 2.63, "grad_norm": 3.114759838940273e-11, "learning_rate": 0.00012245055923216843, "logits/chosen": -14.904937744140625, "logits/rejected": -15.961230278015137, "logps/chosen": -3041.294677734375, "logps/rejected": -2561.79345703125, "loss": 1.584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -131.65609741210938, "rewards/margins": 15.239115715026855, "rewards/rejected": -146.89523315429688, "step": 45500 }, { "epoch": 2.63, "grad_norm": 3.425639633070077e-11, "learning_rate": 0.00012225705329153605, "logits/chosen": -18.4251766204834, "logits/rejected": -20.522212982177734, "logps/chosen": -2669.94287109375, "logps/rejected": -2786.011962890625, "loss": 6.0449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -211.1191864013672, "rewards/margins": 6.432009220123291, "rewards/rejected": -217.5512237548828, "step": 45510 }, { "epoch": 2.63, "grad_norm": 2.3182477951049805, "learning_rate": 0.00012206354735090368, "logits/chosen": -14.564416885375977, "logits/rejected": -14.575727462768555, "logps/chosen": -2912.750732421875, "logps/rejected": -2533.19091796875, "loss": 0.5897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -158.5373077392578, "rewards/margins": 8.461682319641113, "rewards/rejected": -166.99899291992188, "step": 45520 }, { "epoch": 2.64, "grad_norm": 0.0033816187642514706, "learning_rate": 0.0001218700414102713, "logits/chosen": -15.557479858398438, "logits/rejected": -16.029613494873047, "logps/chosen": -3337.96728515625, "logps/rejected": -3108.73974609375, "loss": 2.4561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.47976684570312, "rewards/margins": 5.935494422912598, "rewards/rejected": -150.41525268554688, "step": 45530 }, { "epoch": 2.64, "grad_norm": 0.012713325209915638, "learning_rate": 0.00012167653546963892, "logits/chosen": -16.149856567382812, "logits/rejected": -16.537322998046875, "logps/chosen": -2991.062744140625, "logps/rejected": -3045.178466796875, "loss": 3.181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -189.12376403808594, "rewards/margins": 9.554469108581543, "rewards/rejected": -198.67823791503906, "step": 45540 }, { "epoch": 2.64, "grad_norm": 6.280243269429775e-06, "learning_rate": 0.00012148302952900655, "logits/chosen": -15.473688125610352, "logits/rejected": -19.044200897216797, "logps/chosen": -3443.905029296875, "logps/rejected": -3380.104248046875, "loss": 1.7013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -165.88217163085938, "rewards/margins": 13.60619068145752, "rewards/rejected": -179.4883575439453, "step": 45550 }, { "epoch": 2.64, "grad_norm": 79.76459503173828, "learning_rate": 0.00012128952358837417, "logits/chosen": -15.598172187805176, "logits/rejected": -17.452016830444336, "logps/chosen": -3371.833251953125, "logps/rejected": -3105.38134765625, "loss": 24.4754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.70565795898438, "rewards/margins": -11.66661262512207, "rewards/rejected": -138.03903198242188, "step": 45560 }, { "epoch": 2.64, "grad_norm": 27.848793029785156, "learning_rate": 0.0001210960176477418, "logits/chosen": -16.490291595458984, "logits/rejected": -17.918046951293945, "logps/chosen": -3138.20703125, "logps/rejected": -2991.8017578125, "loss": 2.2829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -162.35238647460938, "rewards/margins": 10.003999710083008, "rewards/rejected": -172.35638427734375, "step": 45570 }, { "epoch": 2.64, "grad_norm": 0.00016870298713911325, "learning_rate": 0.0001209025117071094, "logits/chosen": -15.00133228302002, "logits/rejected": -15.176080703735352, "logps/chosen": -3226.958984375, "logps/rejected": -3258.871826171875, "loss": 0.5771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -175.24560546875, "rewards/margins": 9.93692398071289, "rewards/rejected": -185.1825408935547, "step": 45580 }, { "epoch": 2.64, "grad_norm": 2.2176501750946045, "learning_rate": 0.00012070900576647703, "logits/chosen": -16.93991470336914, "logits/rejected": -17.052703857421875, "logps/chosen": -3312.579345703125, "logps/rejected": -3357.88818359375, "loss": 3.091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.77508544921875, "rewards/margins": 3.318683624267578, "rewards/rejected": -152.09378051757812, "step": 45590 }, { "epoch": 2.64, "grad_norm": 56.2535285949707, "learning_rate": 0.00012051549982584466, "logits/chosen": -17.433635711669922, "logits/rejected": -18.22675323486328, "logps/chosen": -2981.49169921875, "logps/rejected": -2857.498291015625, "loss": 2.8088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -191.18333435058594, "rewards/margins": 9.195390701293945, "rewards/rejected": -200.3787384033203, "step": 45600 }, { "epoch": 2.64, "grad_norm": 0.0, "learning_rate": 0.00012032199388521228, "logits/chosen": -14.552980422973633, "logits/rejected": -15.034339904785156, "logps/chosen": -2930.21337890625, "logps/rejected": -2650.69091796875, "loss": 3.645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -176.25265502929688, "rewards/margins": 8.140373229980469, "rewards/rejected": -184.39303588867188, "step": 45610 }, { "epoch": 2.64, "grad_norm": 272.4520568847656, "learning_rate": 0.0001201284879445799, "logits/chosen": -13.733866691589355, "logits/rejected": -13.787490844726562, "logps/chosen": -3280.801513671875, "logps/rejected": -2743.58740234375, "loss": 1.6447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -91.50444793701172, "rewards/margins": 8.614161491394043, "rewards/rejected": -100.11860656738281, "step": 45620 }, { "epoch": 2.64, "grad_norm": 21.9542179107666, "learning_rate": 0.00011993498200394753, "logits/chosen": -15.524194717407227, "logits/rejected": -16.722736358642578, "logps/chosen": -3038.53076171875, "logps/rejected": -2951.428955078125, "loss": 2.9345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -226.41683959960938, "rewards/margins": 3.7976233959198, "rewards/rejected": -230.21444702148438, "step": 45630 }, { "epoch": 2.64, "grad_norm": 0.01340670045465231, "learning_rate": 0.00011974147606331514, "logits/chosen": -16.555316925048828, "logits/rejected": -19.768693923950195, "logps/chosen": -3254.002197265625, "logps/rejected": -2415.5810546875, "loss": 30.5645, "rewards/accuracies": 0.5, "rewards/chosen": -221.88644409179688, "rewards/margins": -20.720874786376953, "rewards/rejected": -201.16555786132812, "step": 45640 }, { "epoch": 2.64, "grad_norm": 1.7155227396870032e-05, "learning_rate": 0.00011954797012268277, "logits/chosen": -16.197153091430664, "logits/rejected": -17.23193359375, "logps/chosen": -2694.326171875, "logps/rejected": -2639.08203125, "loss": 1.2538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -172.0593719482422, "rewards/margins": 19.353322982788086, "rewards/rejected": -191.41268920898438, "step": 45650 }, { "epoch": 2.64, "grad_norm": 0.012099981307983398, "learning_rate": 0.00011935446418205039, "logits/chosen": -18.136760711669922, "logits/rejected": -19.94674301147461, "logps/chosen": -2723.969970703125, "logps/rejected": -2402.33935546875, "loss": 26.6235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -200.609375, "rewards/margins": -18.193702697753906, "rewards/rejected": -182.41566467285156, "step": 45660 }, { "epoch": 2.64, "grad_norm": 19.294837951660156, "learning_rate": 0.00011916095824141801, "logits/chosen": -16.493244171142578, "logits/rejected": -16.451786041259766, "logps/chosen": -2663.63037109375, "logps/rejected": -2396.62744140625, "loss": 0.5737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -147.92581176757812, "rewards/margins": 19.315231323242188, "rewards/rejected": -167.24102783203125, "step": 45670 }, { "epoch": 2.64, "grad_norm": 0.027379726991057396, "learning_rate": 0.00011896745230078564, "logits/chosen": -17.242704391479492, "logits/rejected": -18.01432228088379, "logps/chosen": -3110.657958984375, "logps/rejected": -3064.932373046875, "loss": 0.6911, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -130.90469360351562, "rewards/margins": 6.228913307189941, "rewards/rejected": -137.13363647460938, "step": 45680 }, { "epoch": 2.64, "grad_norm": 39.810997009277344, "learning_rate": 0.00011877394636015326, "logits/chosen": -18.390687942504883, "logits/rejected": -19.551952362060547, "logps/chosen": -2614.29638671875, "logps/rejected": -2425.650390625, "loss": 2.2712, "rewards/accuracies": 0.5, "rewards/chosen": -196.1259765625, "rewards/margins": 5.461941719055176, "rewards/rejected": -201.58792114257812, "step": 45690 }, { "epoch": 2.65, "grad_norm": 71.76618957519531, "learning_rate": 0.00011858044041952089, "logits/chosen": -17.253055572509766, "logits/rejected": -17.407808303833008, "logps/chosen": -2383.40771484375, "logps/rejected": -2170.0947265625, "loss": 2.8023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.82040405273438, "rewards/margins": 6.437375068664551, "rewards/rejected": -145.25778198242188, "step": 45700 }, { "epoch": 2.65, "grad_norm": 1.7016361951828003, "learning_rate": 0.0001183869344788885, "logits/chosen": -15.91480541229248, "logits/rejected": -16.705036163330078, "logps/chosen": -2817.03271484375, "logps/rejected": -2788.43408203125, "loss": 2.2545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.9210662841797, "rewards/margins": 8.387526512145996, "rewards/rejected": -213.30862426757812, "step": 45710 }, { "epoch": 2.65, "grad_norm": 0.0, "learning_rate": 0.00011819342853825612, "logits/chosen": -15.989705085754395, "logits/rejected": -16.007211685180664, "logps/chosen": -3046.41650390625, "logps/rejected": -2536.196533203125, "loss": 5.9386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -126.50801086425781, "rewards/margins": 25.23104476928711, "rewards/rejected": -151.7390594482422, "step": 45720 }, { "epoch": 2.65, "grad_norm": 3.551835106918588e-05, "learning_rate": 0.00011799992259762375, "logits/chosen": -20.37883949279785, "logits/rejected": -20.970151901245117, "logps/chosen": -2692.6279296875, "logps/rejected": -2687.61474609375, "loss": 0.2185, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -231.3951416015625, "rewards/margins": 8.447408676147461, "rewards/rejected": -239.842529296875, "step": 45730 }, { "epoch": 2.65, "grad_norm": 1.8098555756296264e-06, "learning_rate": 0.00011780641665699138, "logits/chosen": -18.929393768310547, "logits/rejected": -20.672462463378906, "logps/chosen": -2976.92822265625, "logps/rejected": -2902.248046875, "loss": 7.396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -190.08816528320312, "rewards/margins": 13.920389175415039, "rewards/rejected": -204.00856018066406, "step": 45740 }, { "epoch": 2.65, "grad_norm": 5.607432740362128e-09, "learning_rate": 0.00011761291071635899, "logits/chosen": -18.292362213134766, "logits/rejected": -19.241539001464844, "logps/chosen": -3113.187744140625, "logps/rejected": -3180.03857421875, "loss": 2.9527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -177.64833068847656, "rewards/margins": 7.36785888671875, "rewards/rejected": -185.01617431640625, "step": 45750 }, { "epoch": 2.65, "grad_norm": 48.0555305480957, "learning_rate": 0.00011741940477572662, "logits/chosen": -19.600187301635742, "logits/rejected": -19.81410026550293, "logps/chosen": -2945.922607421875, "logps/rejected": -2666.277587890625, "loss": 12.3216, "rewards/accuracies": 0.5, "rewards/chosen": -166.24171447753906, "rewards/margins": -7.483716011047363, "rewards/rejected": -158.7579803466797, "step": 45760 }, { "epoch": 2.65, "grad_norm": 0.0031106562819331884, "learning_rate": 0.00011722589883509425, "logits/chosen": -18.83817481994629, "logits/rejected": -19.040542602539062, "logps/chosen": -2845.37451171875, "logps/rejected": -2798.15283203125, "loss": 0.7231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -249.3370361328125, "rewards/margins": 3.6911940574645996, "rewards/rejected": -253.02822875976562, "step": 45770 }, { "epoch": 2.65, "grad_norm": 0.00011537015234353021, "learning_rate": 0.00011703239289446187, "logits/chosen": -15.157369613647461, "logits/rejected": -15.920048713684082, "logps/chosen": -3061.35888671875, "logps/rejected": -2351.93017578125, "loss": 1.4966, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -123.20838928222656, "rewards/margins": 11.685905456542969, "rewards/rejected": -134.894287109375, "step": 45780 }, { "epoch": 2.65, "grad_norm": 0.07898224890232086, "learning_rate": 0.00011683888695382948, "logits/chosen": -15.854455947875977, "logits/rejected": -16.951190948486328, "logps/chosen": -3032.355712890625, "logps/rejected": -2862.491943359375, "loss": 0.356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -120.9799575805664, "rewards/margins": 15.436286926269531, "rewards/rejected": -136.416259765625, "step": 45790 }, { "epoch": 2.65, "grad_norm": 106.0308609008789, "learning_rate": 0.0001166453810131971, "logits/chosen": -19.226062774658203, "logits/rejected": -20.91381072998047, "logps/chosen": -2684.72412109375, "logps/rejected": -2969.033447265625, "loss": 4.3424, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -182.4886474609375, "rewards/margins": 18.570911407470703, "rewards/rejected": -201.05953979492188, "step": 45800 }, { "epoch": 2.65, "grad_norm": 4.904139132122509e-06, "learning_rate": 0.00011645187507256473, "logits/chosen": -19.09389877319336, "logits/rejected": -18.536785125732422, "logps/chosen": -2835.079345703125, "logps/rejected": -2889.755126953125, "loss": 0.1526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -171.08737182617188, "rewards/margins": 10.613419532775879, "rewards/rejected": -181.70079040527344, "step": 45810 }, { "epoch": 2.65, "grad_norm": 0.36195483803749084, "learning_rate": 0.00011625836913193236, "logits/chosen": -19.666301727294922, "logits/rejected": -19.720365524291992, "logps/chosen": -3117.18603515625, "logps/rejected": -3117.35009765625, "loss": 0.3393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -182.96266174316406, "rewards/margins": 7.625180244445801, "rewards/rejected": -190.58786010742188, "step": 45820 }, { "epoch": 2.65, "grad_norm": 2.8690805287112653e-10, "learning_rate": 0.00011606486319129996, "logits/chosen": -19.32744026184082, "logits/rejected": -18.549001693725586, "logps/chosen": -2840.102783203125, "logps/rejected": -2415.501953125, "loss": 8.3837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -219.3545684814453, "rewards/margins": -2.29024076461792, "rewards/rejected": -217.0643310546875, "step": 45830 }, { "epoch": 2.65, "grad_norm": 22.089763641357422, "learning_rate": 0.0001158713572506676, "logits/chosen": -19.153446197509766, "logits/rejected": -19.89617919921875, "logps/chosen": -2866.94091796875, "logps/rejected": -2963.502685546875, "loss": 0.271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -236.31399536132812, "rewards/margins": 15.683792114257812, "rewards/rejected": -251.99777221679688, "step": 45840 }, { "epoch": 2.65, "grad_norm": 5.540810120289487e-14, "learning_rate": 0.00011567785131003523, "logits/chosen": -16.659414291381836, "logits/rejected": -17.753765106201172, "logps/chosen": -3123.38232421875, "logps/rejected": -3114.82763671875, "loss": 0.7368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -128.93655395507812, "rewards/margins": 14.954426765441895, "rewards/rejected": -143.8909912109375, "step": 45850 }, { "epoch": 2.65, "grad_norm": 49.27497100830078, "learning_rate": 0.00011548434536940284, "logits/chosen": -18.972169876098633, "logits/rejected": -20.64425277709961, "logps/chosen": -2846.50732421875, "logps/rejected": -2798.197509765625, "loss": 1.2818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -212.701171875, "rewards/margins": 10.774190902709961, "rewards/rejected": -223.475341796875, "step": 45860 }, { "epoch": 2.66, "grad_norm": 7.317061658795865e-07, "learning_rate": 0.00011529083942877047, "logits/chosen": -17.507221221923828, "logits/rejected": -18.41783332824707, "logps/chosen": -2966.484375, "logps/rejected": -2721.27490234375, "loss": 10.9048, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -150.96826171875, "rewards/margins": -6.2131147384643555, "rewards/rejected": -144.75515747070312, "step": 45870 }, { "epoch": 2.66, "grad_norm": 0.00345898000523448, "learning_rate": 0.00011509733348813808, "logits/chosen": -20.39549446105957, "logits/rejected": -20.84804344177246, "logps/chosen": -2640.723876953125, "logps/rejected": -2672.465087890625, "loss": 3.056, "rewards/accuracies": 0.5, "rewards/chosen": -170.45550537109375, "rewards/margins": 14.727407455444336, "rewards/rejected": -185.1829071044922, "step": 45880 }, { "epoch": 2.66, "grad_norm": 8.153055387083441e-05, "learning_rate": 0.00011490382754750571, "logits/chosen": -19.077852249145508, "logits/rejected": -19.008031845092773, "logps/chosen": -2899.43359375, "logps/rejected": -2604.263916015625, "loss": 1.4262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.6018524169922, "rewards/margins": 7.210649013519287, "rewards/rejected": -195.8125, "step": 45890 }, { "epoch": 2.66, "grad_norm": 66.52565002441406, "learning_rate": 0.00011471032160687334, "logits/chosen": -17.541860580444336, "logits/rejected": -17.18334197998047, "logps/chosen": -2768.51611328125, "logps/rejected": -3054.074462890625, "loss": 16.9707, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.88140869140625, "rewards/margins": -14.284817695617676, "rewards/rejected": -156.5966033935547, "step": 45900 }, { "epoch": 2.66, "grad_norm": 4.3317552922417235e-07, "learning_rate": 0.00011451681566624096, "logits/chosen": -19.949085235595703, "logits/rejected": -21.63367462158203, "logps/chosen": -3068.17333984375, "logps/rejected": -3250.622314453125, "loss": 0.1091, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -204.71853637695312, "rewards/margins": 21.564300537109375, "rewards/rejected": -226.2827911376953, "step": 45910 }, { "epoch": 2.66, "grad_norm": 195.2276153564453, "learning_rate": 0.00011432330972560857, "logits/chosen": -21.365171432495117, "logits/rejected": -24.915908813476562, "logps/chosen": -2817.451171875, "logps/rejected": -2348.64013671875, "loss": 14.3313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -216.01687622070312, "rewards/margins": -5.987831115722656, "rewards/rejected": -210.029052734375, "step": 45920 }, { "epoch": 2.66, "grad_norm": 1.3961572647094727, "learning_rate": 0.0001141298037849762, "logits/chosen": -21.707399368286133, "logits/rejected": -24.510961532592773, "logps/chosen": -2911.979736328125, "logps/rejected": -2844.089599609375, "loss": 2.4471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -236.98727416992188, "rewards/margins": 2.192187547683716, "rewards/rejected": -239.179443359375, "step": 45930 }, { "epoch": 2.66, "grad_norm": 0.3373767137527466, "learning_rate": 0.00011393629784434382, "logits/chosen": -19.17342185974121, "logits/rejected": -19.205062866210938, "logps/chosen": -2776.72412109375, "logps/rejected": -2640.40478515625, "loss": 14.8673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -208.2528076171875, "rewards/margins": -5.679522514343262, "rewards/rejected": -202.57325744628906, "step": 45940 }, { "epoch": 2.66, "grad_norm": 0.008508906699717045, "learning_rate": 0.00011374279190371145, "logits/chosen": -21.10394859313965, "logits/rejected": -21.592391967773438, "logps/chosen": -2619.728271484375, "logps/rejected": -2511.80908203125, "loss": 0.3407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.00979614257812, "rewards/margins": 7.992491245269775, "rewards/rejected": -168.00228881835938, "step": 45950 }, { "epoch": 2.66, "grad_norm": 0.000751459039747715, "learning_rate": 0.00011354928596307907, "logits/chosen": -18.614194869995117, "logits/rejected": -19.22527503967285, "logps/chosen": -2910.278076171875, "logps/rejected": -2621.686279296875, "loss": 3.3183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.00509643554688, "rewards/margins": 4.002371311187744, "rewards/rejected": -163.0074462890625, "step": 45960 }, { "epoch": 2.66, "grad_norm": 0.010097187012434006, "learning_rate": 0.00011335578002244669, "logits/chosen": -17.406330108642578, "logits/rejected": -18.1767635345459, "logps/chosen": -2915.212890625, "logps/rejected": -2768.770751953125, "loss": 14.5926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -217.69985961914062, "rewards/margins": -7.5315752029418945, "rewards/rejected": -210.1682586669922, "step": 45970 }, { "epoch": 2.66, "grad_norm": 0.00969569943845272, "learning_rate": 0.00011316227408181432, "logits/chosen": -17.695571899414062, "logits/rejected": -16.767192840576172, "logps/chosen": -3165.993896484375, "logps/rejected": -3103.04443359375, "loss": 4.2012, "rewards/accuracies": 0.5, "rewards/chosen": -180.71360778808594, "rewards/margins": 0.6330133676528931, "rewards/rejected": -181.34661865234375, "step": 45980 }, { "epoch": 2.66, "grad_norm": 119.2015151977539, "learning_rate": 0.00011296876814118193, "logits/chosen": -16.495189666748047, "logits/rejected": -16.714027404785156, "logps/chosen": -2502.27197265625, "logps/rejected": -2737.79052734375, "loss": 9.4523, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -154.96981811523438, "rewards/margins": -7.6944122314453125, "rewards/rejected": -147.275390625, "step": 45990 }, { "epoch": 2.66, "grad_norm": 14.159279823303223, "learning_rate": 0.00011277526220054955, "logits/chosen": -19.220333099365234, "logits/rejected": -19.532785415649414, "logps/chosen": -3050.63232421875, "logps/rejected": -2930.15576171875, "loss": 1.5474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -231.8854217529297, "rewards/margins": 4.353968143463135, "rewards/rejected": -236.2393798828125, "step": 46000 }, { "epoch": 2.66, "grad_norm": 0.8214126229286194, "learning_rate": 0.00011258175625991718, "logits/chosen": -17.56984519958496, "logits/rejected": -17.587207794189453, "logps/chosen": -2954.99853515625, "logps/rejected": -2816.810546875, "loss": 0.6819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -219.07699584960938, "rewards/margins": 9.893537521362305, "rewards/rejected": -228.9705352783203, "step": 46010 }, { "epoch": 2.66, "grad_norm": 87.15876770019531, "learning_rate": 0.0001123882503192848, "logits/chosen": -18.366863250732422, "logits/rejected": -18.27608299255371, "logps/chosen": -3020.56787109375, "logps/rejected": -2810.79736328125, "loss": 3.8785, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -165.88360595703125, "rewards/margins": 0.752188503742218, "rewards/rejected": -166.63580322265625, "step": 46020 }, { "epoch": 2.66, "grad_norm": 0.16616298258304596, "learning_rate": 0.00011219474437865243, "logits/chosen": -16.634450912475586, "logits/rejected": -16.772916793823242, "logps/chosen": -2817.39453125, "logps/rejected": -2701.12841796875, "loss": 2.2567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.68687438964844, "rewards/margins": 8.935368537902832, "rewards/rejected": -153.6222381591797, "step": 46030 }, { "epoch": 2.66, "grad_norm": 0.8556159734725952, "learning_rate": 0.00011200123843802006, "logits/chosen": -18.714290618896484, "logits/rejected": -18.967090606689453, "logps/chosen": -2583.63818359375, "logps/rejected": -2492.53466796875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -168.45321655273438, "rewards/margins": 10.923861503601074, "rewards/rejected": -179.37709045410156, "step": 46040 }, { "epoch": 2.67, "grad_norm": 103.40438842773438, "learning_rate": 0.00011180773249738766, "logits/chosen": -18.195083618164062, "logits/rejected": -20.739116668701172, "logps/chosen": -2813.254638671875, "logps/rejected": -2668.522216796875, "loss": 0.1869, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -162.52145385742188, "rewards/margins": 11.182107925415039, "rewards/rejected": -173.70358276367188, "step": 46050 }, { "epoch": 2.67, "grad_norm": 29.861526489257812, "learning_rate": 0.0001116142265567553, "logits/chosen": -16.643383026123047, "logits/rejected": -16.789806365966797, "logps/chosen": -2865.356201171875, "logps/rejected": -2871.661376953125, "loss": 1.8801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.23243713378906, "rewards/margins": 7.341300964355469, "rewards/rejected": -167.57374572753906, "step": 46060 }, { "epoch": 2.67, "grad_norm": 96.58026885986328, "learning_rate": 0.00011142072061612293, "logits/chosen": -20.31916046142578, "logits/rejected": -20.845590591430664, "logps/chosen": -2808.900390625, "logps/rejected": -2790.658447265625, "loss": 9.3874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -215.8798828125, "rewards/margins": -2.5595169067382812, "rewards/rejected": -213.3203582763672, "step": 46070 }, { "epoch": 2.67, "grad_norm": 5.704786090347924e-17, "learning_rate": 0.00011122721467549054, "logits/chosen": -19.4349308013916, "logits/rejected": -20.36288833618164, "logps/chosen": -3283.61181640625, "logps/rejected": -2880.712890625, "loss": 10.8135, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -210.7382049560547, "rewards/margins": -0.7122093439102173, "rewards/rejected": -210.0260009765625, "step": 46080 }, { "epoch": 2.67, "grad_norm": 8.696243323971092e-14, "learning_rate": 0.00011103370873485816, "logits/chosen": -16.487529754638672, "logits/rejected": -16.57120132446289, "logps/chosen": -3231.19775390625, "logps/rejected": -2422.20458984375, "loss": 3.2032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.5223388671875, "rewards/margins": 21.67250633239746, "rewards/rejected": -190.19483947753906, "step": 46090 }, { "epoch": 2.67, "grad_norm": 0.0, "learning_rate": 0.00011084020279422578, "logits/chosen": -19.070459365844727, "logits/rejected": -20.191356658935547, "logps/chosen": -2674.514404296875, "logps/rejected": -2810.50439453125, "loss": 0.4192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -230.14328002929688, "rewards/margins": 23.799562454223633, "rewards/rejected": -253.94284057617188, "step": 46100 }, { "epoch": 2.67, "grad_norm": 13.263559341430664, "learning_rate": 0.00011064669685359341, "logits/chosen": -16.27805519104004, "logits/rejected": -16.843109130859375, "logps/chosen": -3034.58349609375, "logps/rejected": -2888.421630859375, "loss": 6.7829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -123.1821060180664, "rewards/margins": 2.2120070457458496, "rewards/rejected": -125.39411926269531, "step": 46110 }, { "epoch": 2.67, "grad_norm": 29.850311279296875, "learning_rate": 0.00011045319091296104, "logits/chosen": -19.926767349243164, "logits/rejected": -19.641071319580078, "logps/chosen": -2891.11572265625, "logps/rejected": -2852.967529296875, "loss": 3.3037, "rewards/accuracies": 0.5, "rewards/chosen": -184.78134155273438, "rewards/margins": 11.247831344604492, "rewards/rejected": -196.0291748046875, "step": 46120 }, { "epoch": 2.67, "grad_norm": 2.421742919977987e-06, "learning_rate": 0.00011025968497232864, "logits/chosen": -19.98491668701172, "logits/rejected": -19.737293243408203, "logps/chosen": -2954.352294921875, "logps/rejected": -2375.07080078125, "loss": 3.004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.22604370117188, "rewards/margins": 13.35266399383545, "rewards/rejected": -173.57870483398438, "step": 46130 }, { "epoch": 2.67, "grad_norm": 98.73473358154297, "learning_rate": 0.00011006617903169627, "logits/chosen": -18.184551239013672, "logits/rejected": -20.408103942871094, "logps/chosen": -3255.130126953125, "logps/rejected": -2992.72314453125, "loss": 1.18, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -217.98800659179688, "rewards/margins": 12.086946487426758, "rewards/rejected": -230.074951171875, "step": 46140 }, { "epoch": 2.67, "grad_norm": 0.0331384502351284, "learning_rate": 0.0001098726730910639, "logits/chosen": -18.11789321899414, "logits/rejected": -19.22968101501465, "logps/chosen": -2592.962890625, "logps/rejected": -2360.285888671875, "loss": 25.1284, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -191.34927368164062, "rewards/margins": -19.270832061767578, "rewards/rejected": -172.07846069335938, "step": 46150 }, { "epoch": 2.67, "grad_norm": 2.1226769604254514e-06, "learning_rate": 0.00010967916715043152, "logits/chosen": -17.5955867767334, "logits/rejected": -20.388463973999023, "logps/chosen": -2635.99853515625, "logps/rejected": -2348.43798828125, "loss": 2.4169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -171.15054321289062, "rewards/margins": 4.945728302001953, "rewards/rejected": -176.09628295898438, "step": 46160 }, { "epoch": 2.67, "grad_norm": 66.30364990234375, "learning_rate": 0.00010948566120979914, "logits/chosen": -15.932289123535156, "logits/rejected": -16.415332794189453, "logps/chosen": -2800.22607421875, "logps/rejected": -2447.2119140625, "loss": 9.9893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -93.72087860107422, "rewards/margins": 0.40591126680374146, "rewards/rejected": -94.12678527832031, "step": 46170 }, { "epoch": 2.67, "grad_norm": 1.3670270471166646e-09, "learning_rate": 0.00010929215526916676, "logits/chosen": -19.717321395874023, "logits/rejected": -20.7312068939209, "logps/chosen": -2669.615478515625, "logps/rejected": -2586.64306640625, "loss": 16.9984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.33120727539062, "rewards/margins": -2.030147075653076, "rewards/rejected": -178.30105590820312, "step": 46180 }, { "epoch": 2.67, "grad_norm": 11.166942596435547, "learning_rate": 0.00010909864932853439, "logits/chosen": -17.646007537841797, "logits/rejected": -18.250232696533203, "logps/chosen": -3231.09619140625, "logps/rejected": -2828.453125, "loss": 0.2741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -141.67617797851562, "rewards/margins": 13.501620292663574, "rewards/rejected": -155.1777801513672, "step": 46190 }, { "epoch": 2.67, "grad_norm": 0.0004666132153943181, "learning_rate": 0.00010890514338790202, "logits/chosen": -18.29136085510254, "logits/rejected": -19.258567810058594, "logps/chosen": -2963.397216796875, "logps/rejected": -2897.578125, "loss": 18.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -160.47894287109375, "rewards/margins": -5.5393829345703125, "rewards/rejected": -154.93954467773438, "step": 46200 }, { "epoch": 2.67, "grad_norm": 332.13250732421875, "learning_rate": 0.00010871163744726963, "logits/chosen": -15.626383781433105, "logits/rejected": -16.228519439697266, "logps/chosen": -2577.55029296875, "logps/rejected": -2500.91357421875, "loss": 6.16, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.01573181152344, "rewards/margins": 2.965942859649658, "rewards/rejected": -146.98167419433594, "step": 46210 }, { "epoch": 2.68, "grad_norm": 6.767464947188273e-05, "learning_rate": 0.00010851813150663725, "logits/chosen": -15.092859268188477, "logits/rejected": -16.30124282836914, "logps/chosen": -2958.79541015625, "logps/rejected": -2711.62109375, "loss": 5.2377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.16253662109375, "rewards/margins": 4.426072120666504, "rewards/rejected": -138.58860778808594, "step": 46220 }, { "epoch": 2.68, "grad_norm": 0.00022041639022063464, "learning_rate": 0.00010832462556600488, "logits/chosen": -16.596715927124023, "logits/rejected": -18.150976181030273, "logps/chosen": -3068.451171875, "logps/rejected": -2999.74365234375, "loss": 1.0113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -199.17515563964844, "rewards/margins": 7.409201145172119, "rewards/rejected": -206.5843505859375, "step": 46230 }, { "epoch": 2.68, "grad_norm": 24.707197189331055, "learning_rate": 0.0001081311196253725, "logits/chosen": -18.672807693481445, "logits/rejected": -21.853160858154297, "logps/chosen": -2742.03564453125, "logps/rejected": -2854.43701171875, "loss": 3.2181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -218.64260864257812, "rewards/margins": 6.592134952545166, "rewards/rejected": -225.2347412109375, "step": 46240 }, { "epoch": 2.68, "grad_norm": 5.771310497948434e-06, "learning_rate": 0.00010793761368474013, "logits/chosen": -15.004135131835938, "logits/rejected": -15.269430160522461, "logps/chosen": -3358.926513671875, "logps/rejected": -3069.478271484375, "loss": 1.9649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -90.29978942871094, "rewards/margins": 7.87540340423584, "rewards/rejected": -98.17518615722656, "step": 46250 }, { "epoch": 2.68, "grad_norm": 4.357956886291504, "learning_rate": 0.00010774410774410775, "logits/chosen": -19.827417373657227, "logits/rejected": -20.09494972229004, "logps/chosen": -2786.43212890625, "logps/rejected": -2854.650390625, "loss": 1.4562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -174.6587371826172, "rewards/margins": 9.466936111450195, "rewards/rejected": -184.1256866455078, "step": 46260 }, { "epoch": 2.68, "grad_norm": 0.0008711421978659928, "learning_rate": 0.00010755060180347536, "logits/chosen": -17.384777069091797, "logits/rejected": -19.034440994262695, "logps/chosen": -2624.217041015625, "logps/rejected": -2487.02783203125, "loss": 1.607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.94627380371094, "rewards/margins": 7.9922590255737305, "rewards/rejected": -161.93853759765625, "step": 46270 }, { "epoch": 2.68, "grad_norm": 0.07163587212562561, "learning_rate": 0.000107357095862843, "logits/chosen": -18.57179069519043, "logits/rejected": -18.270261764526367, "logps/chosen": -2663.833740234375, "logps/rejected": -2835.6328125, "loss": 2.9581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -198.6416778564453, "rewards/margins": 8.08765697479248, "rewards/rejected": -206.72933959960938, "step": 46280 }, { "epoch": 2.68, "grad_norm": 0.10407274961471558, "learning_rate": 0.00010716358992221061, "logits/chosen": -15.398675918579102, "logits/rejected": -15.972894668579102, "logps/chosen": -3083.3037109375, "logps/rejected": -2982.42333984375, "loss": 2.8787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.2501678466797, "rewards/margins": 3.646826982498169, "rewards/rejected": -164.89700317382812, "step": 46290 }, { "epoch": 2.68, "grad_norm": 0.009534459561109543, "learning_rate": 0.00010697008398157823, "logits/chosen": -18.063297271728516, "logits/rejected": -18.02138328552246, "logps/chosen": -2296.620361328125, "logps/rejected": -2467.132568359375, "loss": 2.1687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -176.08473205566406, "rewards/margins": 4.13156795501709, "rewards/rejected": -180.21629333496094, "step": 46300 }, { "epoch": 2.68, "grad_norm": 74.41783142089844, "learning_rate": 0.00010677657804094586, "logits/chosen": -19.161277770996094, "logits/rejected": -18.769256591796875, "logps/chosen": -2581.65380859375, "logps/rejected": -2573.391357421875, "loss": 2.4381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -174.09486389160156, "rewards/margins": 3.093782901763916, "rewards/rejected": -177.18865966796875, "step": 46310 }, { "epoch": 2.68, "grad_norm": 70.1304931640625, "learning_rate": 0.00010658307210031348, "logits/chosen": -16.43465805053711, "logits/rejected": -17.029544830322266, "logps/chosen": -2662.841796875, "logps/rejected": -2449.91845703125, "loss": 2.6488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -162.8777313232422, "rewards/margins": 1.3137527704238892, "rewards/rejected": -164.1914825439453, "step": 46320 }, { "epoch": 2.68, "grad_norm": 290.5907287597656, "learning_rate": 0.00010638956615968111, "logits/chosen": -17.247093200683594, "logits/rejected": -17.426376342773438, "logps/chosen": -2885.099853515625, "logps/rejected": -2871.33837890625, "loss": 20.411, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -170.6077117919922, "rewards/margins": -10.24026870727539, "rewards/rejected": -160.36744689941406, "step": 46330 }, { "epoch": 2.68, "grad_norm": 0.7818731665611267, "learning_rate": 0.00010619606021904873, "logits/chosen": -15.090448379516602, "logits/rejected": -16.002662658691406, "logps/chosen": -3047.2236328125, "logps/rejected": -2910.63427734375, "loss": 6.0533, "rewards/accuracies": 0.5, "rewards/chosen": -127.45179748535156, "rewards/margins": 0.5066299438476562, "rewards/rejected": -127.95841217041016, "step": 46340 }, { "epoch": 2.68, "grad_norm": 5.723686990677379e-05, "learning_rate": 0.00010600255427841634, "logits/chosen": -16.971101760864258, "logits/rejected": -17.609067916870117, "logps/chosen": -2887.135009765625, "logps/rejected": -2785.62255859375, "loss": 4.1178, "rewards/accuracies": 0.5, "rewards/chosen": -219.62545776367188, "rewards/margins": 2.167105197906494, "rewards/rejected": -221.79257202148438, "step": 46350 }, { "epoch": 2.68, "grad_norm": 49.73131561279297, "learning_rate": 0.00010580904833778397, "logits/chosen": -21.242040634155273, "logits/rejected": -21.44698143005371, "logps/chosen": -2857.448974609375, "logps/rejected": -3023.6142578125, "loss": 2.3854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.59417724609375, "rewards/margins": 8.198570251464844, "rewards/rejected": -195.79273986816406, "step": 46360 }, { "epoch": 2.68, "grad_norm": 0.02901977300643921, "learning_rate": 0.0001056155423971516, "logits/chosen": -17.75381088256836, "logits/rejected": -18.91513442993164, "logps/chosen": -2957.478271484375, "logps/rejected": -2967.571044921875, "loss": 3.0898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.8866729736328, "rewards/margins": 7.870388984680176, "rewards/rejected": -176.75704956054688, "step": 46370 }, { "epoch": 2.68, "grad_norm": 1.4646692761743907e-05, "learning_rate": 0.00010542203645651922, "logits/chosen": -18.314714431762695, "logits/rejected": -20.110309600830078, "logps/chosen": -2574.3857421875, "logps/rejected": -2258.19482421875, "loss": 12.964, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -183.0651092529297, "rewards/margins": -9.840795516967773, "rewards/rejected": -173.2243194580078, "step": 46380 }, { "epoch": 2.69, "grad_norm": 66.65823364257812, "learning_rate": 0.00010522853051588684, "logits/chosen": -17.130369186401367, "logits/rejected": -18.09638786315918, "logps/chosen": -2875.090087890625, "logps/rejected": -2817.692138671875, "loss": 0.8415, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -209.14956665039062, "rewards/margins": 14.239148139953613, "rewards/rejected": -223.3887176513672, "step": 46390 }, { "epoch": 2.69, "grad_norm": 69.52952575683594, "learning_rate": 0.00010503502457525446, "logits/chosen": -15.651690483093262, "logits/rejected": -15.89640998840332, "logps/chosen": -2935.96923828125, "logps/rejected": -2643.89892578125, "loss": 4.7935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -150.60494995117188, "rewards/margins": 2.242013454437256, "rewards/rejected": -152.84695434570312, "step": 46400 }, { "epoch": 2.69, "grad_norm": 0.0, "learning_rate": 0.00010484151863462209, "logits/chosen": -17.309133529663086, "logits/rejected": -17.368093490600586, "logps/chosen": -3083.5068359375, "logps/rejected": -3127.28173828125, "loss": 1.0676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -188.87527465820312, "rewards/margins": 11.322972297668457, "rewards/rejected": -200.19822692871094, "step": 46410 }, { "epoch": 2.69, "grad_norm": 0.005639415234327316, "learning_rate": 0.00010464801269398972, "logits/chosen": -17.355579376220703, "logits/rejected": -17.786731719970703, "logps/chosen": -2692.095947265625, "logps/rejected": -2761.75439453125, "loss": 5.0931, "rewards/accuracies": 0.5, "rewards/chosen": -176.1258544921875, "rewards/margins": 1.99271559715271, "rewards/rejected": -178.1185760498047, "step": 46420 }, { "epoch": 2.69, "grad_norm": 0.40236908197402954, "learning_rate": 0.00010445450675335732, "logits/chosen": -17.592121124267578, "logits/rejected": -19.180355072021484, "logps/chosen": -2799.752685546875, "logps/rejected": -2987.3984375, "loss": 4.761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -175.62387084960938, "rewards/margins": 12.979089736938477, "rewards/rejected": -188.60296630859375, "step": 46430 }, { "epoch": 2.69, "grad_norm": 5.6178096201620065e-06, "learning_rate": 0.00010426100081272495, "logits/chosen": -18.0841007232666, "logits/rejected": -17.840816497802734, "logps/chosen": -3193.75146484375, "logps/rejected": -3050.615478515625, "loss": 2.3252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.73324584960938, "rewards/margins": 5.48183536529541, "rewards/rejected": -210.215087890625, "step": 46440 }, { "epoch": 2.69, "grad_norm": 1.502797886132612e-07, "learning_rate": 0.00010406749487209258, "logits/chosen": -16.41653060913086, "logits/rejected": -16.41274642944336, "logps/chosen": -2853.400634765625, "logps/rejected": -2453.16748046875, "loss": 4.6125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -133.42689514160156, "rewards/margins": 9.156976699829102, "rewards/rejected": -142.5838623046875, "step": 46450 }, { "epoch": 2.69, "grad_norm": 0.02215106412768364, "learning_rate": 0.0001038739889314602, "logits/chosen": -16.564239501953125, "logits/rejected": -17.090456008911133, "logps/chosen": -2945.3564453125, "logps/rejected": -2891.45068359375, "loss": 0.6922, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -150.53085327148438, "rewards/margins": 8.982272148132324, "rewards/rejected": -159.5131072998047, "step": 46460 }, { "epoch": 2.69, "grad_norm": 99.54556274414062, "learning_rate": 0.00010368048299082782, "logits/chosen": -18.926132202148438, "logits/rejected": -21.012205123901367, "logps/chosen": -2981.3095703125, "logps/rejected": -2905.8427734375, "loss": 0.8561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.30210876464844, "rewards/margins": 8.91778564453125, "rewards/rejected": -203.2198944091797, "step": 46470 }, { "epoch": 2.69, "grad_norm": 0.017834579572081566, "learning_rate": 0.00010348697705019545, "logits/chosen": -17.508167266845703, "logits/rejected": -18.1572265625, "logps/chosen": -2954.991455078125, "logps/rejected": -3004.386962890625, "loss": 1.1569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -153.1527099609375, "rewards/margins": 22.862346649169922, "rewards/rejected": -176.01504516601562, "step": 46480 }, { "epoch": 2.69, "grad_norm": 0.004868598654866219, "learning_rate": 0.00010329347110956306, "logits/chosen": -18.086246490478516, "logits/rejected": -17.653173446655273, "logps/chosen": -2978.27490234375, "logps/rejected": -2847.03076171875, "loss": 1.8672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.81320190429688, "rewards/margins": 8.930707931518555, "rewards/rejected": -203.74391174316406, "step": 46490 }, { "epoch": 2.69, "grad_norm": 0.0014549213228747249, "learning_rate": 0.0001030999651689307, "logits/chosen": -14.875439643859863, "logits/rejected": -15.743242263793945, "logps/chosen": -3227.240234375, "logps/rejected": -2685.72998046875, "loss": 5.2421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -114.74516296386719, "rewards/margins": 14.336097717285156, "rewards/rejected": -129.08126831054688, "step": 46500 }, { "epoch": 2.69, "grad_norm": 72.42445373535156, "learning_rate": 0.0001029064592282983, "logits/chosen": -16.964420318603516, "logits/rejected": -17.863290786743164, "logps/chosen": -3148.740234375, "logps/rejected": -3103.21337890625, "loss": 1.6695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -164.9805145263672, "rewards/margins": 5.989813804626465, "rewards/rejected": -170.97032165527344, "step": 46510 }, { "epoch": 2.69, "grad_norm": 2.0512557341589854e-07, "learning_rate": 0.00010271295328766593, "logits/chosen": -19.217761993408203, "logits/rejected": -20.787458419799805, "logps/chosen": -2563.864501953125, "logps/rejected": -2317.893798828125, "loss": 1.9098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -201.55514526367188, "rewards/margins": 9.84068489074707, "rewards/rejected": -211.3957977294922, "step": 46520 }, { "epoch": 2.69, "grad_norm": 0.3586861491203308, "learning_rate": 0.00010251944734703356, "logits/chosen": -15.772378921508789, "logits/rejected": -15.428057670593262, "logps/chosen": -3119.27978515625, "logps/rejected": -2895.27197265625, "loss": 0.9712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -158.3206024169922, "rewards/margins": 12.899897575378418, "rewards/rejected": -171.22048950195312, "step": 46530 }, { "epoch": 2.69, "grad_norm": 46.838626861572266, "learning_rate": 0.00010232594140640118, "logits/chosen": -17.236907958984375, "logits/rejected": -17.34890365600586, "logps/chosen": -2759.990478515625, "logps/rejected": -2893.512451171875, "loss": 2.8431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.11090087890625, "rewards/margins": 4.330853462219238, "rewards/rejected": -145.4417724609375, "step": 46540 }, { "epoch": 2.69, "grad_norm": 30.4646053314209, "learning_rate": 0.0001021324354657688, "logits/chosen": -18.608518600463867, "logits/rejected": -18.65777587890625, "logps/chosen": -2870.824951171875, "logps/rejected": -2954.057861328125, "loss": 2.6865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -214.9339141845703, "rewards/margins": 6.9000139236450195, "rewards/rejected": -221.8339385986328, "step": 46550 }, { "epoch": 2.7, "grad_norm": 2.306509494781494, "learning_rate": 0.00010193892952513643, "logits/chosen": -21.223459243774414, "logits/rejected": -22.189306259155273, "logps/chosen": -2718.896240234375, "logps/rejected": -2513.318115234375, "loss": 5.6463, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -214.86026000976562, "rewards/margins": -1.51445734500885, "rewards/rejected": -213.34579467773438, "step": 46560 }, { "epoch": 2.7, "grad_norm": 13.9459228515625, "learning_rate": 0.00010174542358450404, "logits/chosen": -19.372507095336914, "logits/rejected": -21.284351348876953, "logps/chosen": -3046.794189453125, "logps/rejected": -3088.32666015625, "loss": 1.7342, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -236.095458984375, "rewards/margins": 10.5126314163208, "rewards/rejected": -246.6081085205078, "step": 46570 }, { "epoch": 2.7, "grad_norm": 1.0622061491012573, "learning_rate": 0.00010155191764387167, "logits/chosen": -17.79758071899414, "logits/rejected": -18.18631935119629, "logps/chosen": -2945.664794921875, "logps/rejected": -2948.9404296875, "loss": 3.2844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -214.1649169921875, "rewards/margins": 3.6462676525115967, "rewards/rejected": -217.8112030029297, "step": 46580 }, { "epoch": 2.7, "grad_norm": 2.8321215594928617e-08, "learning_rate": 0.00010135841170323929, "logits/chosen": -16.39462661743164, "logits/rejected": -16.808359146118164, "logps/chosen": -2977.458984375, "logps/rejected": -2916.326171875, "loss": 6.2827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.0830535888672, "rewards/margins": 4.6940813064575195, "rewards/rejected": -201.77713012695312, "step": 46590 }, { "epoch": 2.7, "grad_norm": 55.0532112121582, "learning_rate": 0.00010116490576260691, "logits/chosen": -15.597017288208008, "logits/rejected": -16.410261154174805, "logps/chosen": -3156.93798828125, "logps/rejected": -2863.463134765625, "loss": 16.1322, "rewards/accuracies": 0.5, "rewards/chosen": -172.67544555664062, "rewards/margins": -6.670193672180176, "rewards/rejected": -166.00526428222656, "step": 46600 }, { "epoch": 2.7, "grad_norm": 2.409732223895844e-05, "learning_rate": 0.00010097139982197454, "logits/chosen": -17.0760555267334, "logits/rejected": -17.45660400390625, "logps/chosen": -2674.526611328125, "logps/rejected": -2577.03271484375, "loss": 2.9817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -161.56101989746094, "rewards/margins": 3.782989501953125, "rewards/rejected": -165.34402465820312, "step": 46610 }, { "epoch": 2.7, "grad_norm": 32.51351547241211, "learning_rate": 0.00010077789388134216, "logits/chosen": -16.958250045776367, "logits/rejected": -17.613134384155273, "logps/chosen": -3258.8662109375, "logps/rejected": -2966.767578125, "loss": 5.1883, "rewards/accuracies": 0.5, "rewards/chosen": -153.5772705078125, "rewards/margins": 0.6313356161117554, "rewards/rejected": -154.20860290527344, "step": 46620 }, { "epoch": 2.7, "grad_norm": 5.878363026567968e-06, "learning_rate": 0.00010058438794070979, "logits/chosen": -18.057159423828125, "logits/rejected": -19.997331619262695, "logps/chosen": -3043.94580078125, "logps/rejected": -2523.800048828125, "loss": 18.9787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -151.11976623535156, "rewards/margins": -2.656522274017334, "rewards/rejected": -148.4632568359375, "step": 46630 }, { "epoch": 2.7, "grad_norm": 0.09137076139450073, "learning_rate": 0.0001003908820000774, "logits/chosen": -16.7896785736084, "logits/rejected": -16.845260620117188, "logps/chosen": -2939.9326171875, "logps/rejected": -3050.25634765625, "loss": 4.2503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -227.1961669921875, "rewards/margins": 0.01792297326028347, "rewards/rejected": -227.2140655517578, "step": 46640 }, { "epoch": 2.7, "grad_norm": 228.8798370361328, "learning_rate": 0.00010019737605944502, "logits/chosen": -16.90158462524414, "logits/rejected": -17.538158416748047, "logps/chosen": -2716.046142578125, "logps/rejected": -2618.57177734375, "loss": 7.4616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -230.1680450439453, "rewards/margins": -1.7112144231796265, "rewards/rejected": -228.4568328857422, "step": 46650 }, { "epoch": 2.7, "grad_norm": 4.335697667556815e-08, "learning_rate": 0.00010000387011881265, "logits/chosen": -14.99681568145752, "logits/rejected": -15.395368576049805, "logps/chosen": -3114.07666015625, "logps/rejected": -2763.371337890625, "loss": 0.6835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -119.18504333496094, "rewards/margins": 13.417895317077637, "rewards/rejected": -132.60293579101562, "step": 46660 }, { "epoch": 2.7, "grad_norm": 2.2319777599477675e-06, "learning_rate": 9.981036417818028e-05, "logits/chosen": -15.625581741333008, "logits/rejected": -16.380107879638672, "logps/chosen": -3256.87158203125, "logps/rejected": -2795.036865234375, "loss": 6.3102, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -106.40962982177734, "rewards/margins": 9.055540084838867, "rewards/rejected": -115.4651870727539, "step": 46670 }, { "epoch": 2.7, "grad_norm": 0.0026571694761514664, "learning_rate": 9.961685823754789e-05, "logits/chosen": -18.495290756225586, "logits/rejected": -20.131103515625, "logps/chosen": -3023.225341796875, "logps/rejected": -3044.53173828125, "loss": 3.4577, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -226.2761993408203, "rewards/margins": 8.610950469970703, "rewards/rejected": -234.8871307373047, "step": 46680 }, { "epoch": 2.7, "grad_norm": 42.9807014465332, "learning_rate": 9.942335229691552e-05, "logits/chosen": -17.07271957397461, "logits/rejected": -17.165935516357422, "logps/chosen": -2990.85595703125, "logps/rejected": -3171.95556640625, "loss": 5.3037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.29641723632812, "rewards/margins": 0.43370017409324646, "rewards/rejected": -134.73013305664062, "step": 46690 }, { "epoch": 2.7, "grad_norm": 90.0389633178711, "learning_rate": 9.922984635628313e-05, "logits/chosen": -17.22553825378418, "logits/rejected": -18.063617706298828, "logps/chosen": -3116.22705078125, "logps/rejected": -3017.147705078125, "loss": 5.5786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.22317504882812, "rewards/margins": 5.087296962738037, "rewards/rejected": -137.31045532226562, "step": 46700 }, { "epoch": 2.7, "grad_norm": 0.00034155469620600343, "learning_rate": 9.903634041565076e-05, "logits/chosen": -18.054126739501953, "logits/rejected": -18.680511474609375, "logps/chosen": -2822.98876953125, "logps/rejected": -2564.807861328125, "loss": 0.4675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -170.26358032226562, "rewards/margins": 16.408845901489258, "rewards/rejected": -186.6724090576172, "step": 46710 }, { "epoch": 2.7, "grad_norm": 2.4334872224945903e-12, "learning_rate": 9.884283447501838e-05, "logits/chosen": -19.50316047668457, "logits/rejected": -20.773704528808594, "logps/chosen": -2457.788818359375, "logps/rejected": -2431.64990234375, "loss": 1.8824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -191.7957763671875, "rewards/margins": 7.974970817565918, "rewards/rejected": -199.77072143554688, "step": 46720 }, { "epoch": 2.7, "grad_norm": 0.014151734299957752, "learning_rate": 9.8649328534386e-05, "logits/chosen": -17.695783615112305, "logits/rejected": -19.22191619873047, "logps/chosen": -2643.3310546875, "logps/rejected": -2492.318603515625, "loss": 0.3725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -137.61337280273438, "rewards/margins": 13.846014022827148, "rewards/rejected": -151.45938110351562, "step": 46730 }, { "epoch": 2.71, "grad_norm": 1.3236893525725435e-10, "learning_rate": 9.845582259375363e-05, "logits/chosen": -16.078094482421875, "logits/rejected": -15.863143920898438, "logps/chosen": -2979.645751953125, "logps/rejected": -2630.701904296875, "loss": 5.5305, "rewards/accuracies": 0.5, "rewards/chosen": -128.32618713378906, "rewards/margins": 9.991689682006836, "rewards/rejected": -138.31787109375, "step": 46740 }, { "epoch": 2.71, "grad_norm": 0.0005451945471577346, "learning_rate": 9.826231665312126e-05, "logits/chosen": -19.258617401123047, "logits/rejected": -20.19265365600586, "logps/chosen": -2806.531982421875, "logps/rejected": -2858.31787109375, "loss": 0.4364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -209.99978637695312, "rewards/margins": 11.07597541809082, "rewards/rejected": -221.0757293701172, "step": 46750 }, { "epoch": 2.71, "grad_norm": 0.00028429250232875347, "learning_rate": 9.806881071248888e-05, "logits/chosen": -20.443355560302734, "logits/rejected": -22.17592430114746, "logps/chosen": -3091.39697265625, "logps/rejected": -2810.955322265625, "loss": 2.9317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.9973907470703, "rewards/margins": 8.397673606872559, "rewards/rejected": -179.39505004882812, "step": 46760 }, { "epoch": 2.71, "grad_norm": 308.9090270996094, "learning_rate": 9.78753047718565e-05, "logits/chosen": -15.788190841674805, "logits/rejected": -15.65893840789795, "logps/chosen": -2912.780029296875, "logps/rejected": -2703.28271484375, "loss": 6.2717, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -147.20028686523438, "rewards/margins": -2.179064989089966, "rewards/rejected": -145.021240234375, "step": 46770 }, { "epoch": 2.71, "grad_norm": 0.2947498559951782, "learning_rate": 9.768179883122413e-05, "logits/chosen": -16.88890266418457, "logits/rejected": -17.336624145507812, "logps/chosen": -2653.221435546875, "logps/rejected": -2578.759033203125, "loss": 3.9802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -111.02738189697266, "rewards/margins": 9.229902267456055, "rewards/rejected": -120.25728607177734, "step": 46780 }, { "epoch": 2.71, "grad_norm": 0.06777416169643402, "learning_rate": 9.748829289059174e-05, "logits/chosen": -18.90350341796875, "logits/rejected": -18.473529815673828, "logps/chosen": -2720.208740234375, "logps/rejected": -2662.145263671875, "loss": 0.4849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.8153533935547, "rewards/margins": 13.407205581665039, "rewards/rejected": -218.22256469726562, "step": 46790 }, { "epoch": 2.71, "grad_norm": 105.83768463134766, "learning_rate": 9.729478694995937e-05, "logits/chosen": -18.924074172973633, "logits/rejected": -18.917984008789062, "logps/chosen": -2883.607177734375, "logps/rejected": -2767.81591796875, "loss": 4.0741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -137.4510498046875, "rewards/margins": 0.4640413224697113, "rewards/rejected": -137.9150848388672, "step": 46800 }, { "epoch": 2.71, "grad_norm": 0.008671843446791172, "learning_rate": 9.710128100932698e-05, "logits/chosen": -19.343578338623047, "logits/rejected": -19.24422264099121, "logps/chosen": -3236.971923828125, "logps/rejected": -3338.13720703125, "loss": 3.993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.1547088623047, "rewards/margins": 9.382128715515137, "rewards/rejected": -156.53684997558594, "step": 46810 }, { "epoch": 2.71, "grad_norm": 56.715484619140625, "learning_rate": 9.690777506869461e-05, "logits/chosen": -20.340030670166016, "logits/rejected": -22.032604217529297, "logps/chosen": -2437.628662109375, "logps/rejected": -2287.856201171875, "loss": 2.8112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -167.7366485595703, "rewards/margins": 2.2605538368225098, "rewards/rejected": -169.99722290039062, "step": 46820 }, { "epoch": 2.71, "grad_norm": 78.07473754882812, "learning_rate": 9.671426912806224e-05, "logits/chosen": -17.633678436279297, "logits/rejected": -17.817901611328125, "logps/chosen": -2926.43359375, "logps/rejected": -3080.650146484375, "loss": 5.5851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.3411407470703, "rewards/margins": 2.0142064094543457, "rewards/rejected": -163.3553466796875, "step": 46830 }, { "epoch": 2.71, "grad_norm": 141.41043090820312, "learning_rate": 9.652076318742986e-05, "logits/chosen": -16.726381301879883, "logits/rejected": -16.265674591064453, "logps/chosen": -2696.701171875, "logps/rejected": -2482.532958984375, "loss": 12.7732, "rewards/accuracies": 0.5, "rewards/chosen": -164.00677490234375, "rewards/margins": -7.074586391448975, "rewards/rejected": -156.9322052001953, "step": 46840 }, { "epoch": 2.71, "grad_norm": 86.14472961425781, "learning_rate": 9.632725724679747e-05, "logits/chosen": -17.454282760620117, "logits/rejected": -18.83640480041504, "logps/chosen": -2999.435791015625, "logps/rejected": -3168.2978515625, "loss": 5.0239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -184.62062072753906, "rewards/margins": 2.8987603187561035, "rewards/rejected": -187.51937866210938, "step": 46850 }, { "epoch": 2.71, "grad_norm": 130.04861450195312, "learning_rate": 9.61337513061651e-05, "logits/chosen": -15.975921630859375, "logits/rejected": -16.599308013916016, "logps/chosen": -2500.61376953125, "logps/rejected": -1927.85546875, "loss": 27.4032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -165.63214111328125, "rewards/margins": -22.626190185546875, "rewards/rejected": -143.00595092773438, "step": 46860 }, { "epoch": 2.71, "grad_norm": 64.08805847167969, "learning_rate": 9.594024536553272e-05, "logits/chosen": -18.127622604370117, "logits/rejected": -18.746437072753906, "logps/chosen": -2385.577392578125, "logps/rejected": -2476.48583984375, "loss": 2.3302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.34613037109375, "rewards/margins": 10.833535194396973, "rewards/rejected": -190.17967224121094, "step": 46870 }, { "epoch": 2.71, "grad_norm": 2.2462193216554738e-10, "learning_rate": 9.574673942490035e-05, "logits/chosen": -17.377735137939453, "logits/rejected": -18.179309844970703, "logps/chosen": -2855.1865234375, "logps/rejected": -2752.628173828125, "loss": 11.9729, "rewards/accuracies": 0.5, "rewards/chosen": -180.8517608642578, "rewards/margins": 7.611991882324219, "rewards/rejected": -188.46376037597656, "step": 46880 }, { "epoch": 2.71, "grad_norm": 0.001891492516733706, "learning_rate": 9.555323348426796e-05, "logits/chosen": -15.190340042114258, "logits/rejected": -15.375805854797363, "logps/chosen": -3150.67333984375, "logps/rejected": -2841.519775390625, "loss": 5.5511, "rewards/accuracies": 0.5, "rewards/chosen": -158.49862670898438, "rewards/margins": 2.724229335784912, "rewards/rejected": -161.2228546142578, "step": 46890 }, { "epoch": 2.71, "grad_norm": 0.2942066490650177, "learning_rate": 9.535972754363559e-05, "logits/chosen": -19.098270416259766, "logits/rejected": -19.317256927490234, "logps/chosen": -2642.167236328125, "logps/rejected": -2730.08837890625, "loss": 1.4037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -194.9054718017578, "rewards/margins": 12.714174270629883, "rewards/rejected": -207.61962890625, "step": 46900 }, { "epoch": 2.72, "grad_norm": 0.012308057397603989, "learning_rate": 9.516622160300322e-05, "logits/chosen": -16.890291213989258, "logits/rejected": -17.74423599243164, "logps/chosen": -2976.3505859375, "logps/rejected": -2541.65869140625, "loss": 1.1866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -128.24700927734375, "rewards/margins": 9.26893138885498, "rewards/rejected": -137.5159454345703, "step": 46910 }, { "epoch": 2.72, "grad_norm": 233.16522216796875, "learning_rate": 9.497271566237083e-05, "logits/chosen": -14.567733764648438, "logits/rejected": -14.768058776855469, "logps/chosen": -2929.927001953125, "logps/rejected": -2951.619873046875, "loss": 8.7897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -124.65804290771484, "rewards/margins": 3.5660414695739746, "rewards/rejected": -128.22409057617188, "step": 46920 }, { "epoch": 2.72, "grad_norm": 7.332197782261574e-08, "learning_rate": 9.477920972173847e-05, "logits/chosen": -17.80716896057129, "logits/rejected": -17.765586853027344, "logps/chosen": -3014.05517578125, "logps/rejected": -2844.803955078125, "loss": 3.4924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.75247192382812, "rewards/margins": 3.415675401687622, "rewards/rejected": -200.1681671142578, "step": 46930 }, { "epoch": 2.72, "grad_norm": 2.372164317421266e-06, "learning_rate": 9.458570378110608e-05, "logits/chosen": -16.045642852783203, "logits/rejected": -14.698420524597168, "logps/chosen": -3038.832275390625, "logps/rejected": -3343.680419921875, "loss": 0.7308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -118.71653747558594, "rewards/margins": 11.227834701538086, "rewards/rejected": -129.94436645507812, "step": 46940 }, { "epoch": 2.72, "grad_norm": 0.3678848147392273, "learning_rate": 9.43921978404737e-05, "logits/chosen": -18.380069732666016, "logits/rejected": -19.69981575012207, "logps/chosen": -2561.2509765625, "logps/rejected": -2373.733154296875, "loss": 14.0633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -183.97442626953125, "rewards/margins": -1.9470531940460205, "rewards/rejected": -182.02737426757812, "step": 46950 }, { "epoch": 2.72, "grad_norm": 121.47106170654297, "learning_rate": 9.419869189984133e-05, "logits/chosen": -18.493013381958008, "logits/rejected": -19.29393196105957, "logps/chosen": -2959.98193359375, "logps/rejected": -2880.45166015625, "loss": 16.2716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.44566345214844, "rewards/margins": -7.393816947937012, "rewards/rejected": -182.05184936523438, "step": 46960 }, { "epoch": 2.72, "grad_norm": 2.403105270332162e-09, "learning_rate": 9.400518595920896e-05, "logits/chosen": -19.57381820678711, "logits/rejected": -20.304561614990234, "logps/chosen": -2780.52392578125, "logps/rejected": -2384.017333984375, "loss": 2.4211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -152.45774841308594, "rewards/margins": 7.485877990722656, "rewards/rejected": -159.94363403320312, "step": 46970 }, { "epoch": 2.72, "grad_norm": 0.005854803137481213, "learning_rate": 9.381168001857656e-05, "logits/chosen": -16.592044830322266, "logits/rejected": -17.493972778320312, "logps/chosen": -2628.2236328125, "logps/rejected": -2556.293701171875, "loss": 0.4695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -184.9599151611328, "rewards/margins": 5.674899101257324, "rewards/rejected": -190.6348114013672, "step": 46980 }, { "epoch": 2.72, "grad_norm": 1.8880326747894287, "learning_rate": 9.36181740779442e-05, "logits/chosen": -18.08363914489746, "logits/rejected": -20.507722854614258, "logps/chosen": -2711.209228515625, "logps/rejected": -2760.921142578125, "loss": 0.6884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -230.38778686523438, "rewards/margins": 12.584356307983398, "rewards/rejected": -242.97213745117188, "step": 46990 }, { "epoch": 2.72, "grad_norm": 0.050982698798179626, "learning_rate": 9.342466813731181e-05, "logits/chosen": -19.4686279296875, "logits/rejected": -19.645299911499023, "logps/chosen": -2608.778076171875, "logps/rejected": -2657.998291015625, "loss": 1.1705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.3511505126953, "rewards/margins": 21.29375648498535, "rewards/rejected": -165.64491271972656, "step": 47000 }, { "epoch": 2.72, "grad_norm": 214.31082153320312, "learning_rate": 9.323116219667944e-05, "logits/chosen": -16.07670021057129, "logits/rejected": -16.16388511657715, "logps/chosen": -2968.40234375, "logps/rejected": -2640.176025390625, "loss": 4.6476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -104.9427261352539, "rewards/margins": 4.072274208068848, "rewards/rejected": -109.0149917602539, "step": 47010 }, { "epoch": 2.72, "grad_norm": 0.009555382654070854, "learning_rate": 9.303765625604706e-05, "logits/chosen": -15.905512809753418, "logits/rejected": -16.15439224243164, "logps/chosen": -3281.158203125, "logps/rejected": -3166.048095703125, "loss": 3.3743, "rewards/accuracies": 0.5, "rewards/chosen": -172.73387145996094, "rewards/margins": 1.3594944477081299, "rewards/rejected": -174.09335327148438, "step": 47020 }, { "epoch": 2.72, "grad_norm": 140.15179443359375, "learning_rate": 9.284415031541468e-05, "logits/chosen": -17.26930809020996, "logits/rejected": -17.62939453125, "logps/chosen": -2981.2138671875, "logps/rejected": -3005.6826171875, "loss": 12.2138, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -164.3195343017578, "rewards/margins": -8.542142868041992, "rewards/rejected": -155.7773895263672, "step": 47030 }, { "epoch": 2.72, "grad_norm": 4.364371761766961e-06, "learning_rate": 9.265064437478231e-05, "logits/chosen": -16.49339485168457, "logits/rejected": -16.861461639404297, "logps/chosen": -2986.580810546875, "logps/rejected": -2766.635498046875, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -182.5279998779297, "rewards/margins": 15.271214485168457, "rewards/rejected": -197.79922485351562, "step": 47040 }, { "epoch": 2.72, "grad_norm": 0.0769069716334343, "learning_rate": 9.245713843414994e-05, "logits/chosen": -18.688312530517578, "logits/rejected": -19.21283721923828, "logps/chosen": -2461.828369140625, "logps/rejected": -2496.54931640625, "loss": 0.65, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -209.4751739501953, "rewards/margins": 7.510459899902344, "rewards/rejected": -216.98562622070312, "step": 47050 }, { "epoch": 2.72, "grad_norm": 2.416415867401156e-08, "learning_rate": 9.226363249351754e-05, "logits/chosen": -20.096269607543945, "logits/rejected": -19.785009384155273, "logps/chosen": -2834.614501953125, "logps/rejected": -2554.296142578125, "loss": 3.9849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -223.0517120361328, "rewards/margins": 3.4454193115234375, "rewards/rejected": -226.4971466064453, "step": 47060 }, { "epoch": 2.72, "grad_norm": 3.891935110092163, "learning_rate": 9.207012655288517e-05, "logits/chosen": -17.268924713134766, "logits/rejected": -18.401897430419922, "logps/chosen": -2803.27783203125, "logps/rejected": -2748.785888671875, "loss": 3.3565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -143.23538208007812, "rewards/margins": 17.80661392211914, "rewards/rejected": -161.04197692871094, "step": 47070 }, { "epoch": 2.73, "grad_norm": 0.003979544620960951, "learning_rate": 9.18766206122528e-05, "logits/chosen": -22.097332000732422, "logits/rejected": -21.88504981994629, "logps/chosen": -2558.890380859375, "logps/rejected": -2782.850830078125, "loss": 6.7913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.2987823486328, "rewards/margins": 4.92611026763916, "rewards/rejected": -168.2248992919922, "step": 47080 }, { "epoch": 2.73, "grad_norm": 41.8127326965332, "learning_rate": 9.168311467162042e-05, "logits/chosen": -16.489276885986328, "logits/rejected": -16.43653678894043, "logps/chosen": -3093.823486328125, "logps/rejected": -2956.55322265625, "loss": 1.1616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -117.41636657714844, "rewards/margins": 14.1762113571167, "rewards/rejected": -131.5925750732422, "step": 47090 }, { "epoch": 2.73, "grad_norm": 0.0001555004419060424, "learning_rate": 9.148960873098805e-05, "logits/chosen": -17.778987884521484, "logits/rejected": -18.50061798095703, "logps/chosen": -2966.696044921875, "logps/rejected": -2766.9931640625, "loss": 3.8432, "rewards/accuracies": 0.5, "rewards/chosen": -191.39918518066406, "rewards/margins": 2.501786708831787, "rewards/rejected": -193.90097045898438, "step": 47100 }, { "epoch": 2.73, "grad_norm": 63.726524353027344, "learning_rate": 9.129610279035566e-05, "logits/chosen": -17.12319564819336, "logits/rejected": -18.01656150817871, "logps/chosen": -2540.120849609375, "logps/rejected": -2692.91845703125, "loss": 3.4201, "rewards/accuracies": 0.5, "rewards/chosen": -201.70712280273438, "rewards/margins": 2.2810111045837402, "rewards/rejected": -203.98814392089844, "step": 47110 }, { "epoch": 2.73, "grad_norm": 84.49758911132812, "learning_rate": 9.110259684972329e-05, "logits/chosen": -17.568674087524414, "logits/rejected": -19.19782257080078, "logps/chosen": -2969.473388671875, "logps/rejected": -2659.61083984375, "loss": 1.8642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -180.74192810058594, "rewards/margins": 17.562847137451172, "rewards/rejected": -198.30477905273438, "step": 47120 }, { "epoch": 2.73, "grad_norm": 2.264414513102747e-14, "learning_rate": 9.090909090909092e-05, "logits/chosen": -21.120119094848633, "logits/rejected": -21.603885650634766, "logps/chosen": -2487.46923828125, "logps/rejected": -2555.231689453125, "loss": 5.6628, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -189.2828826904297, "rewards/margins": 0.668765664100647, "rewards/rejected": -189.95162963867188, "step": 47130 }, { "epoch": 2.73, "grad_norm": 9.64339285508542e-12, "learning_rate": 9.071558496845853e-05, "logits/chosen": -17.3255672454834, "logits/rejected": -18.240293502807617, "logps/chosen": -2607.751708984375, "logps/rejected": -2459.900146484375, "loss": 4.4512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -146.2892608642578, "rewards/margins": 2.058741331100464, "rewards/rejected": -148.34800720214844, "step": 47140 }, { "epoch": 2.73, "grad_norm": 0.01480395719408989, "learning_rate": 9.052207902782615e-05, "logits/chosen": -16.73450469970703, "logits/rejected": -17.325946807861328, "logps/chosen": -3039.42236328125, "logps/rejected": -2856.93115234375, "loss": 0.9496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -134.97393798828125, "rewards/margins": 6.319639682769775, "rewards/rejected": -141.2935791015625, "step": 47150 }, { "epoch": 2.73, "grad_norm": 100.01811981201172, "learning_rate": 9.032857308719378e-05, "logits/chosen": -15.689329147338867, "logits/rejected": -17.36211395263672, "logps/chosen": -3251.12451171875, "logps/rejected": -3255.80712890625, "loss": 0.3572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -168.64527893066406, "rewards/margins": 15.894304275512695, "rewards/rejected": -184.53958129882812, "step": 47160 }, { "epoch": 2.73, "grad_norm": 5.261358091956936e-05, "learning_rate": 9.01350671465614e-05, "logits/chosen": -14.030499458312988, "logits/rejected": -14.173405647277832, "logps/chosen": -2916.523193359375, "logps/rejected": -2858.143310546875, "loss": 0.5441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -96.73309326171875, "rewards/margins": 6.864691257476807, "rewards/rejected": -103.5977783203125, "step": 47170 }, { "epoch": 2.73, "grad_norm": 43.87968063354492, "learning_rate": 8.994156120592903e-05, "logits/chosen": -19.33584213256836, "logits/rejected": -18.40804672241211, "logps/chosen": -3227.751220703125, "logps/rejected": -3202.7490234375, "loss": 8.5076, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -181.9225311279297, "rewards/margins": -7.301258087158203, "rewards/rejected": -174.6212921142578, "step": 47180 }, { "epoch": 2.73, "grad_norm": 0.9834911227226257, "learning_rate": 8.974805526529665e-05, "logits/chosen": -17.272178649902344, "logits/rejected": -17.97565269470215, "logps/chosen": -2809.35400390625, "logps/rejected": -2795.49462890625, "loss": 0.1654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -199.69686889648438, "rewards/margins": 11.27357292175293, "rewards/rejected": -210.97042846679688, "step": 47190 }, { "epoch": 2.73, "grad_norm": 120.87271881103516, "learning_rate": 8.955454932466426e-05, "logits/chosen": -14.611906051635742, "logits/rejected": -15.09815502166748, "logps/chosen": -2813.538330078125, "logps/rejected": -2396.77001953125, "loss": 3.7254, "rewards/accuracies": 0.5, "rewards/chosen": -120.83625793457031, "rewards/margins": 11.876794815063477, "rewards/rejected": -132.7130584716797, "step": 47200 }, { "epoch": 2.73, "grad_norm": 3.4643935578060336e-06, "learning_rate": 8.93610433840319e-05, "logits/chosen": -16.588918685913086, "logits/rejected": -16.73183822631836, "logps/chosen": -2995.800048828125, "logps/rejected": -2903.821044921875, "loss": 1.0591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.25157165527344, "rewards/margins": 8.824104309082031, "rewards/rejected": -171.07568359375, "step": 47210 }, { "epoch": 2.73, "grad_norm": 113.83354949951172, "learning_rate": 8.916753744339951e-05, "logits/chosen": -16.03011131286621, "logits/rejected": -16.353872299194336, "logps/chosen": -3138.51220703125, "logps/rejected": -2992.90478515625, "loss": 9.835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.04566955566406, "rewards/margins": -5.105288505554199, "rewards/rejected": -155.94039916992188, "step": 47220 }, { "epoch": 2.73, "grad_norm": 3.813368891492086e-13, "learning_rate": 8.897403150276713e-05, "logits/chosen": -17.379945755004883, "logits/rejected": -17.54629898071289, "logps/chosen": -3195.92626953125, "logps/rejected": -2636.043701171875, "loss": 25.6974, "rewards/accuracies": 0.5, "rewards/chosen": -201.63201904296875, "rewards/margins": -13.064834594726562, "rewards/rejected": -188.5672149658203, "step": 47230 }, { "epoch": 2.73, "grad_norm": 0.05172373726963997, "learning_rate": 8.878052556213476e-05, "logits/chosen": -17.167766571044922, "logits/rejected": -18.194374084472656, "logps/chosen": -3041.45947265625, "logps/rejected": -2753.900390625, "loss": 4.8731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.4348602294922, "rewards/margins": 2.863802433013916, "rewards/rejected": -169.29867553710938, "step": 47240 }, { "epoch": 2.74, "grad_norm": 0.0, "learning_rate": 8.858701962150238e-05, "logits/chosen": -19.983612060546875, "logits/rejected": -19.548526763916016, "logps/chosen": -2945.5771484375, "logps/rejected": -2962.171875, "loss": 7.745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -221.8064727783203, "rewards/margins": 6.223801136016846, "rewards/rejected": -228.0302734375, "step": 47250 }, { "epoch": 2.74, "grad_norm": 50.62356948852539, "learning_rate": 8.839351368087001e-05, "logits/chosen": -16.977983474731445, "logits/rejected": -16.967540740966797, "logps/chosen": -2873.724609375, "logps/rejected": -3012.00927734375, "loss": 2.2418, "rewards/accuracies": 0.5, "rewards/chosen": -163.754150390625, "rewards/margins": 2.576866626739502, "rewards/rejected": -166.33102416992188, "step": 47260 }, { "epoch": 2.74, "grad_norm": 0.6169374585151672, "learning_rate": 8.820000774023764e-05, "logits/chosen": -18.05877113342285, "logits/rejected": -18.683517456054688, "logps/chosen": -2962.09326171875, "logps/rejected": -2845.2744140625, "loss": 3.2999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -193.81715393066406, "rewards/margins": 1.482142448425293, "rewards/rejected": -195.29928588867188, "step": 47270 }, { "epoch": 2.74, "grad_norm": 6.322757428733894e-08, "learning_rate": 8.800650179960524e-05, "logits/chosen": -19.061864852905273, "logits/rejected": -19.186208724975586, "logps/chosen": -3002.59326171875, "logps/rejected": -2809.17626953125, "loss": 0.1265, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -187.95156860351562, "rewards/margins": 11.069008827209473, "rewards/rejected": -199.02056884765625, "step": 47280 }, { "epoch": 2.74, "grad_norm": 0.0006779898540116847, "learning_rate": 8.781299585897287e-05, "logits/chosen": -16.648271560668945, "logits/rejected": -17.827869415283203, "logps/chosen": -2993.30615234375, "logps/rejected": -2645.509521484375, "loss": 1.7352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -180.81382751464844, "rewards/margins": 9.445456504821777, "rewards/rejected": -190.25929260253906, "step": 47290 }, { "epoch": 2.74, "grad_norm": 2.710559129714966, "learning_rate": 8.76194899183405e-05, "logits/chosen": -18.10491371154785, "logits/rejected": -18.43417739868164, "logps/chosen": -2919.96142578125, "logps/rejected": -2620.911376953125, "loss": 7.0271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -115.30326080322266, "rewards/margins": -0.018642712384462357, "rewards/rejected": -115.28460693359375, "step": 47300 }, { "epoch": 2.74, "grad_norm": 8.392436257054214e-08, "learning_rate": 8.742598397770812e-05, "logits/chosen": -16.850589752197266, "logits/rejected": -16.017330169677734, "logps/chosen": -2770.010009765625, "logps/rejected": -3012.195556640625, "loss": 6.7345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.10107421875, "rewards/margins": 2.8502049446105957, "rewards/rejected": -163.95127868652344, "step": 47310 }, { "epoch": 2.74, "grad_norm": 3.6497268676757812, "learning_rate": 8.723247803707574e-05, "logits/chosen": -20.41473960876465, "logits/rejected": -21.2739315032959, "logps/chosen": -2546.007568359375, "logps/rejected": -2555.4267578125, "loss": 0.5027, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -223.96444702148438, "rewards/margins": 8.984048843383789, "rewards/rejected": -232.94851684570312, "step": 47320 }, { "epoch": 2.74, "grad_norm": 1.5041824579238892, "learning_rate": 8.703897209644336e-05, "logits/chosen": -23.169912338256836, "logits/rejected": -23.9212589263916, "logps/chosen": -2719.78759765625, "logps/rejected": -2716.7705078125, "loss": 4.9856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -244.0699005126953, "rewards/margins": 3.066270351409912, "rewards/rejected": -247.1361541748047, "step": 47330 }, { "epoch": 2.74, "grad_norm": 1.0317741105936307e-09, "learning_rate": 8.684546615581099e-05, "logits/chosen": -14.604784965515137, "logits/rejected": -14.812049865722656, "logps/chosen": -3227.210693359375, "logps/rejected": -3116.358642578125, "loss": 2.8211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.5214080810547, "rewards/margins": 10.185593605041504, "rewards/rejected": -154.70700073242188, "step": 47340 }, { "epoch": 2.74, "grad_norm": 6.2433282437268645e-06, "learning_rate": 8.665196021517862e-05, "logits/chosen": -13.555932998657227, "logits/rejected": -14.216781616210938, "logps/chosen": -3400.1171875, "logps/rejected": -3207.42138671875, "loss": 1.7592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -103.73600006103516, "rewards/margins": 7.20568323135376, "rewards/rejected": -110.9416732788086, "step": 47350 }, { "epoch": 2.74, "grad_norm": 1.1069369065808132e-06, "learning_rate": 8.645845427454622e-05, "logits/chosen": -15.388574600219727, "logits/rejected": -15.380746841430664, "logps/chosen": -2585.870849609375, "logps/rejected": -2988.95654296875, "loss": 7.1209, "rewards/accuracies": 0.5, "rewards/chosen": -134.15316772460938, "rewards/margins": -4.252368450164795, "rewards/rejected": -129.9008026123047, "step": 47360 }, { "epoch": 2.74, "grad_norm": 3.0068941179193023e-14, "learning_rate": 8.626494833391385e-05, "logits/chosen": -17.29294204711914, "logits/rejected": -17.675073623657227, "logps/chosen": -3089.63916015625, "logps/rejected": -2469.039306640625, "loss": 3.0806, "rewards/accuracies": 0.5, "rewards/chosen": -175.81565856933594, "rewards/margins": 6.735597133636475, "rewards/rejected": -182.55125427246094, "step": 47370 }, { "epoch": 2.74, "grad_norm": 1.5201366554418172e-10, "learning_rate": 8.607144239328148e-05, "logits/chosen": -15.114240646362305, "logits/rejected": -15.251867294311523, "logps/chosen": -3323.751220703125, "logps/rejected": -2998.799072265625, "loss": 3.4668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -145.80783081054688, "rewards/margins": 5.112765312194824, "rewards/rejected": -150.92059326171875, "step": 47380 }, { "epoch": 2.74, "grad_norm": 0.04394284263253212, "learning_rate": 8.58779364526491e-05, "logits/chosen": -17.46829605102539, "logits/rejected": -18.517770767211914, "logps/chosen": -2387.425048828125, "logps/rejected": -2562.242919921875, "loss": 0.975, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -165.79977416992188, "rewards/margins": 6.541795253753662, "rewards/rejected": -172.34156799316406, "step": 47390 }, { "epoch": 2.74, "grad_norm": 65.1903076171875, "learning_rate": 8.568443051201672e-05, "logits/chosen": -16.937053680419922, "logits/rejected": -17.111629486083984, "logps/chosen": -2935.989013671875, "logps/rejected": -2986.363037109375, "loss": 6.8454, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -120.77767181396484, "rewards/margins": -0.8175009489059448, "rewards/rejected": -119.9601821899414, "step": 47400 }, { "epoch": 2.74, "grad_norm": 0.03145989030599594, "learning_rate": 8.549092457138433e-05, "logits/chosen": -14.754219055175781, "logits/rejected": -15.280370712280273, "logps/chosen": -3058.22998046875, "logps/rejected": -3113.88720703125, "loss": 2.358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -163.1621856689453, "rewards/margins": 5.062331199645996, "rewards/rejected": -168.22451782226562, "step": 47410 }, { "epoch": 2.74, "grad_norm": 9.397537727849618e-18, "learning_rate": 8.529741863075196e-05, "logits/chosen": -15.855981826782227, "logits/rejected": -16.656192779541016, "logps/chosen": -3192.85205078125, "logps/rejected": -3082.787841796875, "loss": 1.7733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -234.3723602294922, "rewards/margins": 16.133922576904297, "rewards/rejected": -250.5062713623047, "step": 47420 }, { "epoch": 2.75, "grad_norm": 2.2018717960037293e-09, "learning_rate": 8.51039126901196e-05, "logits/chosen": -16.034164428710938, "logits/rejected": -15.28472900390625, "logps/chosen": -3051.648681640625, "logps/rejected": -3043.614013671875, "loss": 0.916, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -139.3990020751953, "rewards/margins": 19.981176376342773, "rewards/rejected": -159.3801727294922, "step": 47430 }, { "epoch": 2.75, "grad_norm": 24.162071228027344, "learning_rate": 8.491040674948721e-05, "logits/chosen": -15.51909351348877, "logits/rejected": -15.313003540039062, "logps/chosen": -3288.452392578125, "logps/rejected": -3025.362548828125, "loss": 1.1301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.67015075683594, "rewards/margins": 8.94209098815918, "rewards/rejected": -153.61224365234375, "step": 47440 }, { "epoch": 2.75, "grad_norm": 1.972048913057023e-10, "learning_rate": 8.471690080885483e-05, "logits/chosen": -16.694772720336914, "logits/rejected": -17.825368881225586, "logps/chosen": -3014.27783203125, "logps/rejected": -2787.46337890625, "loss": 9.0424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -191.76841735839844, "rewards/margins": -2.430389881134033, "rewards/rejected": -189.33804321289062, "step": 47450 }, { "epoch": 2.75, "grad_norm": 0.00013179396046325564, "learning_rate": 8.452339486822246e-05, "logits/chosen": -16.40399742126465, "logits/rejected": -18.33819580078125, "logps/chosen": -3325.33203125, "logps/rejected": -3129.701171875, "loss": 1.978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -119.98805236816406, "rewards/margins": 6.582453727722168, "rewards/rejected": -126.57049560546875, "step": 47460 }, { "epoch": 2.75, "grad_norm": 0.0001646138698561117, "learning_rate": 8.432988892759008e-05, "logits/chosen": -17.240245819091797, "logits/rejected": -17.71604347229004, "logps/chosen": -2717.90966796875, "logps/rejected": -2873.974609375, "loss": 0.4715, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -185.81130981445312, "rewards/margins": 9.871084213256836, "rewards/rejected": -195.682373046875, "step": 47470 }, { "epoch": 2.75, "grad_norm": 8.407256661679963e-12, "learning_rate": 8.413638298695771e-05, "logits/chosen": -12.710954666137695, "logits/rejected": -13.345100402832031, "logps/chosen": -3347.3828125, "logps/rejected": -3419.43408203125, "loss": 4.4924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -118.08902740478516, "rewards/margins": 7.076484680175781, "rewards/rejected": -125.16551208496094, "step": 47480 }, { "epoch": 2.75, "grad_norm": 57.40622329711914, "learning_rate": 8.394287704632533e-05, "logits/chosen": -16.249460220336914, "logits/rejected": -17.029422760009766, "logps/chosen": -3101.00537109375, "logps/rejected": -3030.8330078125, "loss": 0.5892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -156.16844177246094, "rewards/margins": 10.903219223022461, "rewards/rejected": -167.07168579101562, "step": 47490 }, { "epoch": 2.75, "grad_norm": 0.09347186237573624, "learning_rate": 8.374937110569294e-05, "logits/chosen": -18.75590705871582, "logits/rejected": -19.60524559020996, "logps/chosen": -2578.306640625, "logps/rejected": -2598.99169921875, "loss": 2.5515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -181.37124633789062, "rewards/margins": 7.707930088043213, "rewards/rejected": -189.0791778564453, "step": 47500 }, { "epoch": 2.75, "grad_norm": 104.34685516357422, "learning_rate": 8.355586516506057e-05, "logits/chosen": -13.718118667602539, "logits/rejected": -13.52794075012207, "logps/chosen": -2745.32666015625, "logps/rejected": -2807.56689453125, "loss": 4.117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -73.00257110595703, "rewards/margins": 8.294477462768555, "rewards/rejected": -81.29704284667969, "step": 47510 }, { "epoch": 2.75, "grad_norm": 5.172362216399051e-05, "learning_rate": 8.336235922442819e-05, "logits/chosen": -16.4176025390625, "logits/rejected": -18.86435317993164, "logps/chosen": -2792.23681640625, "logps/rejected": -2798.968017578125, "loss": 6.0697, "rewards/accuracies": 0.5, "rewards/chosen": -144.16006469726562, "rewards/margins": 4.145327091217041, "rewards/rejected": -148.30538940429688, "step": 47520 }, { "epoch": 2.75, "grad_norm": 30.73624038696289, "learning_rate": 8.316885328379581e-05, "logits/chosen": -17.706401824951172, "logits/rejected": -17.257598876953125, "logps/chosen": -3213.427490234375, "logps/rejected": -3178.21630859375, "loss": 10.8777, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -210.032470703125, "rewards/margins": -9.224161148071289, "rewards/rejected": -200.8083038330078, "step": 47530 }, { "epoch": 2.75, "grad_norm": 82.75797271728516, "learning_rate": 8.297534734316344e-05, "logits/chosen": -18.385374069213867, "logits/rejected": -19.930139541625977, "logps/chosen": -2571.38525390625, "logps/rejected": -2584.49853515625, "loss": 1.4896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -153.86141967773438, "rewards/margins": 11.810126304626465, "rewards/rejected": -165.67156982421875, "step": 47540 }, { "epoch": 2.75, "grad_norm": 114.1341781616211, "learning_rate": 8.278184140253106e-05, "logits/chosen": -19.807056427001953, "logits/rejected": -22.19816017150879, "logps/chosen": -3045.26904296875, "logps/rejected": -2987.123046875, "loss": 6.5298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.4727325439453, "rewards/margins": 3.665438175201416, "rewards/rejected": -201.13815307617188, "step": 47550 }, { "epoch": 2.75, "grad_norm": 0.011859605088829994, "learning_rate": 8.258833546189869e-05, "logits/chosen": -15.670740127563477, "logits/rejected": -16.079845428466797, "logps/chosen": -2953.982421875, "logps/rejected": -3075.889892578125, "loss": 4.1965, "rewards/accuracies": 0.5, "rewards/chosen": -161.33761596679688, "rewards/margins": 3.8825440406799316, "rewards/rejected": -165.2201690673828, "step": 47560 }, { "epoch": 2.75, "grad_norm": 0.24795107543468475, "learning_rate": 8.23948295212663e-05, "logits/chosen": -17.6588077545166, "logits/rejected": -19.012012481689453, "logps/chosen": -2934.41650390625, "logps/rejected": -3050.332763671875, "loss": 1.871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -191.7216339111328, "rewards/margins": 13.737895011901855, "rewards/rejected": -205.4595489501953, "step": 47570 }, { "epoch": 2.75, "grad_norm": 1.5346805981788205e-17, "learning_rate": 8.220132358063392e-05, "logits/chosen": -15.0368070602417, "logits/rejected": -15.045602798461914, "logps/chosen": -3221.856689453125, "logps/rejected": -3223.29150390625, "loss": 1.0023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.11598205566406, "rewards/margins": 11.346078872680664, "rewards/rejected": -152.46206665039062, "step": 47580 }, { "epoch": 2.75, "grad_norm": 2.4788031627309692e-09, "learning_rate": 8.200781764000155e-05, "logits/chosen": -14.196672439575195, "logits/rejected": -14.672322273254395, "logps/chosen": -3110.173095703125, "logps/rejected": -2983.42724609375, "loss": 2.5075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.62074279785156, "rewards/margins": 11.358617782592773, "rewards/rejected": -170.97935485839844, "step": 47590 }, { "epoch": 2.76, "grad_norm": 7.290053005704067e-10, "learning_rate": 8.181431169936918e-05, "logits/chosen": -18.708370208740234, "logits/rejected": -20.377758026123047, "logps/chosen": -2769.50244140625, "logps/rejected": -2736.35986328125, "loss": 4.7277, "rewards/accuracies": 0.5, "rewards/chosen": -241.85009765625, "rewards/margins": 3.878308057785034, "rewards/rejected": -245.72842407226562, "step": 47600 }, { "epoch": 2.76, "grad_norm": 3.1686866672209213e-12, "learning_rate": 8.162080575873679e-05, "logits/chosen": -17.866619110107422, "logits/rejected": -18.253849029541016, "logps/chosen": -2404.880615234375, "logps/rejected": -2426.461181640625, "loss": 8.9913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -144.5657958984375, "rewards/margins": 10.837214469909668, "rewards/rejected": -155.40301513671875, "step": 47610 }, { "epoch": 2.76, "grad_norm": 39.80873107910156, "learning_rate": 8.142729981810442e-05, "logits/chosen": -17.610116958618164, "logits/rejected": -18.296829223632812, "logps/chosen": -2960.90380859375, "logps/rejected": -2710.007080078125, "loss": 5.5813, "rewards/accuracies": 0.5, "rewards/chosen": -190.46279907226562, "rewards/margins": -2.0359435081481934, "rewards/rejected": -188.42684936523438, "step": 47620 }, { "epoch": 2.76, "grad_norm": 21.855287551879883, "learning_rate": 8.123379387747203e-05, "logits/chosen": -16.468801498413086, "logits/rejected": -16.108661651611328, "logps/chosen": -2813.535400390625, "logps/rejected": -3003.826171875, "loss": 4.746, "rewards/accuracies": 0.5, "rewards/chosen": -145.8641357421875, "rewards/margins": 1.6571252346038818, "rewards/rejected": -147.52125549316406, "step": 47630 }, { "epoch": 2.76, "grad_norm": 91.92042541503906, "learning_rate": 8.104028793683966e-05, "logits/chosen": -14.16038990020752, "logits/rejected": -14.558525085449219, "logps/chosen": -2930.10791015625, "logps/rejected": -2740.828857421875, "loss": 1.1342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -151.37998962402344, "rewards/margins": 6.822295188903809, "rewards/rejected": -158.20230102539062, "step": 47640 }, { "epoch": 2.76, "grad_norm": 1.3557860346555262e-12, "learning_rate": 8.08467819962073e-05, "logits/chosen": -19.983930587768555, "logits/rejected": -19.449430465698242, "logps/chosen": -2974.85595703125, "logps/rejected": -3021.206298828125, "loss": 1.7406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -202.24020385742188, "rewards/margins": 9.95700740814209, "rewards/rejected": -212.1972198486328, "step": 47650 }, { "epoch": 2.76, "grad_norm": 0.9011901617050171, "learning_rate": 8.06532760555749e-05, "logits/chosen": -14.979762077331543, "logits/rejected": -14.962453842163086, "logps/chosen": -2909.185302734375, "logps/rejected": -3059.153076171875, "loss": 1.9209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.20223999023438, "rewards/margins": 7.534562110900879, "rewards/rejected": -195.73681640625, "step": 47660 }, { "epoch": 2.76, "grad_norm": 4.154006481170654, "learning_rate": 8.045977011494253e-05, "logits/chosen": -14.431938171386719, "logits/rejected": -14.43797779083252, "logps/chosen": -3087.3740234375, "logps/rejected": -2823.664794921875, "loss": 5.7584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -166.7181854248047, "rewards/margins": -0.24644088745117188, "rewards/rejected": -166.4717559814453, "step": 47670 }, { "epoch": 2.76, "grad_norm": 0.0007382318144664168, "learning_rate": 8.026626417431016e-05, "logits/chosen": -15.763934135437012, "logits/rejected": -16.186141967773438, "logps/chosen": -3072.844970703125, "logps/rejected": -2827.334716796875, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -123.58103942871094, "rewards/margins": 12.20495891571045, "rewards/rejected": -135.78599548339844, "step": 47680 }, { "epoch": 2.76, "grad_norm": 0.05327896401286125, "learning_rate": 8.007275823367778e-05, "logits/chosen": -16.690296173095703, "logits/rejected": -16.78142738342285, "logps/chosen": -3002.0078125, "logps/rejected": -2795.315185546875, "loss": 1.1379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -154.8206024169922, "rewards/margins": 20.568262100219727, "rewards/rejected": -175.3888702392578, "step": 47690 }, { "epoch": 2.76, "grad_norm": 1.2832923857786227e-05, "learning_rate": 7.98792522930454e-05, "logits/chosen": -19.345191955566406, "logits/rejected": -18.730684280395508, "logps/chosen": -2466.71875, "logps/rejected": -2549.438232421875, "loss": 1.2098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -183.10189819335938, "rewards/margins": 8.002791404724121, "rewards/rejected": -191.1046905517578, "step": 47700 }, { "epoch": 2.76, "grad_norm": 106.55347442626953, "learning_rate": 7.968574635241301e-05, "logits/chosen": -15.588325500488281, "logits/rejected": -15.624926567077637, "logps/chosen": -2719.093994140625, "logps/rejected": -2700.385009765625, "loss": 1.4661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -149.3945770263672, "rewards/margins": 11.921091079711914, "rewards/rejected": -161.31565856933594, "step": 47710 }, { "epoch": 2.76, "grad_norm": 87.01944732666016, "learning_rate": 7.949224041178064e-05, "logits/chosen": -14.768899917602539, "logits/rejected": -14.869722366333008, "logps/chosen": -3194.530517578125, "logps/rejected": -3091.619384765625, "loss": 3.1488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -143.34805297851562, "rewards/margins": 1.8151156902313232, "rewards/rejected": -145.16317749023438, "step": 47720 }, { "epoch": 2.76, "grad_norm": 108.03256225585938, "learning_rate": 7.929873447114827e-05, "logits/chosen": -14.482864379882812, "logits/rejected": -14.304901123046875, "logps/chosen": -3143.61767578125, "logps/rejected": -2878.6767578125, "loss": 2.0063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.93377685546875, "rewards/margins": 6.348143577575684, "rewards/rejected": -139.2819366455078, "step": 47730 }, { "epoch": 2.76, "grad_norm": 0.020080741494894028, "learning_rate": 7.910522853051588e-05, "logits/chosen": -16.221349716186523, "logits/rejected": -16.349557876586914, "logps/chosen": -3063.34765625, "logps/rejected": -2584.97021484375, "loss": 1.4867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -124.22945404052734, "rewards/margins": 10.201382637023926, "rewards/rejected": -134.4308319091797, "step": 47740 }, { "epoch": 2.76, "grad_norm": 4.045415878295898, "learning_rate": 7.891172258988351e-05, "logits/chosen": -16.971981048583984, "logits/rejected": -17.982023239135742, "logps/chosen": -3168.385986328125, "logps/rejected": -3129.512939453125, "loss": 2.3599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -169.5927276611328, "rewards/margins": 17.970319747924805, "rewards/rejected": -187.5630645751953, "step": 47750 }, { "epoch": 2.76, "grad_norm": 15.984807968139648, "learning_rate": 7.871821664925114e-05, "logits/chosen": -18.47589683532715, "logits/rejected": -19.355018615722656, "logps/chosen": -2408.733642578125, "logps/rejected": -2395.639404296875, "loss": 0.1424, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -171.47152709960938, "rewards/margins": 14.953592300415039, "rewards/rejected": -186.4251251220703, "step": 47760 }, { "epoch": 2.77, "grad_norm": 1.459495413413947e-09, "learning_rate": 7.852471070861876e-05, "logits/chosen": -22.133548736572266, "logits/rejected": -22.843463897705078, "logps/chosen": -2743.481201171875, "logps/rejected": -2813.94677734375, "loss": 0.5988, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -209.99960327148438, "rewards/margins": 8.514829635620117, "rewards/rejected": -218.5144500732422, "step": 47770 }, { "epoch": 2.77, "grad_norm": 1.2748958139185973e-11, "learning_rate": 7.833120476798637e-05, "logits/chosen": -13.186986923217773, "logits/rejected": -12.872039794921875, "logps/chosen": -3120.08447265625, "logps/rejected": -2890.231689453125, "loss": 2.4364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -78.5771713256836, "rewards/margins": 5.692828178405762, "rewards/rejected": -84.27000427246094, "step": 47780 }, { "epoch": 2.77, "grad_norm": 2.4094769579646683e-21, "learning_rate": 7.8137698827354e-05, "logits/chosen": -17.617053985595703, "logits/rejected": -18.93935203552246, "logps/chosen": -2762.35888671875, "logps/rejected": -3057.998779296875, "loss": 2.1041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.26168823242188, "rewards/margins": 11.851893424987793, "rewards/rejected": -187.1136016845703, "step": 47790 }, { "epoch": 2.77, "grad_norm": 85.22762298583984, "learning_rate": 7.794419288672162e-05, "logits/chosen": -16.3154354095459, "logits/rejected": -16.193622589111328, "logps/chosen": -2885.37744140625, "logps/rejected": -3071.00244140625, "loss": 4.1524, "rewards/accuracies": 0.5, "rewards/chosen": -130.1229248046875, "rewards/margins": 3.3388423919677734, "rewards/rejected": -133.46176147460938, "step": 47800 }, { "epoch": 2.77, "grad_norm": 0.0007538052159361541, "learning_rate": 7.775068694608925e-05, "logits/chosen": -15.150152206420898, "logits/rejected": -15.67805290222168, "logps/chosen": -3041.348388671875, "logps/rejected": -2947.876708984375, "loss": 9.578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -224.81771850585938, "rewards/margins": -0.6192436218261719, "rewards/rejected": -224.198486328125, "step": 47810 }, { "epoch": 2.77, "grad_norm": 0.09747639298439026, "learning_rate": 7.755718100545687e-05, "logits/chosen": -16.387737274169922, "logits/rejected": -19.203872680664062, "logps/chosen": -2572.678466796875, "logps/rejected": -2687.12744140625, "loss": 1.6879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -197.84017944335938, "rewards/margins": 11.396524429321289, "rewards/rejected": -209.23672485351562, "step": 47820 }, { "epoch": 2.77, "grad_norm": 52.45663070678711, "learning_rate": 7.736367506482449e-05, "logits/chosen": -18.239421844482422, "logits/rejected": -16.908021926879883, "logps/chosen": -2952.86767578125, "logps/rejected": -2985.92236328125, "loss": 4.0911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -230.25674438476562, "rewards/margins": 3.018967390060425, "rewards/rejected": -233.2756805419922, "step": 47830 }, { "epoch": 2.77, "grad_norm": 0.028010942041873932, "learning_rate": 7.717016912419212e-05, "logits/chosen": -14.928520202636719, "logits/rejected": -15.124580383300781, "logps/chosen": -3212.00927734375, "logps/rejected": -3087.031005859375, "loss": 12.6133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.32485961914062, "rewards/margins": -1.7868702411651611, "rewards/rejected": -178.53799438476562, "step": 47840 }, { "epoch": 2.77, "grad_norm": 9.727533340454102, "learning_rate": 7.697666318355973e-05, "logits/chosen": -17.326858520507812, "logits/rejected": -17.3531494140625, "logps/chosen": -2722.77490234375, "logps/rejected": -2703.332763671875, "loss": 1.9697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.81997680664062, "rewards/margins": 5.9013543128967285, "rewards/rejected": -172.72134399414062, "step": 47850 }, { "epoch": 2.77, "grad_norm": 98.9539794921875, "learning_rate": 7.678315724292736e-05, "logits/chosen": -17.55999755859375, "logits/rejected": -18.43118667602539, "logps/chosen": -2798.66748046875, "logps/rejected": -2725.31787109375, "loss": 3.3507, "rewards/accuracies": 0.5, "rewards/chosen": -206.1182861328125, "rewards/margins": 4.935372829437256, "rewards/rejected": -211.0536346435547, "step": 47860 }, { "epoch": 2.77, "grad_norm": 71.87908172607422, "learning_rate": 7.658965130229498e-05, "logits/chosen": -18.139453887939453, "logits/rejected": -18.641889572143555, "logps/chosen": -2669.000732421875, "logps/rejected": -2519.796875, "loss": 13.2443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -235.8768310546875, "rewards/margins": -8.636772155761719, "rewards/rejected": -227.2400665283203, "step": 47870 }, { "epoch": 2.77, "grad_norm": 61.7551155090332, "learning_rate": 7.63961453616626e-05, "logits/chosen": -17.131851196289062, "logits/rejected": -17.28411293029785, "logps/chosen": -3008.29541015625, "logps/rejected": -3083.962890625, "loss": 4.8789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -138.70193481445312, "rewards/margins": 1.966395616531372, "rewards/rejected": -140.66830444335938, "step": 47880 }, { "epoch": 2.77, "grad_norm": 9.968956327099932e-09, "learning_rate": 7.620263942103023e-05, "logits/chosen": -14.913047790527344, "logits/rejected": -15.50433349609375, "logps/chosen": -3073.884033203125, "logps/rejected": -2935.4697265625, "loss": 0.5868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -199.2675323486328, "rewards/margins": 12.763227462768555, "rewards/rejected": -212.03076171875, "step": 47890 }, { "epoch": 2.77, "grad_norm": 0.05893512815237045, "learning_rate": 7.600913348039786e-05, "logits/chosen": -15.575323104858398, "logits/rejected": -16.50619125366211, "logps/chosen": -3078.7724609375, "logps/rejected": -2734.545654296875, "loss": 0.7643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -154.97727966308594, "rewards/margins": 16.293869018554688, "rewards/rejected": -171.27114868164062, "step": 47900 }, { "epoch": 2.77, "grad_norm": 0.5254961848258972, "learning_rate": 7.581562753976546e-05, "logits/chosen": -17.25163459777832, "logits/rejected": -16.989927291870117, "logps/chosen": -2763.201416015625, "logps/rejected": -2764.415771484375, "loss": 0.6899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.7274932861328, "rewards/margins": 13.93249797821045, "rewards/rejected": -184.66000366210938, "step": 47910 }, { "epoch": 2.77, "grad_norm": 0.0580577477812767, "learning_rate": 7.56221215991331e-05, "logits/chosen": -18.69370460510254, "logits/rejected": -19.916122436523438, "logps/chosen": -2697.37353515625, "logps/rejected": -2599.155029296875, "loss": 0.455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -178.09080505371094, "rewards/margins": 9.38615608215332, "rewards/rejected": -187.47695922851562, "step": 47920 }, { "epoch": 2.77, "grad_norm": 68.60777282714844, "learning_rate": 7.542861565850071e-05, "logits/chosen": -17.328594207763672, "logits/rejected": -17.058879852294922, "logps/chosen": -2729.93896484375, "logps/rejected": -2718.48681640625, "loss": 1.4946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -201.8804473876953, "rewards/margins": 7.194645881652832, "rewards/rejected": -209.07510375976562, "step": 47930 }, { "epoch": 2.77, "grad_norm": 19.312108993530273, "learning_rate": 7.523510971786834e-05, "logits/chosen": -12.92161750793457, "logits/rejected": -13.930662155151367, "logps/chosen": -3440.29638671875, "logps/rejected": -2663.25634765625, "loss": 1.9326, "rewards/accuracies": 0.5, "rewards/chosen": -84.6861801147461, "rewards/margins": 10.660913467407227, "rewards/rejected": -95.34709167480469, "step": 47940 }, { "epoch": 2.78, "grad_norm": 70.2379150390625, "learning_rate": 7.504160377723596e-05, "logits/chosen": -15.545930862426758, "logits/rejected": -15.632080078125, "logps/chosen": -2751.43212890625, "logps/rejected": -2899.07373046875, "loss": 7.3123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -152.85035705566406, "rewards/margins": 3.853320598602295, "rewards/rejected": -156.7036895751953, "step": 47950 }, { "epoch": 2.78, "grad_norm": 1.7806778487283736e-05, "learning_rate": 7.484809783660358e-05, "logits/chosen": -19.851030349731445, "logits/rejected": -20.944103240966797, "logps/chosen": -3172.099365234375, "logps/rejected": -2546.986083984375, "loss": 2.4328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -212.21499633789062, "rewards/margins": 5.8026227951049805, "rewards/rejected": -218.01760864257812, "step": 47960 }, { "epoch": 2.78, "grad_norm": 0.1691831648349762, "learning_rate": 7.465459189597121e-05, "logits/chosen": -16.063705444335938, "logits/rejected": -16.205379486083984, "logps/chosen": -3070.845458984375, "logps/rejected": -2753.88916015625, "loss": 1.9079, "rewards/accuracies": 0.5, "rewards/chosen": -195.00851440429688, "rewards/margins": 9.820834159851074, "rewards/rejected": -204.829345703125, "step": 47970 }, { "epoch": 2.78, "grad_norm": 28.096370697021484, "learning_rate": 7.446108595533884e-05, "logits/chosen": -17.295059204101562, "logits/rejected": -16.904438018798828, "logps/chosen": -2698.60498046875, "logps/rejected": -2652.038818359375, "loss": 0.8198, "rewards/accuracies": 0.5, "rewards/chosen": -230.73812866210938, "rewards/margins": 2.119807481765747, "rewards/rejected": -232.8579559326172, "step": 47980 }, { "epoch": 2.78, "grad_norm": 0.005948617123067379, "learning_rate": 7.426758001470646e-05, "logits/chosen": -16.637449264526367, "logits/rejected": -16.571552276611328, "logps/chosen": -2774.727783203125, "logps/rejected": -2989.86181640625, "loss": 1.0823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -213.4300994873047, "rewards/margins": 10.20158576965332, "rewards/rejected": -223.6317138671875, "step": 47990 }, { "epoch": 2.78, "grad_norm": 41.99845886230469, "learning_rate": 7.407407407407407e-05, "logits/chosen": -16.88983726501465, "logits/rejected": -17.723621368408203, "logps/chosen": -2530.70263671875, "logps/rejected": -2540.60986328125, "loss": 11.1486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -156.50088500976562, "rewards/margins": 8.716713905334473, "rewards/rejected": -165.2176055908203, "step": 48000 }, { "epoch": 2.78, "grad_norm": 8.622241809419506e-11, "learning_rate": 7.38805681334417e-05, "logits/chosen": -13.3042631149292, "logits/rejected": -13.389732360839844, "logps/chosen": -3284.561279296875, "logps/rejected": -3385.87646484375, "loss": 0.3482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -82.39350128173828, "rewards/margins": 12.82104206085205, "rewards/rejected": -95.21453857421875, "step": 48010 }, { "epoch": 2.78, "grad_norm": 287.5250549316406, "learning_rate": 7.368706219280932e-05, "logits/chosen": -15.61286449432373, "logits/rejected": -15.836015701293945, "logps/chosen": -2884.52294921875, "logps/rejected": -3126.17626953125, "loss": 5.5385, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.92166137695312, "rewards/margins": -1.649693489074707, "rewards/rejected": -133.27195739746094, "step": 48020 }, { "epoch": 2.78, "grad_norm": 3.4529186621057306e-08, "learning_rate": 7.349355625217695e-05, "logits/chosen": -15.807291984558105, "logits/rejected": -16.326637268066406, "logps/chosen": -2942.256591796875, "logps/rejected": -2594.365966796875, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": -118.8252182006836, "rewards/margins": 12.946003913879395, "rewards/rejected": -131.77122497558594, "step": 48030 }, { "epoch": 2.78, "grad_norm": 0.5433983206748962, "learning_rate": 7.330005031154456e-05, "logits/chosen": -18.4888973236084, "logits/rejected": -18.541019439697266, "logps/chosen": -2944.531494140625, "logps/rejected": -3040.10400390625, "loss": 1.3675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -259.7125549316406, "rewards/margins": 10.076398849487305, "rewards/rejected": -269.7889404296875, "step": 48040 }, { "epoch": 2.78, "grad_norm": 1.314728021621704, "learning_rate": 7.310654437091219e-05, "logits/chosen": -15.847394943237305, "logits/rejected": -15.777430534362793, "logps/chosen": -2583.66650390625, "logps/rejected": -2498.79150390625, "loss": 4.2956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -180.61766052246094, "rewards/margins": 4.139929294586182, "rewards/rejected": -184.75758361816406, "step": 48050 }, { "epoch": 2.78, "grad_norm": 0.41881394386291504, "learning_rate": 7.291303843027982e-05, "logits/chosen": -15.05305290222168, "logits/rejected": -15.225529670715332, "logps/chosen": -3007.7744140625, "logps/rejected": -2736.441650390625, "loss": 4.8363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.95846557617188, "rewards/margins": 2.3403544425964355, "rewards/rejected": -157.29884338378906, "step": 48060 }, { "epoch": 2.78, "grad_norm": 13.861214637756348, "learning_rate": 7.271953248964743e-05, "logits/chosen": -17.116836547851562, "logits/rejected": -17.170570373535156, "logps/chosen": -2855.81298828125, "logps/rejected": -2627.37548828125, "loss": 3.0078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -156.72195434570312, "rewards/margins": 0.39989835023880005, "rewards/rejected": -157.12184143066406, "step": 48070 }, { "epoch": 2.78, "grad_norm": 0.0005245545762591064, "learning_rate": 7.252602654901505e-05, "logits/chosen": -18.687152862548828, "logits/rejected": -19.105119705200195, "logps/chosen": -3036.466552734375, "logps/rejected": -2989.2568359375, "loss": 4.9804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -134.52780151367188, "rewards/margins": 2.736642599105835, "rewards/rejected": -137.2644500732422, "step": 48080 }, { "epoch": 2.78, "grad_norm": 0.0, "learning_rate": 7.233252060838268e-05, "logits/chosen": -17.24667739868164, "logits/rejected": -17.635921478271484, "logps/chosen": -2680.986572265625, "logps/rejected": -2702.339111328125, "loss": 1.8105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -193.57199096679688, "rewards/margins": 11.528036117553711, "rewards/rejected": -205.1000518798828, "step": 48090 }, { "epoch": 2.78, "grad_norm": 7.494385719299316, "learning_rate": 7.21390146677503e-05, "logits/chosen": -15.228134155273438, "logits/rejected": -15.485635757446289, "logps/chosen": -3119.643798828125, "logps/rejected": -2680.55322265625, "loss": 0.5271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -163.88986206054688, "rewards/margins": 9.28724193572998, "rewards/rejected": -173.17710876464844, "step": 48100 }, { "epoch": 2.78, "grad_norm": 0.010275430046021938, "learning_rate": 7.194550872711793e-05, "logits/chosen": -15.991169929504395, "logits/rejected": -16.541975021362305, "logps/chosen": -2836.38232421875, "logps/rejected": -2775.57080078125, "loss": 15.558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -149.1579132080078, "rewards/margins": -10.595365524291992, "rewards/rejected": -138.56253051757812, "step": 48110 }, { "epoch": 2.79, "grad_norm": 0.006618978921324015, "learning_rate": 7.175200278648553e-05, "logits/chosen": -17.40857696533203, "logits/rejected": -19.860647201538086, "logps/chosen": -2917.262939453125, "logps/rejected": -2826.402587890625, "loss": 2.5711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -213.6178436279297, "rewards/margins": 17.27973747253418, "rewards/rejected": -230.89761352539062, "step": 48120 }, { "epoch": 2.79, "grad_norm": 8.288511889986694e-05, "learning_rate": 7.155849684585316e-05, "logits/chosen": -19.674213409423828, "logits/rejected": -20.4986629486084, "logps/chosen": -2659.301513671875, "logps/rejected": -2835.53662109375, "loss": 2.3997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -223.1975860595703, "rewards/margins": 5.7402448654174805, "rewards/rejected": -228.9378204345703, "step": 48130 }, { "epoch": 2.79, "grad_norm": 4.37605125747087e-11, "learning_rate": 7.13649909052208e-05, "logits/chosen": -15.84081745147705, "logits/rejected": -15.407907485961914, "logps/chosen": -3009.42041015625, "logps/rejected": -2697.3740234375, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -196.7152099609375, "rewards/margins": 18.5704402923584, "rewards/rejected": -215.28564453125, "step": 48140 }, { "epoch": 2.79, "grad_norm": 1.8604432823818426e-16, "learning_rate": 7.117148496458841e-05, "logits/chosen": -16.815658569335938, "logits/rejected": -18.053476333618164, "logps/chosen": -2960.46728515625, "logps/rejected": -2926.966552734375, "loss": 8.6327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -188.16793823242188, "rewards/margins": 3.1101181507110596, "rewards/rejected": -191.27804565429688, "step": 48150 }, { "epoch": 2.79, "grad_norm": 28.119823455810547, "learning_rate": 7.097797902395604e-05, "logits/chosen": -16.40169906616211, "logits/rejected": -17.13123321533203, "logps/chosen": -2704.69189453125, "logps/rejected": -2638.106201171875, "loss": 2.7972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.20289611816406, "rewards/margins": 7.104959964752197, "rewards/rejected": -151.307861328125, "step": 48160 }, { "epoch": 2.79, "grad_norm": 1.999796106488816e-09, "learning_rate": 7.078447308332366e-05, "logits/chosen": -19.790199279785156, "logits/rejected": -20.672292709350586, "logps/chosen": -2818.621826171875, "logps/rejected": -2820.34765625, "loss": 2.2419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -249.5193634033203, "rewards/margins": 6.856441497802734, "rewards/rejected": -256.3758239746094, "step": 48170 }, { "epoch": 2.79, "grad_norm": 58.509765625, "learning_rate": 7.059096714269128e-05, "logits/chosen": -15.876592636108398, "logits/rejected": -16.85613250732422, "logps/chosen": -3125.6455078125, "logps/rejected": -3121.85107421875, "loss": 8.9167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -115.22480773925781, "rewards/margins": -1.036992073059082, "rewards/rejected": -114.18782043457031, "step": 48180 }, { "epoch": 2.79, "grad_norm": 85.54193878173828, "learning_rate": 7.039746120205891e-05, "logits/chosen": -22.377012252807617, "logits/rejected": -21.38148307800293, "logps/chosen": -3062.037841796875, "logps/rejected": -2928.557373046875, "loss": 3.4776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -209.4871368408203, "rewards/margins": 15.951379776000977, "rewards/rejected": -225.4385223388672, "step": 48190 }, { "epoch": 2.79, "grad_norm": 4.369166851043701, "learning_rate": 7.020395526142654e-05, "logits/chosen": -15.58368968963623, "logits/rejected": -15.816751480102539, "logps/chosen": -2930.401611328125, "logps/rejected": -2969.992431640625, "loss": 0.5844, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -128.02220153808594, "rewards/margins": 8.918024063110352, "rewards/rejected": -136.9402313232422, "step": 48200 }, { "epoch": 2.79, "grad_norm": 2.1784917407785542e-05, "learning_rate": 7.001044932079414e-05, "logits/chosen": -15.066760063171387, "logits/rejected": -15.020724296569824, "logps/chosen": -3471.682861328125, "logps/rejected": -3446.26513671875, "loss": 1.7633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -50.98542022705078, "rewards/margins": 7.458979606628418, "rewards/rejected": -58.44440460205078, "step": 48210 }, { "epoch": 2.79, "grad_norm": 17.483871459960938, "learning_rate": 6.981694338016177e-05, "logits/chosen": -18.05026626586914, "logits/rejected": -18.73857307434082, "logps/chosen": -2532.38720703125, "logps/rejected": -2391.429443359375, "loss": 0.9306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.3922882080078, "rewards/margins": 10.153611183166504, "rewards/rejected": -183.54588317871094, "step": 48220 }, { "epoch": 2.79, "grad_norm": 8.45162048790371e-06, "learning_rate": 6.962343743952939e-05, "logits/chosen": -15.089508056640625, "logits/rejected": -14.858189582824707, "logps/chosen": -3054.232666015625, "logps/rejected": -3003.0771484375, "loss": 2.9495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -123.9581298828125, "rewards/margins": 3.3955624103546143, "rewards/rejected": -127.35368347167969, "step": 48230 }, { "epoch": 2.79, "grad_norm": 0.0, "learning_rate": 6.942993149889702e-05, "logits/chosen": -16.551342010498047, "logits/rejected": -16.979372024536133, "logps/chosen": -2708.59130859375, "logps/rejected": -2629.420654296875, "loss": 4.3395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -153.25177001953125, "rewards/margins": 7.4377031326293945, "rewards/rejected": -160.68948364257812, "step": 48240 }, { "epoch": 2.79, "grad_norm": 0.06312623620033264, "learning_rate": 6.923642555826464e-05, "logits/chosen": -18.319965362548828, "logits/rejected": -19.775726318359375, "logps/chosen": -2792.25634765625, "logps/rejected": -2580.85595703125, "loss": 1.0652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -149.85491943359375, "rewards/margins": 13.235150337219238, "rewards/rejected": -163.09007263183594, "step": 48250 }, { "epoch": 2.79, "grad_norm": 18.45267677307129, "learning_rate": 6.904291961763226e-05, "logits/chosen": -17.15229034423828, "logits/rejected": -16.99834442138672, "logps/chosen": -2104.384033203125, "logps/rejected": -1997.703857421875, "loss": 7.3784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.95578002929688, "rewards/margins": -0.884962260723114, "rewards/rejected": -136.07083129882812, "step": 48260 }, { "epoch": 2.79, "grad_norm": 14.47404956817627, "learning_rate": 6.884941367699989e-05, "logits/chosen": -20.183292388916016, "logits/rejected": -22.342485427856445, "logps/chosen": -3318.430908203125, "logps/rejected": -3241.57861328125, "loss": 1.9801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -193.22450256347656, "rewards/margins": 10.439537048339844, "rewards/rejected": -203.6640167236328, "step": 48270 }, { "epoch": 2.79, "grad_norm": 34.43256378173828, "learning_rate": 6.865590773636752e-05, "logits/chosen": -17.818002700805664, "logits/rejected": -17.313913345336914, "logps/chosen": -3152.6171875, "logps/rejected": -2954.454345703125, "loss": 5.0625, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -143.2810516357422, "rewards/margins": -0.3207089304924011, "rewards/rejected": -142.96034240722656, "step": 48280 }, { "epoch": 2.8, "grad_norm": 2.343198537826538, "learning_rate": 6.846240179573512e-05, "logits/chosen": -19.71289825439453, "logits/rejected": -21.856386184692383, "logps/chosen": -3193.366943359375, "logps/rejected": -2968.137939453125, "loss": 0.2675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -208.53189086914062, "rewards/margins": 11.939767837524414, "rewards/rejected": -220.47164916992188, "step": 48290 }, { "epoch": 2.8, "grad_norm": 148.63018798828125, "learning_rate": 6.826889585510275e-05, "logits/chosen": -16.91652488708496, "logits/rejected": -17.39181900024414, "logps/chosen": -2453.584228515625, "logps/rejected": -2285.095947265625, "loss": 31.4752, "rewards/accuracies": 0.5, "rewards/chosen": -188.65980529785156, "rewards/margins": -26.402179718017578, "rewards/rejected": -162.25762939453125, "step": 48300 }, { "epoch": 2.8, "grad_norm": 4.638806283541186e-10, "learning_rate": 6.807538991447038e-05, "logits/chosen": -20.475664138793945, "logits/rejected": -21.16121482849121, "logps/chosen": -3128.40380859375, "logps/rejected": -2928.097412109375, "loss": 1.4188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -172.08755493164062, "rewards/margins": 12.308540344238281, "rewards/rejected": -184.39608764648438, "step": 48310 }, { "epoch": 2.8, "grad_norm": 7.274924551659012e-10, "learning_rate": 6.7881883973838e-05, "logits/chosen": -17.735624313354492, "logits/rejected": -18.738718032836914, "logps/chosen": -3140.15771484375, "logps/rejected": -3260.73095703125, "loss": 4.8953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -204.48696899414062, "rewards/margins": 5.95218563079834, "rewards/rejected": -210.4391632080078, "step": 48320 }, { "epoch": 2.8, "grad_norm": 0.7579005360603333, "learning_rate": 6.768837803320563e-05, "logits/chosen": -16.694171905517578, "logits/rejected": -17.36844253540039, "logps/chosen": -3155.71240234375, "logps/rejected": -2726.31494140625, "loss": 0.0733, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -173.06988525390625, "rewards/margins": 18.332704544067383, "rewards/rejected": -191.40255737304688, "step": 48330 }, { "epoch": 2.8, "grad_norm": 4.753236344833597e-15, "learning_rate": 6.749487209257323e-05, "logits/chosen": -22.717885971069336, "logits/rejected": -25.05998992919922, "logps/chosen": -2779.030029296875, "logps/rejected": -2691.246826171875, "loss": 2.8518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -244.0164794921875, "rewards/margins": 9.481549263000488, "rewards/rejected": -253.498046875, "step": 48340 }, { "epoch": 2.8, "grad_norm": 0.0029734426643699408, "learning_rate": 6.730136615194086e-05, "logits/chosen": -18.31667709350586, "logits/rejected": -18.820423126220703, "logps/chosen": -3058.35986328125, "logps/rejected": -3112.93310546875, "loss": 1.0342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -230.42459106445312, "rewards/margins": 5.989019393920898, "rewards/rejected": -236.41360473632812, "step": 48350 }, { "epoch": 2.8, "grad_norm": 48.42873764038086, "learning_rate": 6.71078602113085e-05, "logits/chosen": -21.227567672729492, "logits/rejected": -22.143310546875, "logps/chosen": -3017.96044921875, "logps/rejected": -2941.21337890625, "loss": 2.8357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -237.76864624023438, "rewards/margins": 5.411890983581543, "rewards/rejected": -243.18057250976562, "step": 48360 }, { "epoch": 2.8, "grad_norm": 3.0033170332899317e-06, "learning_rate": 6.691435427067611e-05, "logits/chosen": -17.48177719116211, "logits/rejected": -18.801586151123047, "logps/chosen": -3072.295654296875, "logps/rejected": -2982.492431640625, "loss": 3.6149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -167.6481170654297, "rewards/margins": 5.5816521644592285, "rewards/rejected": -173.22976684570312, "step": 48370 }, { "epoch": 2.8, "grad_norm": 0.04050377383828163, "learning_rate": 6.672084833004373e-05, "logits/chosen": -15.85804271697998, "logits/rejected": -16.248001098632812, "logps/chosen": -3136.87744140625, "logps/rejected": -2652.853759765625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -163.51234436035156, "rewards/margins": 12.2421236038208, "rewards/rejected": -175.7544403076172, "step": 48380 }, { "epoch": 2.8, "grad_norm": 0.0004206007288303226, "learning_rate": 6.652734238941136e-05, "logits/chosen": -20.96574592590332, "logits/rejected": -21.253002166748047, "logps/chosen": -2738.659912109375, "logps/rejected": -2880.466552734375, "loss": 0.6232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -213.44668579101562, "rewards/margins": 12.834173202514648, "rewards/rejected": -226.28085327148438, "step": 48390 }, { "epoch": 2.8, "grad_norm": 29.47324562072754, "learning_rate": 6.633383644877898e-05, "logits/chosen": -19.253604888916016, "logits/rejected": -19.566553115844727, "logps/chosen": -2827.012939453125, "logps/rejected": -2862.753662109375, "loss": 3.17, "rewards/accuracies": 0.5, "rewards/chosen": -243.0419921875, "rewards/margins": 11.264208793640137, "rewards/rejected": -254.30618286132812, "step": 48400 }, { "epoch": 2.8, "grad_norm": 0.3133944869041443, "learning_rate": 6.614033050814661e-05, "logits/chosen": -17.27492904663086, "logits/rejected": -17.078859329223633, "logps/chosen": -2573.993408203125, "logps/rejected": -2489.389404296875, "loss": 2.7197, "rewards/accuracies": 0.5, "rewards/chosen": -153.01942443847656, "rewards/margins": 1.8290132284164429, "rewards/rejected": -154.84841918945312, "step": 48410 }, { "epoch": 2.8, "grad_norm": 24.274324417114258, "learning_rate": 6.594682456751423e-05, "logits/chosen": -17.37369728088379, "logits/rejected": -17.113218307495117, "logps/chosen": -2978.125, "logps/rejected": -2946.928466796875, "loss": 1.4673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -182.76803588867188, "rewards/margins": 5.292389392852783, "rewards/rejected": -188.0604248046875, "step": 48420 }, { "epoch": 2.8, "grad_norm": 1.9392378330230713, "learning_rate": 6.575331862688184e-05, "logits/chosen": -19.10185432434082, "logits/rejected": -19.606454849243164, "logps/chosen": -2473.003662109375, "logps/rejected": -2445.11962890625, "loss": 0.4047, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -182.7614288330078, "rewards/margins": 12.872714042663574, "rewards/rejected": -195.6341552734375, "step": 48430 }, { "epoch": 2.8, "grad_norm": 53.62517166137695, "learning_rate": 6.555981268624947e-05, "logits/chosen": -19.0064697265625, "logits/rejected": -19.8523006439209, "logps/chosen": -3031.87109375, "logps/rejected": -3192.23876953125, "loss": 2.3419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -184.9074249267578, "rewards/margins": 17.177518844604492, "rewards/rejected": -202.08493041992188, "step": 48440 }, { "epoch": 2.8, "grad_norm": 2.144428014755249, "learning_rate": 6.536630674561709e-05, "logits/chosen": -16.195894241333008, "logits/rejected": -16.892013549804688, "logps/chosen": -3134.68310546875, "logps/rejected": -2946.848388671875, "loss": 0.5562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -141.87413024902344, "rewards/margins": 4.988554954528809, "rewards/rejected": -146.86268615722656, "step": 48450 }, { "epoch": 2.81, "grad_norm": 76.7081527709961, "learning_rate": 6.517280080498471e-05, "logits/chosen": -19.180980682373047, "logits/rejected": -18.993385314941406, "logps/chosen": -2831.48779296875, "logps/rejected": -2914.4736328125, "loss": 3.0186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -212.1805877685547, "rewards/margins": 7.516127109527588, "rewards/rejected": -219.6967315673828, "step": 48460 }, { "epoch": 2.81, "grad_norm": 0.0004936571349389851, "learning_rate": 6.497929486435234e-05, "logits/chosen": -18.657726287841797, "logits/rejected": -18.091360092163086, "logps/chosen": -3086.392333984375, "logps/rejected": -3156.86083984375, "loss": 1.0538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -199.47023010253906, "rewards/margins": 10.810402870178223, "rewards/rejected": -210.2806396484375, "step": 48470 }, { "epoch": 2.81, "grad_norm": 0.00017954569193534553, "learning_rate": 6.478578892371996e-05, "logits/chosen": -17.713375091552734, "logits/rejected": -17.51702117919922, "logps/chosen": -3027.72119140625, "logps/rejected": -3097.60986328125, "loss": 3.8935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -194.6684112548828, "rewards/margins": 5.023980140686035, "rewards/rejected": -199.69239807128906, "step": 48480 }, { "epoch": 2.81, "grad_norm": 1.1673192830130574e-06, "learning_rate": 6.459228298308759e-05, "logits/chosen": -15.652654647827148, "logits/rejected": -15.89612865447998, "logps/chosen": -3222.990234375, "logps/rejected": -3230.253662109375, "loss": 8.8805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -144.6468048095703, "rewards/margins": 1.672258973121643, "rewards/rejected": -146.31906127929688, "step": 48490 }, { "epoch": 2.81, "grad_norm": 2.493814577064768e-07, "learning_rate": 6.439877704245522e-05, "logits/chosen": -16.870187759399414, "logits/rejected": -17.59819984436035, "logps/chosen": -2856.587646484375, "logps/rejected": -2494.421875, "loss": 4.9886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -134.19296264648438, "rewards/margins": 0.28491249680519104, "rewards/rejected": -134.47787475585938, "step": 48500 }, { "epoch": 2.81, "grad_norm": 89.89530944824219, "learning_rate": 6.420527110182282e-05, "logits/chosen": -16.893877029418945, "logits/rejected": -17.32686424255371, "logps/chosen": -2629.10400390625, "logps/rejected": -2678.362548828125, "loss": 5.9094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -159.99923706054688, "rewards/margins": 3.6375136375427246, "rewards/rejected": -163.63674926757812, "step": 48510 }, { "epoch": 2.81, "grad_norm": 59.67757797241211, "learning_rate": 6.401176516119045e-05, "logits/chosen": -17.042387008666992, "logits/rejected": -16.69597053527832, "logps/chosen": -2782.714599609375, "logps/rejected": -2893.697265625, "loss": 5.1027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.14830017089844, "rewards/margins": 2.7811856269836426, "rewards/rejected": -147.92950439453125, "step": 48520 }, { "epoch": 2.81, "grad_norm": 2.4740192890167236, "learning_rate": 6.381825922055807e-05, "logits/chosen": -17.43537712097168, "logits/rejected": -17.562679290771484, "logps/chosen": -2418.629150390625, "logps/rejected": -2488.33544921875, "loss": 1.0374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -179.4126434326172, "rewards/margins": 6.313445091247559, "rewards/rejected": -185.72608947753906, "step": 48530 }, { "epoch": 2.81, "grad_norm": 7.867848039744274e-11, "learning_rate": 6.36247532799257e-05, "logits/chosen": -19.162887573242188, "logits/rejected": -19.64031410217285, "logps/chosen": -2712.859130859375, "logps/rejected": -2448.05322265625, "loss": 13.2538, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -207.3308868408203, "rewards/margins": -8.991659164428711, "rewards/rejected": -198.3392333984375, "step": 48540 }, { "epoch": 2.81, "grad_norm": 5.367027551983483e-05, "learning_rate": 6.343124733929332e-05, "logits/chosen": -14.617860794067383, "logits/rejected": -14.597966194152832, "logps/chosen": -2948.12060546875, "logps/rejected": -2785.697998046875, "loss": 4.1791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -126.88981628417969, "rewards/margins": 2.603203535079956, "rewards/rejected": -129.49302673339844, "step": 48550 }, { "epoch": 2.81, "grad_norm": 159.66075134277344, "learning_rate": 6.323774139866093e-05, "logits/chosen": -15.113168716430664, "logits/rejected": -14.926490783691406, "logps/chosen": -2688.2275390625, "logps/rejected": -2777.34716796875, "loss": 3.8868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -168.9417266845703, "rewards/margins": 9.860407829284668, "rewards/rejected": -178.80213928222656, "step": 48560 }, { "epoch": 2.81, "grad_norm": 0.09027548134326935, "learning_rate": 6.304423545802856e-05, "logits/chosen": -20.13546371459961, "logits/rejected": -20.01386070251465, "logps/chosen": -2487.14111328125, "logps/rejected": -2490.4609375, "loss": 1.1899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -218.32211303710938, "rewards/margins": 2.4049148559570312, "rewards/rejected": -220.72702026367188, "step": 48570 }, { "epoch": 2.81, "grad_norm": 0.00015832883946131915, "learning_rate": 6.28507295173962e-05, "logits/chosen": -17.20782470703125, "logits/rejected": -17.576095581054688, "logps/chosen": -2997.198486328125, "logps/rejected": -2891.790283203125, "loss": 1.6362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -162.47976684570312, "rewards/margins": 8.335834503173828, "rewards/rejected": -170.8155975341797, "step": 48580 }, { "epoch": 2.81, "grad_norm": 1.9258077144622803, "learning_rate": 6.26572235767638e-05, "logits/chosen": -16.050514221191406, "logits/rejected": -16.353668212890625, "logps/chosen": -3072.71728515625, "logps/rejected": -2948.11181640625, "loss": 8.768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -150.5615234375, "rewards/margins": 8.870964050292969, "rewards/rejected": -159.43251037597656, "step": 48590 }, { "epoch": 2.81, "grad_norm": 5.10893440246582, "learning_rate": 6.246371763613143e-05, "logits/chosen": -17.90831756591797, "logits/rejected": -18.624860763549805, "logps/chosen": -3003.818115234375, "logps/rejected": -3068.240966796875, "loss": 3.8403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -142.42068481445312, "rewards/margins": 13.19005012512207, "rewards/rejected": -155.61074829101562, "step": 48600 }, { "epoch": 2.81, "grad_norm": 0.0018301517702639103, "learning_rate": 6.227021169549906e-05, "logits/chosen": -17.428722381591797, "logits/rejected": -16.95712661743164, "logps/chosen": -2888.544677734375, "logps/rejected": -2620.3291015625, "loss": 3.7001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -144.20843505859375, "rewards/margins": 5.468947410583496, "rewards/rejected": -149.67739868164062, "step": 48610 }, { "epoch": 2.81, "grad_norm": 1.6456177531765093e-09, "learning_rate": 6.207670575486668e-05, "logits/chosen": -11.682914733886719, "logits/rejected": -11.673070907592773, "logps/chosen": -3292.83056640625, "logps/rejected": -3136.74072265625, "loss": 6.2801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.6451301574707, "rewards/margins": 6.14633321762085, "rewards/rejected": -42.79146194458008, "step": 48620 }, { "epoch": 2.81, "grad_norm": 126.49392700195312, "learning_rate": 6.18831998142343e-05, "logits/chosen": -16.922929763793945, "logits/rejected": -16.867568969726562, "logps/chosen": -3073.51318359375, "logps/rejected": -3022.648681640625, "loss": 10.3152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -163.34268188476562, "rewards/margins": -4.961136817932129, "rewards/rejected": -158.38153076171875, "step": 48630 }, { "epoch": 2.82, "grad_norm": 0.015028323046863079, "learning_rate": 6.168969387360191e-05, "logits/chosen": -21.238737106323242, "logits/rejected": -23.298702239990234, "logps/chosen": -2900.594482421875, "logps/rejected": -2876.287841796875, "loss": 2.04, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -222.85513305664062, "rewards/margins": 4.366446495056152, "rewards/rejected": -227.22158813476562, "step": 48640 }, { "epoch": 2.82, "grad_norm": 3.8611972332000732, "learning_rate": 6.149618793296954e-05, "logits/chosen": -17.1417293548584, "logits/rejected": -16.107955932617188, "logps/chosen": -3265.8525390625, "logps/rejected": -3151.121826171875, "loss": 5.2221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.6172332763672, "rewards/margins": 2.4883718490600586, "rewards/rejected": -176.10562133789062, "step": 48650 }, { "epoch": 2.82, "grad_norm": 0.007765426300466061, "learning_rate": 6.130268199233716e-05, "logits/chosen": -15.301691055297852, "logits/rejected": -15.0969820022583, "logps/chosen": -2869.68212890625, "logps/rejected": -2977.10302734375, "loss": 3.6009, "rewards/accuracies": 0.5, "rewards/chosen": -105.8761215209961, "rewards/margins": 0.26014184951782227, "rewards/rejected": -106.13626861572266, "step": 48660 }, { "epoch": 2.82, "grad_norm": 105.82939147949219, "learning_rate": 6.110917605170479e-05, "logits/chosen": -14.9094820022583, "logits/rejected": -14.722513198852539, "logps/chosen": -3271.83544921875, "logps/rejected": -3434.14013671875, "loss": 3.9716, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -125.33882904052734, "rewards/margins": -1.10794997215271, "rewards/rejected": -124.23088073730469, "step": 48670 }, { "epoch": 2.82, "grad_norm": 5.034924030303955, "learning_rate": 6.0915670111072414e-05, "logits/chosen": -19.154827117919922, "logits/rejected": -22.593297958374023, "logps/chosen": -3042.773681640625, "logps/rejected": -3052.947265625, "loss": 5.7385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -238.496826171875, "rewards/margins": 7.355599880218506, "rewards/rejected": -245.8524627685547, "step": 48680 }, { "epoch": 2.82, "grad_norm": 9.661774674896151e-05, "learning_rate": 6.072216417044003e-05, "logits/chosen": -19.56979751586914, "logits/rejected": -20.308734893798828, "logps/chosen": -2699.5703125, "logps/rejected": -2815.68505859375, "loss": 1.284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -168.73529052734375, "rewards/margins": 15.248684883117676, "rewards/rejected": -183.98397827148438, "step": 48690 }, { "epoch": 2.82, "grad_norm": 10.556634902954102, "learning_rate": 6.052865822980766e-05, "logits/chosen": -18.176725387573242, "logits/rejected": -17.95201301574707, "logps/chosen": -2686.4921875, "logps/rejected": -2661.394775390625, "loss": 3.7125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -197.58692932128906, "rewards/margins": 1.8993438482284546, "rewards/rejected": -199.48629760742188, "step": 48700 }, { "epoch": 2.82, "grad_norm": 6.538034915924072, "learning_rate": 6.033515228917528e-05, "logits/chosen": -18.00235366821289, "logits/rejected": -18.430986404418945, "logps/chosen": -2895.22998046875, "logps/rejected": -2542.151611328125, "loss": 7.0696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -218.55654907226562, "rewards/margins": -1.026460886001587, "rewards/rejected": -217.53005981445312, "step": 48710 }, { "epoch": 2.82, "grad_norm": 3.381118059158325, "learning_rate": 6.0141646348542904e-05, "logits/chosen": -17.986392974853516, "logits/rejected": -18.759002685546875, "logps/chosen": -3169.667236328125, "logps/rejected": -2799.566162109375, "loss": 0.7327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -166.51705932617188, "rewards/margins": 9.97880744934082, "rewards/rejected": -176.495849609375, "step": 48720 }, { "epoch": 2.82, "grad_norm": 76.76714324951172, "learning_rate": 5.994814040791052e-05, "logits/chosen": -17.103073120117188, "logits/rejected": -17.124826431274414, "logps/chosen": -2946.396728515625, "logps/rejected": -3031.042724609375, "loss": 1.6061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -150.39378356933594, "rewards/margins": 7.561924934387207, "rewards/rejected": -157.95571899414062, "step": 48730 }, { "epoch": 2.82, "grad_norm": 7.328142004325855e-08, "learning_rate": 5.975463446727815e-05, "logits/chosen": -20.48509407043457, "logits/rejected": -20.415630340576172, "logps/chosen": -2796.42041015625, "logps/rejected": -2839.16552734375, "loss": 0.6508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -245.8004608154297, "rewards/margins": 5.942451477050781, "rewards/rejected": -251.74288940429688, "step": 48740 }, { "epoch": 2.82, "grad_norm": 0.0, "learning_rate": 5.956112852664577e-05, "logits/chosen": -16.68399429321289, "logits/rejected": -17.117109298706055, "logps/chosen": -3201.251220703125, "logps/rejected": -2936.625, "loss": 2.0636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.40164184570312, "rewards/margins": 14.28007984161377, "rewards/rejected": -218.68173217773438, "step": 48750 }, { "epoch": 2.82, "grad_norm": 126.84013366699219, "learning_rate": 5.936762258601339e-05, "logits/chosen": -15.049230575561523, "logits/rejected": -16.308162689208984, "logps/chosen": -3136.91162109375, "logps/rejected": -3003.8525390625, "loss": 4.858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -131.93106079101562, "rewards/margins": -0.7690336108207703, "rewards/rejected": -131.16204833984375, "step": 48760 }, { "epoch": 2.82, "grad_norm": 5.0952261517522857e-05, "learning_rate": 5.917411664538101e-05, "logits/chosen": -20.88322639465332, "logits/rejected": -21.853803634643555, "logps/chosen": -2881.00146484375, "logps/rejected": -2797.14697265625, "loss": 5.7076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -217.2475128173828, "rewards/margins": 6.329467296600342, "rewards/rejected": -223.57699584960938, "step": 48770 }, { "epoch": 2.82, "grad_norm": 3.581248790851532e-07, "learning_rate": 5.898061070474864e-05, "logits/chosen": -13.313840866088867, "logits/rejected": -13.326423645019531, "logps/chosen": -3417.280517578125, "logps/rejected": -3263.67041015625, "loss": 1.7696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -58.6412353515625, "rewards/margins": 12.540127754211426, "rewards/rejected": -71.18136596679688, "step": 48780 }, { "epoch": 2.82, "grad_norm": 39.261783599853516, "learning_rate": 5.878710476411626e-05, "logits/chosen": -18.19144630432129, "logits/rejected": -18.071760177612305, "logps/chosen": -3186.82666015625, "logps/rejected": -3012.642822265625, "loss": 0.3266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -139.43772888183594, "rewards/margins": 9.04317855834961, "rewards/rejected": -148.48092651367188, "step": 48790 }, { "epoch": 2.82, "grad_norm": 4.043242454528809, "learning_rate": 5.859359882348388e-05, "logits/chosen": -15.739047050476074, "logits/rejected": -15.961587905883789, "logps/chosen": -3195.961669921875, "logps/rejected": -3231.345458984375, "loss": 8.5259, "rewards/accuracies": 0.5, "rewards/chosen": -190.88265991210938, "rewards/margins": -2.5704216957092285, "rewards/rejected": -188.31222534179688, "step": 48800 }, { "epoch": 2.83, "grad_norm": 0.0005042462144047022, "learning_rate": 5.84000928828515e-05, "logits/chosen": -18.736997604370117, "logits/rejected": -19.077699661254883, "logps/chosen": -3109.6708984375, "logps/rejected": -2700.538330078125, "loss": 1.4965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -200.4980010986328, "rewards/margins": 6.27227258682251, "rewards/rejected": -206.77029418945312, "step": 48810 }, { "epoch": 2.83, "grad_norm": 0.844921350479126, "learning_rate": 5.820658694221913e-05, "logits/chosen": -15.833836555480957, "logits/rejected": -17.020259857177734, "logps/chosen": -2740.802978515625, "logps/rejected": -2612.4150390625, "loss": 2.4189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -154.74417114257812, "rewards/margins": 9.847646713256836, "rewards/rejected": -164.59182739257812, "step": 48820 }, { "epoch": 2.83, "grad_norm": 3.305508045059469e-08, "learning_rate": 5.801308100158675e-05, "logits/chosen": -18.872299194335938, "logits/rejected": -19.118717193603516, "logps/chosen": -2513.6328125, "logps/rejected": -2343.54150390625, "loss": 1.1674, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -186.03317260742188, "rewards/margins": 6.375016689300537, "rewards/rejected": -192.40818786621094, "step": 48830 }, { "epoch": 2.83, "grad_norm": 1.091198782887659e-06, "learning_rate": 5.781957506095437e-05, "logits/chosen": -15.731961250305176, "logits/rejected": -16.112497329711914, "logps/chosen": -3207.112548828125, "logps/rejected": -3173.975341796875, "loss": 0.944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -131.05653381347656, "rewards/margins": 6.34126091003418, "rewards/rejected": -137.3977813720703, "step": 48840 }, { "epoch": 2.83, "grad_norm": 3.680634616531897e-11, "learning_rate": 5.7626069120321995e-05, "logits/chosen": -18.659564971923828, "logits/rejected": -19.327835083007812, "logps/chosen": -2900.048828125, "logps/rejected": -2877.64404296875, "loss": 5.8185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -220.956787109375, "rewards/margins": 2.135911464691162, "rewards/rejected": -223.0926971435547, "step": 48850 }, { "epoch": 2.83, "grad_norm": 66.39334106445312, "learning_rate": 5.743256317968962e-05, "logits/chosen": -22.427797317504883, "logits/rejected": -22.31627655029297, "logps/chosen": -2527.86181640625, "logps/rejected": -2401.72900390625, "loss": 7.9796, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -205.78085327148438, "rewards/margins": -1.0985603332519531, "rewards/rejected": -204.68231201171875, "step": 48860 }, { "epoch": 2.83, "grad_norm": 0.0017741917399689555, "learning_rate": 5.723905723905724e-05, "logits/chosen": -18.336872100830078, "logits/rejected": -17.50082015991211, "logps/chosen": -2917.326416015625, "logps/rejected": -2823.559814453125, "loss": 2.8165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -145.7981719970703, "rewards/margins": 17.09730339050293, "rewards/rejected": -162.89547729492188, "step": 48870 }, { "epoch": 2.83, "grad_norm": 0.011126991361379623, "learning_rate": 5.704555129842486e-05, "logits/chosen": -16.53827667236328, "logits/rejected": -16.527833938598633, "logps/chosen": -2998.9853515625, "logps/rejected": -2909.63623046875, "loss": 0.5136, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -126.04945373535156, "rewards/margins": 7.739877223968506, "rewards/rejected": -133.78933715820312, "step": 48880 }, { "epoch": 2.83, "grad_norm": 69.60016632080078, "learning_rate": 5.685204535779249e-05, "logits/chosen": -16.527969360351562, "logits/rejected": -16.51687240600586, "logps/chosen": -3358.66748046875, "logps/rejected": -3219.778564453125, "loss": 4.906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -161.5110321044922, "rewards/margins": 0.7283821105957031, "rewards/rejected": -162.2394256591797, "step": 48890 }, { "epoch": 2.83, "grad_norm": 92.35616302490234, "learning_rate": 5.665853941716011e-05, "logits/chosen": -17.304983139038086, "logits/rejected": -18.438810348510742, "logps/chosen": -2992.343017578125, "logps/rejected": -3014.725341796875, "loss": 7.735, "rewards/accuracies": 0.5, "rewards/chosen": -197.4613037109375, "rewards/margins": -3.3777847290039062, "rewards/rejected": -194.08352661132812, "step": 48900 }, { "epoch": 2.83, "grad_norm": 0.21249236166477203, "learning_rate": 5.646503347652773e-05, "logits/chosen": -17.3018741607666, "logits/rejected": -17.50550651550293, "logps/chosen": -2970.744384765625, "logps/rejected": -2989.19189453125, "loss": 4.5995, "rewards/accuracies": 0.5, "rewards/chosen": -167.28225708007812, "rewards/margins": -2.2296605110168457, "rewards/rejected": -165.05259704589844, "step": 48910 }, { "epoch": 2.83, "grad_norm": 6.20636399162322e-07, "learning_rate": 5.627152753589535e-05, "logits/chosen": -16.57192611694336, "logits/rejected": -16.367870330810547, "logps/chosen": -3211.42822265625, "logps/rejected": -3273.3759765625, "loss": 0.2756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -96.32610321044922, "rewards/margins": 17.451278686523438, "rewards/rejected": -113.77738189697266, "step": 48920 }, { "epoch": 2.83, "grad_norm": 0.003774053882807493, "learning_rate": 5.607802159526298e-05, "logits/chosen": -17.510374069213867, "logits/rejected": -17.61280632019043, "logps/chosen": -3031.825927734375, "logps/rejected": -2930.81640625, "loss": 3.3514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -187.40084838867188, "rewards/margins": 3.505077838897705, "rewards/rejected": -190.90589904785156, "step": 48930 }, { "epoch": 2.83, "grad_norm": 11.847578048706055, "learning_rate": 5.58845156546306e-05, "logits/chosen": -17.290699005126953, "logits/rejected": -16.891361236572266, "logps/chosen": -3059.269775390625, "logps/rejected": -3096.23046875, "loss": 7.0423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.39080810546875, "rewards/margins": -1.5919945240020752, "rewards/rejected": -159.79879760742188, "step": 48940 }, { "epoch": 2.83, "grad_norm": 0.008451846428215504, "learning_rate": 5.569100971399822e-05, "logits/chosen": -17.747648239135742, "logits/rejected": -18.59465980529785, "logps/chosen": -3045.75830078125, "logps/rejected": -2974.30224609375, "loss": 2.632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -225.3655242919922, "rewards/margins": 6.564537048339844, "rewards/rejected": -231.9300537109375, "step": 48950 }, { "epoch": 2.83, "grad_norm": 67.15776062011719, "learning_rate": 5.5497503773365845e-05, "logits/chosen": -18.197141647338867, "logits/rejected": -18.098487854003906, "logps/chosen": -2789.058349609375, "logps/rejected": -2825.7841796875, "loss": 1.1208, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -179.0000762939453, "rewards/margins": 12.634839057922363, "rewards/rejected": -191.63491821289062, "step": 48960 }, { "epoch": 2.83, "grad_norm": 0.1863774061203003, "learning_rate": 5.530399783273347e-05, "logits/chosen": -17.09697723388672, "logits/rejected": -17.594438552856445, "logps/chosen": -2849.757080078125, "logps/rejected": -2776.890625, "loss": 3.0404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -130.40721130371094, "rewards/margins": 7.7986602783203125, "rewards/rejected": -138.20587158203125, "step": 48970 }, { "epoch": 2.84, "grad_norm": 0.09972264617681503, "learning_rate": 5.5110491892101086e-05, "logits/chosen": -21.409772872924805, "logits/rejected": -21.857723236083984, "logps/chosen": -2872.586669921875, "logps/rejected": -2894.55419921875, "loss": 3.3438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -225.06143188476562, "rewards/margins": 6.047770023345947, "rewards/rejected": -231.1091766357422, "step": 48980 }, { "epoch": 2.84, "grad_norm": 0.0034204265102744102, "learning_rate": 5.491698595146871e-05, "logits/chosen": -16.83930778503418, "logits/rejected": -18.194427490234375, "logps/chosen": -3094.04345703125, "logps/rejected": -2917.438232421875, "loss": 1.3118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -167.86573791503906, "rewards/margins": 11.57716178894043, "rewards/rejected": -179.44287109375, "step": 48990 }, { "epoch": 2.84, "grad_norm": 46.31658172607422, "learning_rate": 5.4723480010836334e-05, "logits/chosen": -20.089550018310547, "logits/rejected": -22.00678825378418, "logps/chosen": -3033.929931640625, "logps/rejected": -2844.40771484375, "loss": 2.025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.7066192626953, "rewards/margins": 2.8476250171661377, "rewards/rejected": -212.5542449951172, "step": 49000 }, { "epoch": 2.84, "grad_norm": 1.3057968595298064e-10, "learning_rate": 5.452997407020396e-05, "logits/chosen": -15.863210678100586, "logits/rejected": -17.27216911315918, "logps/chosen": -3219.134033203125, "logps/rejected": -3016.671142578125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -120.33757019042969, "rewards/margins": 12.055839538574219, "rewards/rejected": -132.39340209960938, "step": 49010 }, { "epoch": 2.84, "grad_norm": 97.89068603515625, "learning_rate": 5.4336468129571575e-05, "logits/chosen": -17.08521842956543, "logits/rejected": -16.136917114257812, "logps/chosen": -3224.729248046875, "logps/rejected": -3046.44091796875, "loss": 1.4002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -165.94093322753906, "rewards/margins": 6.28167724609375, "rewards/rejected": -172.22262573242188, "step": 49020 }, { "epoch": 2.84, "grad_norm": 1.844248652458191, "learning_rate": 5.41429621889392e-05, "logits/chosen": -18.992061614990234, "logits/rejected": -18.608684539794922, "logps/chosen": -2862.34912109375, "logps/rejected": -2899.17236328125, "loss": 6.0721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -182.0621795654297, "rewards/margins": 0.9757480621337891, "rewards/rejected": -183.03793334960938, "step": 49030 }, { "epoch": 2.84, "grad_norm": 121.59564208984375, "learning_rate": 5.394945624830683e-05, "logits/chosen": -17.754352569580078, "logits/rejected": -18.286327362060547, "logps/chosen": -2678.81787109375, "logps/rejected": -3033.303466796875, "loss": 3.5016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -161.1951446533203, "rewards/margins": 4.162961959838867, "rewards/rejected": -165.35809326171875, "step": 49040 }, { "epoch": 2.84, "grad_norm": 6.240711212158203, "learning_rate": 5.375595030767445e-05, "logits/chosen": -16.437503814697266, "logits/rejected": -16.731536865234375, "logps/chosen": -2745.426025390625, "logps/rejected": -2662.997802734375, "loss": 4.4283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -129.78250122070312, "rewards/margins": 2.4742016792297363, "rewards/rejected": -132.25669860839844, "step": 49050 }, { "epoch": 2.84, "grad_norm": 22.08407974243164, "learning_rate": 5.356244436704207e-05, "logits/chosen": -19.986196517944336, "logits/rejected": -21.441051483154297, "logps/chosen": -3034.553955078125, "logps/rejected": -2989.790283203125, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": -198.76467895507812, "rewards/margins": 10.413739204406738, "rewards/rejected": -209.1784210205078, "step": 49060 }, { "epoch": 2.84, "grad_norm": 0.0005160558503121138, "learning_rate": 5.336893842640969e-05, "logits/chosen": -15.77757740020752, "logits/rejected": -15.820780754089355, "logps/chosen": -3302.346923828125, "logps/rejected": -3058.51220703125, "loss": 3.3143, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -148.4834747314453, "rewards/margins": 5.9600114822387695, "rewards/rejected": -154.44346618652344, "step": 49070 }, { "epoch": 2.84, "grad_norm": 66.14530944824219, "learning_rate": 5.317543248577732e-05, "logits/chosen": -17.115089416503906, "logits/rejected": -17.30162239074707, "logps/chosen": -3023.35107421875, "logps/rejected": -2825.445068359375, "loss": 1.6176, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -186.3387908935547, "rewards/margins": 19.735637664794922, "rewards/rejected": -206.07443237304688, "step": 49080 }, { "epoch": 2.84, "grad_norm": 0.06045347824692726, "learning_rate": 5.2981926545144936e-05, "logits/chosen": -17.390745162963867, "logits/rejected": -18.12514305114746, "logps/chosen": -2480.99365234375, "logps/rejected": -2433.830078125, "loss": 3.4634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -179.00515747070312, "rewards/margins": 5.010926246643066, "rewards/rejected": -184.01608276367188, "step": 49090 }, { "epoch": 2.84, "grad_norm": 3.7139854993512023e-13, "learning_rate": 5.278842060451256e-05, "logits/chosen": -18.872623443603516, "logits/rejected": -18.788713455200195, "logps/chosen": -2366.550537109375, "logps/rejected": -2405.783447265625, "loss": 2.3696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -159.09190368652344, "rewards/margins": 9.686968803405762, "rewards/rejected": -168.77890014648438, "step": 49100 }, { "epoch": 2.84, "grad_norm": 5.234485067617811e-10, "learning_rate": 5.2594914663880184e-05, "logits/chosen": -17.000774383544922, "logits/rejected": -18.08640480041504, "logps/chosen": -2906.552001953125, "logps/rejected": -2535.766845703125, "loss": 6.8284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -151.45249938964844, "rewards/margins": 5.647196292877197, "rewards/rejected": -157.0996856689453, "step": 49110 }, { "epoch": 2.84, "grad_norm": 0.012980354018509388, "learning_rate": 5.240140872324781e-05, "logits/chosen": -18.21730613708496, "logits/rejected": -17.74897575378418, "logps/chosen": -2509.3759765625, "logps/rejected": -2427.49462890625, "loss": 3.6997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.7595672607422, "rewards/margins": 9.196310043334961, "rewards/rejected": -160.95587158203125, "step": 49120 }, { "epoch": 2.84, "grad_norm": 6.848291397094727, "learning_rate": 5.2207902782615425e-05, "logits/chosen": -21.988168716430664, "logits/rejected": -22.309663772583008, "logps/chosen": -2724.404541015625, "logps/rejected": -2913.68310546875, "loss": 1.2967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -204.173095703125, "rewards/margins": 17.82186508178711, "rewards/rejected": -221.99496459960938, "step": 49130 }, { "epoch": 2.84, "grad_norm": 0.000580020307097584, "learning_rate": 5.201439684198305e-05, "logits/chosen": -14.675682067871094, "logits/rejected": -15.618192672729492, "logps/chosen": -3370.535888671875, "logps/rejected": -3310.89501953125, "loss": 1.9646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -148.78396606445312, "rewards/margins": 11.960260391235352, "rewards/rejected": -160.74424743652344, "step": 49140 }, { "epoch": 2.84, "grad_norm": 70.38072967529297, "learning_rate": 5.182089090135067e-05, "logits/chosen": -17.721797943115234, "logits/rejected": -18.264341354370117, "logps/chosen": -2704.78466796875, "logps/rejected": -2623.3359375, "loss": 14.0411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.8657684326172, "rewards/margins": -7.442407131195068, "rewards/rejected": -140.42335510253906, "step": 49150 }, { "epoch": 2.85, "grad_norm": 33.4881477355957, "learning_rate": 5.16273849607183e-05, "logits/chosen": -15.585497856140137, "logits/rejected": -17.363189697265625, "logps/chosen": -2881.37158203125, "logps/rejected": -3007.98974609375, "loss": 5.4783, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -189.6788330078125, "rewards/margins": 8.083940505981445, "rewards/rejected": -197.7627716064453, "step": 49160 }, { "epoch": 2.85, "grad_norm": 0.6545652747154236, "learning_rate": 5.1433879020085914e-05, "logits/chosen": -15.743009567260742, "logits/rejected": -15.283856391906738, "logps/chosen": -2841.883544921875, "logps/rejected": -2665.76953125, "loss": 1.0643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -164.69580078125, "rewards/margins": 10.02000904083252, "rewards/rejected": -174.7158203125, "step": 49170 }, { "epoch": 2.85, "grad_norm": 114.08136749267578, "learning_rate": 5.124037307945354e-05, "logits/chosen": -20.389976501464844, "logits/rejected": -22.508758544921875, "logps/chosen": -2921.044677734375, "logps/rejected": -2812.02001953125, "loss": 9.8561, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -255.724365234375, "rewards/margins": -3.617466688156128, "rewards/rejected": -252.10690307617188, "step": 49180 }, { "epoch": 2.85, "grad_norm": 43.529388427734375, "learning_rate": 5.104686713882116e-05, "logits/chosen": -15.033784866333008, "logits/rejected": -15.39598274230957, "logps/chosen": -2828.509521484375, "logps/rejected": -2449.14306640625, "loss": 2.9059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -157.61805725097656, "rewards/margins": 5.570340156555176, "rewards/rejected": -163.18838500976562, "step": 49190 }, { "epoch": 2.85, "grad_norm": 7.4209859391115445e-22, "learning_rate": 5.0853361198188786e-05, "logits/chosen": -13.734173774719238, "logits/rejected": -14.123071670532227, "logps/chosen": -3385.26513671875, "logps/rejected": -3525.712890625, "loss": 0.3248, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -134.68191528320312, "rewards/margins": 18.55550765991211, "rewards/rejected": -153.23741149902344, "step": 49200 }, { "epoch": 2.85, "grad_norm": 23.08608627319336, "learning_rate": 5.065985525755641e-05, "logits/chosen": -17.228527069091797, "logits/rejected": -17.36081314086914, "logps/chosen": -3064.360595703125, "logps/rejected": -2991.56103515625, "loss": 1.1636, "rewards/accuracies": 0.5, "rewards/chosen": -200.4495849609375, "rewards/margins": 10.785711288452148, "rewards/rejected": -211.2352752685547, "step": 49210 }, { "epoch": 2.85, "grad_norm": 197.10292053222656, "learning_rate": 5.046634931692403e-05, "logits/chosen": -14.155607223510742, "logits/rejected": -13.688232421875, "logps/chosen": -3103.77001953125, "logps/rejected": -2562.267822265625, "loss": 4.4163, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -102.95511627197266, "rewards/margins": 1.9210188388824463, "rewards/rejected": -104.8761215209961, "step": 49220 }, { "epoch": 2.85, "grad_norm": 0.0014635126572102308, "learning_rate": 5.027284337629166e-05, "logits/chosen": -17.747957229614258, "logits/rejected": -18.168821334838867, "logps/chosen": -2891.96728515625, "logps/rejected": -2823.274169921875, "loss": 0.6441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -180.1840362548828, "rewards/margins": 10.11230754852295, "rewards/rejected": -190.29635620117188, "step": 49230 }, { "epoch": 2.85, "grad_norm": 2.052682024356045e-09, "learning_rate": 5.0079337435659275e-05, "logits/chosen": -18.12303924560547, "logits/rejected": -18.6641845703125, "logps/chosen": -3033.86669921875, "logps/rejected": -2931.137451171875, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": -191.84326171875, "rewards/margins": 14.333868026733398, "rewards/rejected": -206.17715454101562, "step": 49240 }, { "epoch": 2.85, "grad_norm": 96.07440185546875, "learning_rate": 4.98858314950269e-05, "logits/chosen": -15.068603515625, "logits/rejected": -15.2315034866333, "logps/chosen": -2921.203857421875, "logps/rejected": -2881.73193359375, "loss": 7.516, "rewards/accuracies": 0.5, "rewards/chosen": -141.17617797851562, "rewards/margins": -0.028638649731874466, "rewards/rejected": -141.14755249023438, "step": 49250 }, { "epoch": 2.85, "grad_norm": 0.202528715133667, "learning_rate": 4.969232555439452e-05, "logits/chosen": -16.97128677368164, "logits/rejected": -16.730701446533203, "logps/chosen": -3321.72509765625, "logps/rejected": -3107.78466796875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -144.01107788085938, "rewards/margins": 15.891616821289062, "rewards/rejected": -159.9027099609375, "step": 49260 }, { "epoch": 2.85, "grad_norm": 36.1680793762207, "learning_rate": 4.949881961376215e-05, "logits/chosen": -18.205053329467773, "logits/rejected": -19.1476993560791, "logps/chosen": -3143.593505859375, "logps/rejected": -2735.55810546875, "loss": 6.0283, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -174.54534912109375, "rewards/margins": 2.536595106124878, "rewards/rejected": -177.08193969726562, "step": 49270 }, { "epoch": 2.85, "grad_norm": 0.006227921694517136, "learning_rate": 4.9305313673129764e-05, "logits/chosen": -18.174922943115234, "logits/rejected": -19.069822311401367, "logps/chosen": -2880.163330078125, "logps/rejected": -2911.98974609375, "loss": 6.9503, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -193.373291015625, "rewards/margins": -5.211788654327393, "rewards/rejected": -188.16151428222656, "step": 49280 }, { "epoch": 2.85, "grad_norm": 5.429681914392859e-05, "learning_rate": 4.911180773249739e-05, "logits/chosen": -18.225187301635742, "logits/rejected": -19.514535903930664, "logps/chosen": -2649.65966796875, "logps/rejected": -2634.068359375, "loss": 3.6181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -193.4840087890625, "rewards/margins": 6.853758811950684, "rewards/rejected": -200.33778381347656, "step": 49290 }, { "epoch": 2.85, "grad_norm": 7.166964530944824, "learning_rate": 4.891830179186501e-05, "logits/chosen": -14.685206413269043, "logits/rejected": -15.94696044921875, "logps/chosen": -3370.49951171875, "logps/rejected": -3289.865966796875, "loss": 5.2272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -184.55047607421875, "rewards/margins": 6.819052219390869, "rewards/rejected": -191.36953735351562, "step": 49300 }, { "epoch": 2.85, "grad_norm": 4.7053243246164556e-11, "learning_rate": 4.8724795851232636e-05, "logits/chosen": -16.7941837310791, "logits/rejected": -18.711902618408203, "logps/chosen": -2685.2724609375, "logps/rejected": -2969.85693359375, "loss": 4.4046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -186.76943969726562, "rewards/margins": 4.632574558258057, "rewards/rejected": -191.40200805664062, "step": 49310 }, { "epoch": 2.85, "grad_norm": 1.5075524970598053e-06, "learning_rate": 4.853128991060025e-05, "logits/chosen": -15.618841171264648, "logits/rejected": -15.4412841796875, "logps/chosen": -3245.777587890625, "logps/rejected": -3100.088623046875, "loss": 18.8166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -80.81488037109375, "rewards/margins": -9.576251029968262, "rewards/rejected": -71.23863220214844, "step": 49320 }, { "epoch": 2.86, "grad_norm": 61.83898162841797, "learning_rate": 4.833778396996788e-05, "logits/chosen": -21.172046661376953, "logits/rejected": -22.928733825683594, "logps/chosen": -2844.5263671875, "logps/rejected": -2800.36865234375, "loss": 0.1857, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -216.2405548095703, "rewards/margins": 8.601275444030762, "rewards/rejected": -224.8418426513672, "step": 49330 }, { "epoch": 2.86, "grad_norm": 8.205555701579215e-08, "learning_rate": 4.81442780293355e-05, "logits/chosen": -16.36615562438965, "logits/rejected": -16.49337387084961, "logps/chosen": -2505.66162109375, "logps/rejected": -2283.59228515625, "loss": 0.7403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.91648864746094, "rewards/margins": 17.289081573486328, "rewards/rejected": -188.20558166503906, "step": 49340 }, { "epoch": 2.86, "grad_norm": 0.03680606931447983, "learning_rate": 4.7950772088703125e-05, "logits/chosen": -15.484959602355957, "logits/rejected": -18.204036712646484, "logps/chosen": -3320.723388671875, "logps/rejected": -2783.797607421875, "loss": 24.26, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.52813720703125, "rewards/margins": -16.793781280517578, "rewards/rejected": -124.7343521118164, "step": 49350 }, { "epoch": 2.86, "grad_norm": 1.1307428394502494e-05, "learning_rate": 4.775726614807074e-05, "logits/chosen": -16.39040756225586, "logits/rejected": -16.64411735534668, "logps/chosen": -2845.0810546875, "logps/rejected": -2796.028564453125, "loss": 2.2087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -201.71568298339844, "rewards/margins": 5.155546188354492, "rewards/rejected": -206.87124633789062, "step": 49360 }, { "epoch": 2.86, "grad_norm": 9.876660378438373e-09, "learning_rate": 4.756376020743837e-05, "logits/chosen": -14.993295669555664, "logits/rejected": -15.014264106750488, "logps/chosen": -3014.38720703125, "logps/rejected": -2560.949462890625, "loss": 0.9957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -132.14334106445312, "rewards/margins": 10.617432594299316, "rewards/rejected": -142.76077270507812, "step": 49370 }, { "epoch": 2.86, "grad_norm": 13.899585723876953, "learning_rate": 4.737025426680599e-05, "logits/chosen": -16.397005081176758, "logits/rejected": -16.83826446533203, "logps/chosen": -2884.670654296875, "logps/rejected": -2947.565673828125, "loss": 3.9041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.98057556152344, "rewards/margins": -0.3184043765068054, "rewards/rejected": -168.66217041015625, "step": 49380 }, { "epoch": 2.86, "grad_norm": 41.52326965332031, "learning_rate": 4.7176748326173614e-05, "logits/chosen": -15.224159240722656, "logits/rejected": -15.804168701171875, "logps/chosen": -2830.239501953125, "logps/rejected": -3390.7109375, "loss": 0.3369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -154.0480499267578, "rewards/margins": 14.478775024414062, "rewards/rejected": -168.52682495117188, "step": 49390 }, { "epoch": 2.86, "grad_norm": 19.118465423583984, "learning_rate": 4.698324238554124e-05, "logits/chosen": -18.295867919921875, "logits/rejected": -21.152400970458984, "logps/chosen": -2512.493408203125, "logps/rejected": -2550.92431640625, "loss": 6.2729, "rewards/accuracies": 0.5, "rewards/chosen": -140.3931427001953, "rewards/margins": 8.081989288330078, "rewards/rejected": -148.47511291503906, "step": 49400 }, { "epoch": 2.86, "grad_norm": 235.0437469482422, "learning_rate": 4.678973644490886e-05, "logits/chosen": -17.274839401245117, "logits/rejected": -19.64931297302246, "logps/chosen": -2675.14990234375, "logps/rejected": -2890.59130859375, "loss": 3.3875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -119.02638244628906, "rewards/margins": 2.3748939037323, "rewards/rejected": -121.4012680053711, "step": 49410 }, { "epoch": 2.86, "grad_norm": 0.0007985576521605253, "learning_rate": 4.6596230504276486e-05, "logits/chosen": -18.836801528930664, "logits/rejected": -18.826480865478516, "logps/chosen": -2910.321044921875, "logps/rejected": -2988.450439453125, "loss": 0.0699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -147.59744262695312, "rewards/margins": 14.726486206054688, "rewards/rejected": -162.32394409179688, "step": 49420 }, { "epoch": 2.86, "grad_norm": 210.48016357421875, "learning_rate": 4.64027245636441e-05, "logits/chosen": -17.902740478515625, "logits/rejected": -18.041004180908203, "logps/chosen": -2846.10302734375, "logps/rejected": -2605.048828125, "loss": 2.4898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.79739379882812, "rewards/margins": 10.44869613647461, "rewards/rejected": -181.24607849121094, "step": 49430 }, { "epoch": 2.86, "grad_norm": 4.923210869875591e-15, "learning_rate": 4.620921862301173e-05, "logits/chosen": -16.525461196899414, "logits/rejected": -16.97140884399414, "logps/chosen": -3105.24560546875, "logps/rejected": -3039.72216796875, "loss": 0.756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -143.83888244628906, "rewards/margins": 18.304506301879883, "rewards/rejected": -162.14340209960938, "step": 49440 }, { "epoch": 2.86, "grad_norm": 119.52133178710938, "learning_rate": 4.601571268237935e-05, "logits/chosen": -18.695632934570312, "logits/rejected": -18.214332580566406, "logps/chosen": -2288.93798828125, "logps/rejected": -2435.343017578125, "loss": 1.6841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.69570922851562, "rewards/margins": 4.731044292449951, "rewards/rejected": -178.4267578125, "step": 49450 }, { "epoch": 2.86, "grad_norm": 0.049637142568826675, "learning_rate": 4.5822206741746975e-05, "logits/chosen": -17.288654327392578, "logits/rejected": -17.64431381225586, "logps/chosen": -3055.330078125, "logps/rejected": -2536.99267578125, "loss": 10.0268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -136.4681854248047, "rewards/margins": 11.293210983276367, "rewards/rejected": -147.76138305664062, "step": 49460 }, { "epoch": 2.86, "grad_norm": 2.3731008695904166e-06, "learning_rate": 4.562870080111459e-05, "logits/chosen": -17.890233993530273, "logits/rejected": -19.377771377563477, "logps/chosen": -3133.59228515625, "logps/rejected": -2902.328857421875, "loss": 3.8466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -240.08926391601562, "rewards/margins": 8.837450981140137, "rewards/rejected": -248.9266815185547, "step": 49470 }, { "epoch": 2.86, "grad_norm": 0.0, "learning_rate": 4.5435194860482216e-05, "logits/chosen": -17.509531021118164, "logits/rejected": -17.564178466796875, "logps/chosen": -3005.80859375, "logps/rejected": -3056.259765625, "loss": 0.134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -180.91207885742188, "rewards/margins": 13.758939743041992, "rewards/rejected": -194.6710205078125, "step": 49480 }, { "epoch": 2.86, "grad_norm": 0.9175106287002563, "learning_rate": 4.524168891984984e-05, "logits/chosen": -18.012998580932617, "logits/rejected": -18.934415817260742, "logps/chosen": -2821.063232421875, "logps/rejected": -2824.1171875, "loss": 4.2189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -181.00970458984375, "rewards/margins": 3.594268798828125, "rewards/rejected": -184.60397338867188, "step": 49490 }, { "epoch": 2.87, "grad_norm": 1.729736770483825e-10, "learning_rate": 4.5048182979217464e-05, "logits/chosen": -15.257360458374023, "logits/rejected": -15.534878730773926, "logps/chosen": -3310.80517578125, "logps/rejected": -3113.990234375, "loss": 2.2426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -217.8459930419922, "rewards/margins": 9.94921875, "rewards/rejected": -227.7952117919922, "step": 49500 }, { "epoch": 2.87, "grad_norm": 9.188025615003426e-06, "learning_rate": 4.485467703858508e-05, "logits/chosen": -16.55098533630371, "logits/rejected": -16.22709846496582, "logps/chosen": -3266.256591796875, "logps/rejected": -2635.219970703125, "loss": 2.5329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -174.95877075195312, "rewards/margins": 6.676027774810791, "rewards/rejected": -181.6348114013672, "step": 49510 }, { "epoch": 2.87, "grad_norm": 1.545030863780994e-05, "learning_rate": 4.466117109795271e-05, "logits/chosen": -17.131053924560547, "logits/rejected": -17.46257781982422, "logps/chosen": -2668.040771484375, "logps/rejected": -2659.17138671875, "loss": 2.3202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -174.3617401123047, "rewards/margins": 9.285870552062988, "rewards/rejected": -183.6476287841797, "step": 49520 }, { "epoch": 2.87, "grad_norm": 110.44959259033203, "learning_rate": 4.446766515732033e-05, "logits/chosen": -17.799983978271484, "logits/rejected": -19.20570945739746, "logps/chosen": -2726.24169921875, "logps/rejected": -2924.888916015625, "loss": 6.7828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -189.28665161132812, "rewards/margins": 1.8284553289413452, "rewards/rejected": -191.1151123046875, "step": 49530 }, { "epoch": 2.87, "grad_norm": 0.07203330844640732, "learning_rate": 4.427415921668795e-05, "logits/chosen": -16.57137107849121, "logits/rejected": -16.7788028717041, "logps/chosen": -2891.7001953125, "logps/rejected": -2497.038818359375, "loss": 0.7404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -142.5022735595703, "rewards/margins": 18.670101165771484, "rewards/rejected": -161.17234802246094, "step": 49540 }, { "epoch": 2.87, "grad_norm": 46.42525863647461, "learning_rate": 4.408065327605557e-05, "logits/chosen": -21.515697479248047, "logits/rejected": -21.805519104003906, "logps/chosen": -2519.50537109375, "logps/rejected": -2601.96728515625, "loss": 1.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -219.31661987304688, "rewards/margins": 14.387136459350586, "rewards/rejected": -233.7037811279297, "step": 49550 }, { "epoch": 2.87, "grad_norm": 62.624549865722656, "learning_rate": 4.38871473354232e-05, "logits/chosen": -18.140214920043945, "logits/rejected": -18.332073211669922, "logps/chosen": -2617.34326171875, "logps/rejected": -2678.90234375, "loss": 13.6392, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -130.3489532470703, "rewards/margins": -8.871709823608398, "rewards/rejected": -121.47724914550781, "step": 49560 }, { "epoch": 2.87, "grad_norm": 40.767520904541016, "learning_rate": 4.3693641394790825e-05, "logits/chosen": -15.83564281463623, "logits/rejected": -17.016817092895508, "logps/chosen": -3488.021484375, "logps/rejected": -3354.026611328125, "loss": 1.1413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -112.61087799072266, "rewards/margins": 10.737375259399414, "rewards/rejected": -123.3482437133789, "step": 49570 }, { "epoch": 2.87, "grad_norm": 1.8438355255057104e-05, "learning_rate": 4.350013545415844e-05, "logits/chosen": -17.0926513671875, "logits/rejected": -18.358177185058594, "logps/chosen": -3127.665771484375, "logps/rejected": -3097.38232421875, "loss": 0.1249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -163.44273376464844, "rewards/margins": 10.591862678527832, "rewards/rejected": -174.0346221923828, "step": 49580 }, { "epoch": 2.87, "grad_norm": 99.51551818847656, "learning_rate": 4.3306629513526066e-05, "logits/chosen": -17.912830352783203, "logits/rejected": -17.316219329833984, "logps/chosen": -2586.331787109375, "logps/rejected": -2599.18017578125, "loss": 1.682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -195.1835479736328, "rewards/margins": 0.8831678628921509, "rewards/rejected": -196.0666961669922, "step": 49590 }, { "epoch": 2.87, "grad_norm": 85.65770721435547, "learning_rate": 4.311312357289369e-05, "logits/chosen": -16.62883186340332, "logits/rejected": -18.07712173461914, "logps/chosen": -2938.030517578125, "logps/rejected": -2864.17529296875, "loss": 1.6815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -211.95785522460938, "rewards/margins": 9.095772743225098, "rewards/rejected": -221.0536346435547, "step": 49600 }, { "epoch": 2.87, "grad_norm": 38.00510025024414, "learning_rate": 4.2919617632261314e-05, "logits/chosen": -17.444416046142578, "logits/rejected": -18.180912017822266, "logps/chosen": -2718.982177734375, "logps/rejected": -2542.02978515625, "loss": 15.1309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -168.08053588867188, "rewards/margins": -11.778993606567383, "rewards/rejected": -156.30152893066406, "step": 49610 }, { "epoch": 2.87, "grad_norm": 1.2792810201644897, "learning_rate": 4.272611169162893e-05, "logits/chosen": -17.623165130615234, "logits/rejected": -17.242938995361328, "logps/chosen": -2819.547119140625, "logps/rejected": -3019.467529296875, "loss": 0.367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -166.26040649414062, "rewards/margins": 8.373884201049805, "rewards/rejected": -174.63430786132812, "step": 49620 }, { "epoch": 2.87, "grad_norm": 6.087341830607329e-08, "learning_rate": 4.2532605750996555e-05, "logits/chosen": -19.864559173583984, "logits/rejected": -21.453113555908203, "logps/chosen": -2977.509033203125, "logps/rejected": -3038.49072265625, "loss": 7.7644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -209.9661102294922, "rewards/margins": -0.3477729856967926, "rewards/rejected": -209.6183319091797, "step": 49630 }, { "epoch": 2.87, "grad_norm": 7.54765496822074e-05, "learning_rate": 4.233909981036418e-05, "logits/chosen": -17.137527465820312, "logits/rejected": -16.825231552124023, "logps/chosen": -3088.09814453125, "logps/rejected": -2886.37548828125, "loss": 3.3158, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -180.8249053955078, "rewards/margins": 3.2766571044921875, "rewards/rejected": -184.10159301757812, "step": 49640 }, { "epoch": 2.87, "grad_norm": 0.0004899102495983243, "learning_rate": 4.21455938697318e-05, "logits/chosen": -16.956771850585938, "logits/rejected": -17.45138168334961, "logps/chosen": -3147.154296875, "logps/rejected": -2700.529052734375, "loss": 2.3842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -98.78511810302734, "rewards/margins": 11.038762092590332, "rewards/rejected": -109.82386779785156, "step": 49650 }, { "epoch": 2.87, "grad_norm": 2.893116106861271e-05, "learning_rate": 4.195208792909942e-05, "logits/chosen": -17.221933364868164, "logits/rejected": -17.82190704345703, "logps/chosen": -2383.228271484375, "logps/rejected": -2256.409423828125, "loss": 0.3582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -169.416259765625, "rewards/margins": 8.164871215820312, "rewards/rejected": -177.58114624023438, "step": 49660 }, { "epoch": 2.88, "grad_norm": 72.13435363769531, "learning_rate": 4.175858198846705e-05, "logits/chosen": -20.5216064453125, "logits/rejected": -20.79718017578125, "logps/chosen": -2611.45556640625, "logps/rejected": -2617.934326171875, "loss": 5.5461, "rewards/accuracies": 0.5, "rewards/chosen": -230.80343627929688, "rewards/margins": -0.2823295593261719, "rewards/rejected": -230.5211181640625, "step": 49670 }, { "epoch": 2.88, "grad_norm": 0.5462756752967834, "learning_rate": 4.156507604783467e-05, "logits/chosen": -17.992902755737305, "logits/rejected": -20.774166107177734, "logps/chosen": -2926.32958984375, "logps/rejected": -2782.983642578125, "loss": 8.3804, "rewards/accuracies": 0.5, "rewards/chosen": -248.4083709716797, "rewards/margins": -3.974226474761963, "rewards/rejected": -244.4341583251953, "step": 49680 }, { "epoch": 2.88, "grad_norm": 0.00044769427040591836, "learning_rate": 4.137157010720229e-05, "logits/chosen": -15.769513130187988, "logits/rejected": -16.61087417602539, "logps/chosen": -2669.82080078125, "logps/rejected": -2990.763916015625, "loss": 2.1259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -151.30352783203125, "rewards/margins": 5.49456787109375, "rewards/rejected": -156.79811096191406, "step": 49690 }, { "epoch": 2.88, "grad_norm": 0.00022231681214179844, "learning_rate": 4.117806416656991e-05, "logits/chosen": -16.91998863220215, "logits/rejected": -18.04582977294922, "logps/chosen": -2855.425537109375, "logps/rejected": -2377.68603515625, "loss": 3.0988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -135.825927734375, "rewards/margins": 21.798622131347656, "rewards/rejected": -157.6245574951172, "step": 49700 }, { "epoch": 2.88, "grad_norm": 118.18955993652344, "learning_rate": 4.098455822593754e-05, "logits/chosen": -16.300100326538086, "logits/rejected": -16.591838836669922, "logps/chosen": -2965.06640625, "logps/rejected": -2574.9638671875, "loss": 9.9122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -147.21226501464844, "rewards/margins": 2.6008613109588623, "rewards/rejected": -149.81312561035156, "step": 49710 }, { "epoch": 2.88, "grad_norm": 0.056806035339832306, "learning_rate": 4.079105228530516e-05, "logits/chosen": -21.595760345458984, "logits/rejected": -22.67367172241211, "logps/chosen": -2621.41357421875, "logps/rejected": -2616.308349609375, "loss": 3.4994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -185.76773071289062, "rewards/margins": 5.489818096160889, "rewards/rejected": -191.25755310058594, "step": 49720 }, { "epoch": 2.88, "grad_norm": 0.007443931419402361, "learning_rate": 4.059754634467278e-05, "logits/chosen": -15.456975936889648, "logits/rejected": -16.13310432434082, "logps/chosen": -2642.135986328125, "logps/rejected": -2719.071044921875, "loss": 1.5221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -141.00692749023438, "rewards/margins": 10.638778686523438, "rewards/rejected": -151.6457061767578, "step": 49730 }, { "epoch": 2.88, "grad_norm": 1.5075747828605194e-16, "learning_rate": 4.0404040404040405e-05, "logits/chosen": -16.373075485229492, "logits/rejected": -17.766048431396484, "logps/chosen": -3042.71484375, "logps/rejected": -2929.01806640625, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -158.7015838623047, "rewards/margins": 12.548202514648438, "rewards/rejected": -171.24978637695312, "step": 49740 }, { "epoch": 2.88, "grad_norm": 4.182703495025635, "learning_rate": 4.021053446340803e-05, "logits/chosen": -17.046640396118164, "logits/rejected": -17.78217124938965, "logps/chosen": -2759.844970703125, "logps/rejected": -2733.55322265625, "loss": 1.1859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -170.98263549804688, "rewards/margins": 9.640628814697266, "rewards/rejected": -180.62326049804688, "step": 49750 }, { "epoch": 2.88, "grad_norm": 7.810753595549613e-05, "learning_rate": 4.001702852277565e-05, "logits/chosen": -19.654254913330078, "logits/rejected": -19.885334014892578, "logps/chosen": -2662.8291015625, "logps/rejected": -2676.372314453125, "loss": 3.329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -170.71298217773438, "rewards/margins": 1.240873098373413, "rewards/rejected": -171.95384216308594, "step": 49760 }, { "epoch": 2.88, "grad_norm": 46.14556884765625, "learning_rate": 3.982352258214327e-05, "logits/chosen": -23.17543601989746, "logits/rejected": -21.793018341064453, "logps/chosen": -2660.594482421875, "logps/rejected": -2725.232177734375, "loss": 2.324, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -240.77590942382812, "rewards/margins": 2.8667938709259033, "rewards/rejected": -243.64266967773438, "step": 49770 }, { "epoch": 2.88, "grad_norm": 40.40470504760742, "learning_rate": 3.96300166415109e-05, "logits/chosen": -18.05821990966797, "logits/rejected": -18.385143280029297, "logps/chosen": -2807.9755859375, "logps/rejected": -2853.585693359375, "loss": 3.1528, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -155.43374633789062, "rewards/margins": 6.327458381652832, "rewards/rejected": -161.76119995117188, "step": 49780 }, { "epoch": 2.88, "grad_norm": 0.0, "learning_rate": 3.943651070087852e-05, "logits/chosen": -18.940704345703125, "logits/rejected": -18.923362731933594, "logps/chosen": -2678.23193359375, "logps/rejected": -2668.7548828125, "loss": 1.0642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -175.04644775390625, "rewards/margins": 18.47199058532715, "rewards/rejected": -193.51846313476562, "step": 49790 }, { "epoch": 2.88, "grad_norm": 77.60845184326172, "learning_rate": 3.924300476024614e-05, "logits/chosen": -19.078731536865234, "logits/rejected": -20.29201889038086, "logps/chosen": -3024.086669921875, "logps/rejected": -2795.75146484375, "loss": 12.8264, "rewards/accuracies": 0.5, "rewards/chosen": -220.36087036132812, "rewards/margins": -5.02182149887085, "rewards/rejected": -215.3390350341797, "step": 49800 }, { "epoch": 2.88, "grad_norm": 5.816525459289551, "learning_rate": 3.904949881961376e-05, "logits/chosen": -16.585569381713867, "logits/rejected": -18.367143630981445, "logps/chosen": -3148.677734375, "logps/rejected": -3040.263671875, "loss": 5.1052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -205.4528350830078, "rewards/margins": -0.016654204577207565, "rewards/rejected": -205.43618774414062, "step": 49810 }, { "epoch": 2.88, "grad_norm": 193.37045288085938, "learning_rate": 3.885599287898139e-05, "logits/chosen": -18.294490814208984, "logits/rejected": -19.586841583251953, "logps/chosen": -2785.72265625, "logps/rejected": -2620.1640625, "loss": 6.533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -173.9287109375, "rewards/margins": 3.2348124980926514, "rewards/rejected": -177.16354370117188, "step": 49820 }, { "epoch": 2.88, "grad_norm": 193.7652587890625, "learning_rate": 3.866248693834901e-05, "logits/chosen": -17.415607452392578, "logits/rejected": -17.703495025634766, "logps/chosen": -2922.53125, "logps/rejected": -2901.82763671875, "loss": 8.897, "rewards/accuracies": 0.5, "rewards/chosen": -152.20260620117188, "rewards/margins": -4.432930946350098, "rewards/rejected": -147.76966857910156, "step": 49830 }, { "epoch": 2.88, "grad_norm": 0.0003201300569344312, "learning_rate": 3.846898099771663e-05, "logits/chosen": -18.371768951416016, "logits/rejected": -19.465347290039062, "logps/chosen": -2926.62158203125, "logps/rejected": -2912.901611328125, "loss": 1.012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -152.1573944091797, "rewards/margins": 5.288963317871094, "rewards/rejected": -157.4463653564453, "step": 49840 }, { "epoch": 2.89, "grad_norm": 2.3391226733338044e-09, "learning_rate": 3.827547505708425e-05, "logits/chosen": -17.569198608398438, "logits/rejected": -18.020587921142578, "logps/chosen": -2611.183349609375, "logps/rejected": -2234.09423828125, "loss": 0.7874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -149.51065063476562, "rewards/margins": 12.189024925231934, "rewards/rejected": -161.69967651367188, "step": 49850 }, { "epoch": 2.89, "grad_norm": 1.3844905040125144e-12, "learning_rate": 3.808196911645188e-05, "logits/chosen": -17.213491439819336, "logits/rejected": -17.270553588867188, "logps/chosen": -2871.735107421875, "logps/rejected": -2565.78173828125, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": -143.8280487060547, "rewards/margins": 8.46357536315918, "rewards/rejected": -152.29165649414062, "step": 49860 }, { "epoch": 2.89, "grad_norm": 0.0002652259427122772, "learning_rate": 3.7888463175819497e-05, "logits/chosen": -22.176733016967773, "logits/rejected": -22.229955673217773, "logps/chosen": -3002.431640625, "logps/rejected": -2924.660400390625, "loss": 3.6812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -235.3111114501953, "rewards/margins": 2.670342206954956, "rewards/rejected": -237.98147583007812, "step": 49870 }, { "epoch": 2.89, "grad_norm": 0.18137353658676147, "learning_rate": 3.769495723518712e-05, "logits/chosen": -14.700653076171875, "logits/rejected": -14.65784740447998, "logps/chosen": -3309.69921875, "logps/rejected": -2730.20654296875, "loss": 0.8536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -103.89373779296875, "rewards/margins": 9.612940788269043, "rewards/rejected": -113.50667572021484, "step": 49880 }, { "epoch": 2.89, "grad_norm": 3.0976716516306624e-06, "learning_rate": 3.750145129455474e-05, "logits/chosen": -17.31360626220703, "logits/rejected": -17.152692794799805, "logps/chosen": -2510.833251953125, "logps/rejected": -2910.00048828125, "loss": 4.8858, "rewards/accuracies": 0.5, "rewards/chosen": -180.64056396484375, "rewards/margins": 0.837948203086853, "rewards/rejected": -181.478515625, "step": 49890 }, { "epoch": 2.89, "grad_norm": 43.8767204284668, "learning_rate": 3.730794535392237e-05, "logits/chosen": -23.292591094970703, "logits/rejected": -23.821197509765625, "logps/chosen": -2572.2666015625, "logps/rejected": -2611.030029296875, "loss": 0.5694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -227.8941650390625, "rewards/margins": 7.671347141265869, "rewards/rejected": -235.56552124023438, "step": 49900 }, { "epoch": 2.89, "grad_norm": 7.08571720123291, "learning_rate": 3.7114439413289986e-05, "logits/chosen": -16.949499130249023, "logits/rejected": -17.448139190673828, "logps/chosen": -2993.54931640625, "logps/rejected": -3063.770751953125, "loss": 6.004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -192.498779296875, "rewards/margins": 2.201453685760498, "rewards/rejected": -194.70022583007812, "step": 49910 }, { "epoch": 2.89, "grad_norm": 1.3088876008987427, "learning_rate": 3.692093347265761e-05, "logits/chosen": -18.27474021911621, "logits/rejected": -19.320049285888672, "logps/chosen": -2737.894287109375, "logps/rejected": -2385.565673828125, "loss": 1.5405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -201.76004028320312, "rewards/margins": 8.822880744934082, "rewards/rejected": -210.58291625976562, "step": 49920 }, { "epoch": 2.89, "grad_norm": 4.0869521308883883e-13, "learning_rate": 3.672742753202524e-05, "logits/chosen": -16.352628707885742, "logits/rejected": -17.216367721557617, "logps/chosen": -3107.4248046875, "logps/rejected": -2887.427978515625, "loss": 10.7473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -128.12539672851562, "rewards/margins": 3.031339168548584, "rewards/rejected": -131.15673828125, "step": 49930 }, { "epoch": 2.89, "grad_norm": 0.0025599924847483635, "learning_rate": 3.653392159139286e-05, "logits/chosen": -19.970354080200195, "logits/rejected": -20.403549194335938, "logps/chosen": -2773.53369140625, "logps/rejected": -2322.525146484375, "loss": 4.8518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -141.2147216796875, "rewards/margins": 5.890833854675293, "rewards/rejected": -147.10556030273438, "step": 49940 }, { "epoch": 2.89, "grad_norm": 0.025928737595677376, "learning_rate": 3.634041565076048e-05, "logits/chosen": -15.922749519348145, "logits/rejected": -17.25811195373535, "logps/chosen": -3080.497802734375, "logps/rejected": -3177.123046875, "loss": 2.4704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -171.20164489746094, "rewards/margins": 10.209287643432617, "rewards/rejected": -181.41094970703125, "step": 49950 }, { "epoch": 2.89, "grad_norm": 8.685820812592837e-11, "learning_rate": 3.61469097101281e-05, "logits/chosen": -18.375715255737305, "logits/rejected": -20.534292221069336, "logps/chosen": -2642.234375, "logps/rejected": -2574.5048828125, "loss": 0.8539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -196.48178100585938, "rewards/margins": 16.522769927978516, "rewards/rejected": -213.00454711914062, "step": 49960 }, { "epoch": 2.89, "grad_norm": 129.13978576660156, "learning_rate": 3.595340376949573e-05, "logits/chosen": -18.909652709960938, "logits/rejected": -21.70547103881836, "logps/chosen": -3234.28125, "logps/rejected": -3133.64599609375, "loss": 2.8246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -286.3567810058594, "rewards/margins": 7.557762145996094, "rewards/rejected": -293.91455078125, "step": 49970 }, { "epoch": 2.89, "grad_norm": 1.5354285665125644e-07, "learning_rate": 3.5759897828863347e-05, "logits/chosen": -15.834104537963867, "logits/rejected": -16.304393768310547, "logps/chosen": -2730.161865234375, "logps/rejected": -2792.966552734375, "loss": 5.2305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -177.42298889160156, "rewards/margins": 3.9485855102539062, "rewards/rejected": -181.37158203125, "step": 49980 }, { "epoch": 2.89, "grad_norm": 0.25955814123153687, "learning_rate": 3.556639188823097e-05, "logits/chosen": -20.60123634338379, "logits/rejected": -21.26789665222168, "logps/chosen": -2825.6650390625, "logps/rejected": -2843.031494140625, "loss": 2.4094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -213.26541137695312, "rewards/margins": 10.57829475402832, "rewards/rejected": -223.8437042236328, "step": 49990 }, { "epoch": 2.89, "grad_norm": 5.159951602314603e-16, "learning_rate": 3.537288594759859e-05, "logits/chosen": -18.926538467407227, "logits/rejected": -21.202037811279297, "logps/chosen": -2804.2060546875, "logps/rejected": -2295.44482421875, "loss": 9.608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -184.22756958007812, "rewards/margins": 5.823060035705566, "rewards/rejected": -190.0506134033203, "step": 50000 } ], "logging_steps": 10, "max_steps": 51828, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }