{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012025012025012025, "grad_norm": 801.1768400159908, "learning_rate": 6.009615384615385e-10, "logits/generated": -2.701044797897339, "logits/real": -2.630336046218872, "logps/generated": -268.9845275878906, "logps/real": -454.865478515625, "loss": 1.0444, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.0012025012025012026, "grad_norm": 779.6618128301591, "learning_rate": 6.009615384615385e-09, "logits/generated": -2.5943427085876465, "logits/real": -2.5661115646362305, "logps/generated": -653.7264404296875, "logps/real": -644.458740234375, "loss": 1.1575, "rewards/accuracies": 0.2777777910232544, "rewards/generated": 0.0031201376114040613, "rewards/margins": -0.036962077021598816, "rewards/real": -0.03384193778038025, "step": 10 }, { "epoch": 0.002405002405002405, "grad_norm": 801.905899552404, "learning_rate": 1.201923076923077e-08, "logits/generated": -2.601459503173828, "logits/real": -2.5791308879852295, "logps/generated": -425.9059143066406, "logps/real": -415.3407287597656, "loss": 1.1469, "rewards/accuracies": 0.574999988079071, "rewards/generated": -0.014987630769610405, "rewards/margins": 0.01562613621354103, "rewards/real": 0.0006385032320395112, "step": 20 }, { "epoch": 0.0036075036075036075, "grad_norm": 915.0921462682539, "learning_rate": 1.802884615384615e-08, "logits/generated": -2.614966869354248, "logits/real": -2.579796552658081, "logps/generated": -541.577392578125, "logps/real": -548.8224487304688, "loss": 1.1734, "rewards/accuracies": 0.42500001192092896, "rewards/generated": 0.016047468408942223, "rewards/margins": -0.0203808955848217, "rewards/real": -0.004333429038524628, "step": 30 }, { "epoch": 0.00481000481000481, "grad_norm": 730.4416358593099, "learning_rate": 2.403846153846154e-08, "logits/generated": -2.593712091445923, "logits/real": -2.599540948867798, "logps/generated": -575.514404296875, "logps/real": -462.55242919921875, "loss": 1.1219, "rewards/accuracies": 0.5249999761581421, "rewards/generated": 0.0017959155375137925, "rewards/margins": 0.014108413830399513, "rewards/real": 0.015904325991868973, "step": 40 }, { "epoch": 0.006012506012506013, "grad_norm": 764.8278246157919, "learning_rate": 3.004807692307692e-08, "logits/generated": -2.609126329421997, "logits/real": -2.536421537399292, "logps/generated": -554.6502075195312, "logps/real": -468.76885986328125, "loss": 1.1639, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.04441952705383301, "rewards/margins": 0.1175607442855835, "rewards/real": 0.07314121723175049, "step": 50 }, { "epoch": 0.007215007215007215, "grad_norm": 723.6041258034065, "learning_rate": 3.60576923076923e-08, "logits/generated": -2.613905668258667, "logits/real": -2.5859856605529785, "logps/generated": -523.7518310546875, "logps/real": -452.20654296875, "loss": 1.1171, "rewards/accuracies": 0.800000011920929, "rewards/generated": 0.0223550945520401, "rewards/margins": 0.091279536485672, "rewards/real": 0.11363464593887329, "step": 60 }, { "epoch": 0.008417508417508417, "grad_norm": 807.1990949198348, "learning_rate": 4.206730769230769e-08, "logits/generated": -2.6600990295410156, "logits/real": -2.5796685218811035, "logps/generated": -488.0248107910156, "logps/real": -502.6383361816406, "loss": 1.0794, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.03872176259756088, "rewards/margins": 0.2055022418498993, "rewards/real": 0.1667805016040802, "step": 70 }, { "epoch": 0.00962000962000962, "grad_norm": 557.9206390670668, "learning_rate": 4.807692307692308e-08, "logits/generated": -2.5978739261627197, "logits/real": -2.5913002490997314, "logps/generated": -405.31683349609375, "logps/real": -380.29229736328125, "loss": 1.0792, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.05246102809906006, "rewards/margins": 0.22621431946754456, "rewards/real": 0.1737532913684845, "step": 80 }, { "epoch": 0.010822510822510822, "grad_norm": 542.1020748733775, "learning_rate": 5.4086538461538464e-08, "logits/generated": -2.5775256156921387, "logits/real": -2.583211898803711, "logps/generated": -609.107421875, "logps/real": -473.18231201171875, "loss": 0.9755, "rewards/accuracies": 0.7749999761581421, "rewards/generated": -0.1160820946097374, "rewards/margins": 0.31075945496559143, "rewards/real": 0.19467735290527344, "step": 90 }, { "epoch": 0.012025012025012025, "grad_norm": 640.8082792099547, "learning_rate": 6.009615384615384e-08, "logits/generated": -2.6364057064056396, "logits/real": -2.643916606903076, "logps/generated": -570.5785522460938, "logps/real": -506.00396728515625, "loss": 0.9193, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.19135859608650208, "rewards/margins": 0.3666282296180725, "rewards/real": 0.17526963353157043, "step": 100 }, { "epoch": 0.013227513227513227, "grad_norm": 1166.4897957447906, "learning_rate": 6.610576923076924e-08, "logits/generated": -2.634885311126709, "logits/real": -2.648221492767334, "logps/generated": -475.5386657714844, "logps/real": -539.703369140625, "loss": 1.0419, "rewards/accuracies": 0.75, "rewards/generated": 0.035467393696308136, "rewards/margins": 0.3378516435623169, "rewards/real": 0.37331902980804443, "step": 110 }, { "epoch": 0.01443001443001443, "grad_norm": 812.577454904602, "learning_rate": 7.21153846153846e-08, "logits/generated": -2.6088433265686035, "logits/real": -2.587759494781494, "logps/generated": -385.3928527832031, "logps/real": -301.99420166015625, "loss": 0.9061, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.4538938105106354, "rewards/margins": 0.6048692464828491, "rewards/real": 0.15097543597221375, "step": 120 }, { "epoch": 0.015632515632515633, "grad_norm": 820.3630614817264, "learning_rate": 7.812499999999999e-08, "logits/generated": -2.6701788902282715, "logits/real": -2.5965487957000732, "logps/generated": -540.5206909179688, "logps/real": -473.69091796875, "loss": 0.9399, "rewards/accuracies": 0.800000011920929, "rewards/generated": -0.04602137207984924, "rewards/margins": 0.4468808174133301, "rewards/real": 0.40085944533348083, "step": 130 }, { "epoch": 0.016835016835016835, "grad_norm": 672.691049772504, "learning_rate": 8.413461538461539e-08, "logits/generated": -2.666149139404297, "logits/real": -2.6245312690734863, "logps/generated": -540.637451171875, "logps/real": -487.11004638671875, "loss": 0.9741, "rewards/accuracies": 0.824999988079071, "rewards/generated": -0.5353710055351257, "rewards/margins": 0.8442476987838745, "rewards/real": 0.30887678265571594, "step": 140 }, { "epoch": 0.018037518037518036, "grad_norm": 382.5992415420118, "learning_rate": 9.014423076923076e-08, "logits/generated": -2.625582218170166, "logits/real": -2.588461399078369, "logps/generated": -458.9794006347656, "logps/real": -340.5458679199219, "loss": 0.7644, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.8172666430473328, "rewards/margins": 1.0257928371429443, "rewards/real": 0.2085261046886444, "step": 150 }, { "epoch": 0.01924001924001924, "grad_norm": 514.8693438152337, "learning_rate": 9.615384615384616e-08, "logits/generated": -2.6541621685028076, "logits/real": -2.6064515113830566, "logps/generated": -494.205322265625, "logps/real": -537.3380126953125, "loss": 0.8356, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.7966269254684448, "rewards/margins": 1.1517274379730225, "rewards/real": 0.3551006019115448, "step": 160 }, { "epoch": 0.020442520442520443, "grad_norm": 290.27033949114116, "learning_rate": 1.0216346153846154e-07, "logits/generated": -2.7106773853302, "logits/real": -2.659435272216797, "logps/generated": -480.480224609375, "logps/real": -387.47772216796875, "loss": 0.822, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.075594186782837, "rewards/margins": 1.4880707263946533, "rewards/real": 0.4124765992164612, "step": 170 }, { "epoch": 0.021645021645021644, "grad_norm": 420.11050173483665, "learning_rate": 1.0817307692307693e-07, "logits/generated": -2.6927719116210938, "logits/real": -2.6310219764709473, "logps/generated": -454.25994873046875, "logps/real": -530.0487060546875, "loss": 0.8513, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.28558117151260376, "rewards/margins": 0.7983309626579285, "rewards/real": 0.5127498507499695, "step": 180 }, { "epoch": 0.02284752284752285, "grad_norm": 568.4859686887961, "learning_rate": 1.141826923076923e-07, "logits/generated": -2.6487412452697754, "logits/real": -2.6339938640594482, "logps/generated": -395.45294189453125, "logps/real": -395.30145263671875, "loss": 0.7378, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.9616363644599915, "rewards/margins": 1.4832828044891357, "rewards/real": 0.5216464996337891, "step": 190 }, { "epoch": 0.02405002405002405, "grad_norm": 770.9599691138654, "learning_rate": 1.2019230769230769e-07, "logits/generated": -2.6841750144958496, "logits/real": -2.6595330238342285, "logps/generated": -557.3283081054688, "logps/real": -498.83392333984375, "loss": 0.8671, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.3152936697006226, "rewards/margins": 1.9122545719146729, "rewards/real": 0.5969609022140503, "step": 200 }, { "epoch": 0.025252525252525252, "grad_norm": 633.8366426708363, "learning_rate": 1.2620192307692308e-07, "logits/generated": -2.6651999950408936, "logits/real": -2.6084225177764893, "logps/generated": -550.3500366210938, "logps/real": -575.7730712890625, "loss": 0.9144, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.824698269367218, "rewards/margins": 1.798667311668396, "rewards/real": 0.9739691019058228, "step": 210 }, { "epoch": 0.026455026455026454, "grad_norm": 301.80349725323805, "learning_rate": 1.3221153846153847e-07, "logits/generated": -2.6506190299987793, "logits/real": -2.632826805114746, "logps/generated": -338.2746276855469, "logps/real": -296.9292297363281, "loss": 0.6624, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.1101773977279663, "rewards/margins": 1.6372003555297852, "rewards/real": 0.5270229578018188, "step": 220 }, { "epoch": 0.02765752765752766, "grad_norm": 345.67994999617287, "learning_rate": 1.3822115384615384e-07, "logits/generated": -2.6194703578948975, "logits/real": -2.614107131958008, "logps/generated": -432.5381774902344, "logps/real": -374.21966552734375, "loss": 0.7959, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.127386450767517, "rewards/margins": 1.9437313079833984, "rewards/real": 0.8163448572158813, "step": 230 }, { "epoch": 0.02886002886002886, "grad_norm": 602.2763147294542, "learning_rate": 1.442307692307692e-07, "logits/generated": -2.604137897491455, "logits/real": -2.5926718711853027, "logps/generated": -501.0613708496094, "logps/real": -463.6505432128906, "loss": 0.7729, "rewards/accuracies": 0.824999988079071, "rewards/generated": -0.8467072248458862, "rewards/margins": 1.6996408700942993, "rewards/real": 0.8529335260391235, "step": 240 }, { "epoch": 0.03006253006253006, "grad_norm": 141.56496682275844, "learning_rate": 1.502403846153846e-07, "logits/generated": -2.6360106468200684, "logits/real": -2.5817055702209473, "logps/generated": -529.6976318359375, "logps/real": -435.2784729003906, "loss": 0.7024, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.684234380722046, "rewards/margins": 2.7245383262634277, "rewards/real": 1.0403037071228027, "step": 250 }, { "epoch": 0.031265031265031266, "grad_norm": 564.9266573022827, "learning_rate": 1.5624999999999999e-07, "logits/generated": -2.692927598953247, "logits/real": -2.611467123031616, "logps/generated": -524.0853271484375, "logps/real": -434.35821533203125, "loss": 0.7334, "rewards/accuracies": 0.800000011920929, "rewards/generated": -1.3338186740875244, "rewards/margins": 2.573237419128418, "rewards/real": 1.239418625831604, "step": 260 }, { "epoch": 0.032467532467532464, "grad_norm": 524.1826937063853, "learning_rate": 1.6225961538461538e-07, "logits/generated": -2.6772425174713135, "logits/real": -2.6025278568267822, "logps/generated": -580.0106201171875, "logps/real": -491.933837890625, "loss": 0.7656, "rewards/accuracies": 0.875, "rewards/generated": -1.5975688695907593, "rewards/margins": 2.1904983520507812, "rewards/real": 0.5929292440414429, "step": 270 }, { "epoch": 0.03367003367003367, "grad_norm": 428.7369151880417, "learning_rate": 1.6826923076923077e-07, "logits/generated": -2.6498706340789795, "logits/real": -2.650085687637329, "logps/generated": -392.8506774902344, "logps/real": -369.8538513183594, "loss": 0.6524, "rewards/accuracies": 0.875, "rewards/generated": -1.6748186349868774, "rewards/margins": 2.5976762771606445, "rewards/real": 0.9228577613830566, "step": 280 }, { "epoch": 0.034872534872534874, "grad_norm": 243.60844008737482, "learning_rate": 1.7427884615384614e-07, "logits/generated": -2.666207790374756, "logits/real": -2.6470437049865723, "logps/generated": -502.02264404296875, "logps/real": -403.56585693359375, "loss": 0.7117, "rewards/accuracies": 0.824999988079071, "rewards/generated": -1.3320616483688354, "rewards/margins": 2.2303693294525146, "rewards/real": 0.8983078002929688, "step": 290 }, { "epoch": 0.03607503607503607, "grad_norm": 593.7863933773639, "learning_rate": 1.8028846153846153e-07, "logits/generated": -2.6411917209625244, "logits/real": -2.5625710487365723, "logps/generated": -457.24298095703125, "logps/real": -396.57537841796875, "loss": 0.6727, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -1.5696680545806885, "rewards/margins": 1.9555574655532837, "rewards/real": 0.3858892619609833, "step": 300 }, { "epoch": 0.03727753727753728, "grad_norm": 564.2932253829903, "learning_rate": 1.8629807692307692e-07, "logits/generated": -2.5881824493408203, "logits/real": -2.561656951904297, "logps/generated": -487.9922790527344, "logps/real": -400.12200927734375, "loss": 0.6698, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.118783712387085, "rewards/margins": 2.787592887878418, "rewards/real": 0.6688090562820435, "step": 310 }, { "epoch": 0.03848003848003848, "grad_norm": 472.1429017589802, "learning_rate": 1.9230769230769231e-07, "logits/generated": -2.5694048404693604, "logits/real": -2.504631519317627, "logps/generated": -482.6383361816406, "logps/real": -373.3436279296875, "loss": 0.7241, "rewards/accuracies": 0.824999988079071, "rewards/generated": -1.9490076303482056, "rewards/margins": 2.4001662731170654, "rewards/real": 0.45115867257118225, "step": 320 }, { "epoch": 0.03968253968253968, "grad_norm": 351.72945578036473, "learning_rate": 1.9831730769230768e-07, "logits/generated": -2.5863828659057617, "logits/real": -2.556687831878662, "logps/generated": -566.9625244140625, "logps/real": -511.40673828125, "loss": 0.6826, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.6171916723251343, "rewards/margins": 2.7914156913757324, "rewards/real": 1.1742244958877563, "step": 330 }, { "epoch": 0.040885040885040885, "grad_norm": 230.944199336625, "learning_rate": 2.0432692307692307e-07, "logits/generated": -2.579220771789551, "logits/real": -2.5368800163269043, "logps/generated": -587.6741333007812, "logps/real": -509.1455993652344, "loss": 0.61, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.6644325256347656, "rewards/margins": 4.057629585266113, "rewards/real": 1.3931968212127686, "step": 340 }, { "epoch": 0.04208754208754209, "grad_norm": 610.2796148482761, "learning_rate": 2.1033653846153846e-07, "logits/generated": -2.6246964931488037, "logits/real": -2.607069492340088, "logps/generated": -405.486083984375, "logps/real": -408.4022521972656, "loss": 0.6567, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -1.6070611476898193, "rewards/margins": 3.0959818363189697, "rewards/real": 1.4889209270477295, "step": 350 }, { "epoch": 0.04329004329004329, "grad_norm": 311.0025856326943, "learning_rate": 2.1634615384615386e-07, "logits/generated": -2.555190324783325, "logits/real": -2.564065456390381, "logps/generated": -445.73095703125, "logps/real": -367.8589172363281, "loss": 0.5773, "rewards/accuracies": 0.875, "rewards/generated": -1.8315942287445068, "rewards/margins": 3.250680923461914, "rewards/real": 1.4190864562988281, "step": 360 }, { "epoch": 0.04449254449254449, "grad_norm": 355.8050012317517, "learning_rate": 2.223557692307692e-07, "logits/generated": -2.5479896068573, "logits/real": -2.5375823974609375, "logps/generated": -451.8771057128906, "logps/real": -421.00421142578125, "loss": 0.6029, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.6865472793579102, "rewards/margins": 3.6383838653564453, "rewards/real": 1.9518362283706665, "step": 370 }, { "epoch": 0.0456950456950457, "grad_norm": 213.75664258958034, "learning_rate": 2.283653846153846e-07, "logits/generated": -2.6107444763183594, "logits/real": -2.5683038234710693, "logps/generated": -472.42938232421875, "logps/real": -404.27276611328125, "loss": 0.5558, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.811893105506897, "rewards/margins": 4.338118076324463, "rewards/real": 2.5262250900268555, "step": 380 }, { "epoch": 0.046897546897546896, "grad_norm": 803.8075458632958, "learning_rate": 2.3437499999999998e-07, "logits/generated": -2.624004364013672, "logits/real": -2.607717514038086, "logps/generated": -532.3065795898438, "logps/real": -464.7298889160156, "loss": 0.7197, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.13997170329093933, "rewards/margins": 3.161569356918335, "rewards/real": 3.0215978622436523, "step": 390 }, { "epoch": 0.0481000481000481, "grad_norm": 77.72914826224212, "learning_rate": 2.4038461538461537e-07, "logits/generated": -2.6229655742645264, "logits/real": -2.5912537574768066, "logps/generated": -599.0198364257812, "logps/real": -509.234619140625, "loss": 0.6238, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.3059558868408203, "rewards/margins": 5.463759899139404, "rewards/real": 3.157804012298584, "step": 400 }, { "epoch": 0.0493025493025493, "grad_norm": 215.20858836955986, "learning_rate": 2.4639423076923076e-07, "logits/generated": -2.5568790435791016, "logits/real": -2.519082546234131, "logps/generated": -547.3959350585938, "logps/real": -472.25244140625, "loss": 0.5749, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.3383524417877197, "rewards/margins": 4.01686954498291, "rewards/real": 2.678516387939453, "step": 410 }, { "epoch": 0.050505050505050504, "grad_norm": 270.6802091677648, "learning_rate": 2.5240384615384616e-07, "logits/generated": -2.525097131729126, "logits/real": -2.5024070739746094, "logps/generated": -409.02276611328125, "logps/real": -322.4518127441406, "loss": 0.5, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.2078709602355957, "rewards/margins": 3.8254752159118652, "rewards/real": 1.6176042556762695, "step": 420 }, { "epoch": 0.05170755170755171, "grad_norm": 438.88307962028466, "learning_rate": 2.5841346153846155e-07, "logits/generated": -2.5554592609405518, "logits/real": -2.5345654487609863, "logps/generated": -374.7137145996094, "logps/real": -272.47564697265625, "loss": 0.6292, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -2.2819714546203613, "rewards/margins": 3.79643177986145, "rewards/real": 1.5144603252410889, "step": 430 }, { "epoch": 0.05291005291005291, "grad_norm": 159.03520082916455, "learning_rate": 2.6442307692307694e-07, "logits/generated": -2.523759365081787, "logits/real": -2.5251123905181885, "logps/generated": -440.36126708984375, "logps/real": -395.1308288574219, "loss": 0.6537, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.2008309364318848, "rewards/margins": 4.090251922607422, "rewards/real": 1.8894212245941162, "step": 440 }, { "epoch": 0.05411255411255411, "grad_norm": 106.50435638461428, "learning_rate": 2.704326923076923e-07, "logits/generated": -2.577564239501953, "logits/real": -2.5691657066345215, "logps/generated": -575.0303344726562, "logps/real": -491.9471130371094, "loss": 0.6023, "rewards/accuracies": 1.0, "rewards/generated": -1.6865742206573486, "rewards/margins": 4.896966457366943, "rewards/real": 3.2103919982910156, "step": 450 }, { "epoch": 0.05531505531505532, "grad_norm": 149.6158479648117, "learning_rate": 2.7644230769230767e-07, "logits/generated": -2.583193302154541, "logits/real": -2.527129650115967, "logps/generated": -511.2799377441406, "logps/real": -359.5958251953125, "loss": 0.7013, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.581364154815674, "rewards/margins": 5.107150554656982, "rewards/real": 2.5257863998413086, "step": 460 }, { "epoch": 0.056517556517556515, "grad_norm": 286.1520809667947, "learning_rate": 2.8245192307692306e-07, "logits/generated": -2.5159034729003906, "logits/real": -2.4691407680511475, "logps/generated": -538.7835693359375, "logps/real": -378.9813537597656, "loss": 0.5576, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.5919742584228516, "rewards/margins": 5.763204097747803, "rewards/real": 2.171229600906372, "step": 470 }, { "epoch": 0.05772005772005772, "grad_norm": 266.57122924881753, "learning_rate": 2.884615384615384e-07, "logits/generated": -2.57939076423645, "logits/real": -2.518014430999756, "logps/generated": -464.8843688964844, "logps/real": -393.81854248046875, "loss": 0.5013, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.7847225666046143, "rewards/margins": 6.235506534576416, "rewards/real": 2.4507832527160645, "step": 480 }, { "epoch": 0.058922558922558925, "grad_norm": 55.87920931382874, "learning_rate": 2.9447115384615385e-07, "logits/generated": -2.5865557193756104, "logits/real": -2.562652349472046, "logps/generated": -597.1098022460938, "logps/real": -537.5181884765625, "loss": 0.5396, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.842224359512329, "rewards/margins": 5.604229927062988, "rewards/real": 3.76200532913208, "step": 490 }, { "epoch": 0.06012506012506012, "grad_norm": 131.18473169676926, "learning_rate": 3.004807692307692e-07, "logits/generated": -2.506545066833496, "logits/real": -2.4941606521606445, "logps/generated": -447.23748779296875, "logps/real": -305.6527404785156, "loss": 0.506, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.6240792274475098, "rewards/margins": 4.890207290649414, "rewards/real": 2.2661280632019043, "step": 500 }, { "epoch": 0.06132756132756133, "grad_norm": 62.90722345988625, "learning_rate": 3.0649038461538463e-07, "logits/generated": -2.5521976947784424, "logits/real": -2.5559165477752686, "logps/generated": -520.6541748046875, "logps/real": -467.57781982421875, "loss": 0.5945, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -1.9740931987762451, "rewards/margins": 5.077263355255127, "rewards/real": 3.1031699180603027, "step": 510 }, { "epoch": 0.06253006253006253, "grad_norm": 545.3694751870157, "learning_rate": 3.1249999999999997e-07, "logits/generated": -2.5236270427703857, "logits/real": -2.4969699382781982, "logps/generated": -386.9814453125, "logps/real": -265.28399658203125, "loss": 0.6543, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.31213641166687, "rewards/margins": 4.220711708068848, "rewards/real": 1.9085750579833984, "step": 520 }, { "epoch": 0.06373256373256374, "grad_norm": 160.94599584449614, "learning_rate": 3.1850961538461536e-07, "logits/generated": -2.492840051651001, "logits/real": -2.504638910293579, "logps/generated": -438.1183166503906, "logps/real": -422.6055603027344, "loss": 0.4684, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.6569240093231201, "rewards/margins": 5.817118167877197, "rewards/real": 4.160194396972656, "step": 530 }, { "epoch": 0.06493506493506493, "grad_norm": 219.24291818102114, "learning_rate": 3.2451923076923076e-07, "logits/generated": -2.5277576446533203, "logits/real": -2.534254550933838, "logps/generated": -567.7418212890625, "logps/real": -410.13189697265625, "loss": 0.5648, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.521907091140747, "rewards/margins": 5.512710094451904, "rewards/real": 3.9908034801483154, "step": 540 }, { "epoch": 0.06613756613756613, "grad_norm": 262.4963212470536, "learning_rate": 3.3052884615384615e-07, "logits/generated": -2.4447712898254395, "logits/real": -2.4399654865264893, "logps/generated": -563.7222290039062, "logps/real": -487.09149169921875, "loss": 0.5795, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.4821799993515015, "rewards/margins": 5.634998321533203, "rewards/real": 4.152817726135254, "step": 550 }, { "epoch": 0.06734006734006734, "grad_norm": 14.40198793340137, "learning_rate": 3.3653846153846154e-07, "logits/generated": -2.446659803390503, "logits/real": -2.4423022270202637, "logps/generated": -510.00048828125, "logps/real": -354.17633056640625, "loss": 0.463, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.6586902141571045, "rewards/margins": 5.03386926651001, "rewards/real": 3.375178098678589, "step": 560 }, { "epoch": 0.06854256854256854, "grad_norm": 322.40706415540495, "learning_rate": 3.4254807692307693e-07, "logits/generated": -2.436397075653076, "logits/real": -2.407928943634033, "logps/generated": -444.75335693359375, "logps/real": -361.814697265625, "loss": 0.5566, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.6426433324813843, "rewards/margins": 5.619287490844727, "rewards/real": 3.9766430854797363, "step": 570 }, { "epoch": 0.06974506974506975, "grad_norm": 520.8071806827575, "learning_rate": 3.4855769230769227e-07, "logits/generated": -2.443695545196533, "logits/real": -2.4213390350341797, "logps/generated": -573.2529296875, "logps/real": -466.0352478027344, "loss": 0.5636, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.9522768259048462, "rewards/margins": 7.255436897277832, "rewards/real": 5.303159713745117, "step": 580 }, { "epoch": 0.07094757094757095, "grad_norm": 136.86662774329525, "learning_rate": 3.545673076923077e-07, "logits/generated": -2.444119930267334, "logits/real": -2.4461493492126465, "logps/generated": -403.935546875, "logps/real": -372.10955810546875, "loss": 0.5627, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.049966335296631, "rewards/margins": 5.5424723625183105, "rewards/real": 3.492506504058838, "step": 590 }, { "epoch": 0.07215007215007214, "grad_norm": 200.39657674377634, "learning_rate": 3.6057692307692306e-07, "logits/generated": -2.4266598224639893, "logits/real": -2.3929033279418945, "logps/generated": -471.447998046875, "logps/real": -349.2568359375, "loss": 0.4766, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.8870925903320312, "rewards/margins": 6.241556167602539, "rewards/real": 3.3544631004333496, "step": 600 }, { "epoch": 0.07335257335257335, "grad_norm": 26.212093486989286, "learning_rate": 3.665865384615384e-07, "logits/generated": -2.537414312362671, "logits/real": -2.522113800048828, "logps/generated": -539.5340576171875, "logps/real": -499.4637756347656, "loss": 0.563, "rewards/accuracies": 0.875, "rewards/generated": 0.5028484463691711, "rewards/margins": 5.832631587982178, "rewards/real": 6.335480213165283, "step": 610 }, { "epoch": 0.07455507455507455, "grad_norm": 288.1085577690171, "learning_rate": 3.7259615384615384e-07, "logits/generated": -2.44157075881958, "logits/real": -2.4534690380096436, "logps/generated": -562.2481689453125, "logps/real": -413.3949279785156, "loss": 0.7212, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.3799520432949066, "rewards/margins": 6.192772388458252, "rewards/real": 5.812820911407471, "step": 620 }, { "epoch": 0.07575757575757576, "grad_norm": 234.3285774174516, "learning_rate": 3.786057692307692e-07, "logits/generated": -2.4359230995178223, "logits/real": -2.4403796195983887, "logps/generated": -473.8140563964844, "logps/real": -376.0264587402344, "loss": 0.5482, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.6209138631820679, "rewards/margins": 6.4328203201293945, "rewards/real": 5.811906337738037, "step": 630 }, { "epoch": 0.07696007696007696, "grad_norm": 607.9712137079911, "learning_rate": 3.8461538461538463e-07, "logits/generated": -2.480332851409912, "logits/real": -2.427781105041504, "logps/generated": -534.2012939453125, "logps/real": -399.5908203125, "loss": 0.5918, "rewards/accuracies": 0.875, "rewards/generated": -0.24489817023277283, "rewards/margins": 5.902339458465576, "rewards/real": 5.657441139221191, "step": 640 }, { "epoch": 0.07816257816257816, "grad_norm": 644.5895679430932, "learning_rate": 3.9062499999999997e-07, "logits/generated": -2.4072418212890625, "logits/real": -2.4051966667175293, "logps/generated": -488.00372314453125, "logps/real": -419.2978515625, "loss": 0.6482, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.9883257746696472, "rewards/margins": 6.993611812591553, "rewards/real": 6.005285739898682, "step": 650 }, { "epoch": 0.07936507936507936, "grad_norm": 165.58063796257375, "learning_rate": 3.9663461538461536e-07, "logits/generated": -2.4074649810791016, "logits/real": -2.4075779914855957, "logps/generated": -429.4495544433594, "logps/real": -338.69903564453125, "loss": 0.4639, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.5352877378463745, "rewards/margins": 5.956145763397217, "rewards/real": 4.4208574295043945, "step": 660 }, { "epoch": 0.08056758056758057, "grad_norm": 470.2298594259089, "learning_rate": 4.0264423076923075e-07, "logits/generated": -2.438861846923828, "logits/real": -2.43005108833313, "logps/generated": -609.919921875, "logps/real": -400.38555908203125, "loss": 0.5084, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.441681146621704, "rewards/margins": 8.445650100708008, "rewards/real": 7.003968715667725, "step": 670 }, { "epoch": 0.08177008177008177, "grad_norm": 231.19690718837484, "learning_rate": 4.0865384615384614e-07, "logits/generated": -2.4505388736724854, "logits/real": -2.4851975440979004, "logps/generated": -523.8932495117188, "logps/real": -337.04669189453125, "loss": 0.5029, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.192518949508667, "rewards/margins": 7.243227958679199, "rewards/real": 5.050708770751953, "step": 680 }, { "epoch": 0.08297258297258298, "grad_norm": 36.955288870395485, "learning_rate": 4.1466346153846153e-07, "logits/generated": -2.4332611560821533, "logits/real": -2.445805072784424, "logps/generated": -447.68951416015625, "logps/real": -366.5887756347656, "loss": 0.5458, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.1135241985321045, "rewards/margins": 5.728610992431641, "rewards/real": 6.842134952545166, "step": 690 }, { "epoch": 0.08417508417508418, "grad_norm": 104.13258883882806, "learning_rate": 4.2067307692307693e-07, "logits/generated": -2.400561809539795, "logits/real": -2.390897512435913, "logps/generated": -420.000732421875, "logps/real": -329.4757385253906, "loss": 0.6707, "rewards/accuracies": 0.824999988079071, "rewards/generated": -0.8484959602355957, "rewards/margins": 5.642020225524902, "rewards/real": 4.793523788452148, "step": 700 }, { "epoch": 0.08537758537758537, "grad_norm": 463.4846438228281, "learning_rate": 4.2668269230769227e-07, "logits/generated": -2.410674571990967, "logits/real": -2.3668198585510254, "logps/generated": -516.2708740234375, "logps/real": -344.43231201171875, "loss": 0.5679, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.931654453277588, "rewards/margins": 8.226637840270996, "rewards/real": 5.294983863830566, "step": 710 }, { "epoch": 0.08658008658008658, "grad_norm": 194.3378900515586, "learning_rate": 4.326923076923077e-07, "logits/generated": -2.4032340049743652, "logits/real": -2.4055473804473877, "logps/generated": -672.94873046875, "logps/real": -361.0772399902344, "loss": 0.5047, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.726757049560547, "rewards/margins": 10.705833435058594, "rewards/real": 5.979075908660889, "step": 720 }, { "epoch": 0.08778258778258778, "grad_norm": 473.11628415499354, "learning_rate": 4.3870192307692305e-07, "logits/generated": -2.441622257232666, "logits/real": -2.4414029121398926, "logps/generated": -554.408447265625, "logps/real": -416.569091796875, "loss": 0.5421, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.7528645992279053, "rewards/margins": 10.26198673248291, "rewards/real": 7.509122371673584, "step": 730 }, { "epoch": 0.08898508898508899, "grad_norm": 593.8514206150782, "learning_rate": 4.447115384615384e-07, "logits/generated": -2.306051731109619, "logits/real": -2.3629348278045654, "logps/generated": -431.8253479003906, "logps/real": -410.6041564941406, "loss": 0.7682, "rewards/accuracies": 0.875, "rewards/generated": -0.5903338193893433, "rewards/margins": 6.22799825668335, "rewards/real": 5.637664794921875, "step": 740 }, { "epoch": 0.09018759018759019, "grad_norm": 559.1956981297295, "learning_rate": 4.5072115384615384e-07, "logits/generated": -2.4053454399108887, "logits/real": -2.429900646209717, "logps/generated": -411.954833984375, "logps/real": -375.6228332519531, "loss": 0.5099, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.7702974677085876, "rewards/margins": 6.885931968688965, "rewards/real": 6.115634918212891, "step": 750 }, { "epoch": 0.0913900913900914, "grad_norm": 208.42536166252188, "learning_rate": 4.567307692307692e-07, "logits/generated": -2.435375928878784, "logits/real": -2.4195473194122314, "logps/generated": -404.2030944824219, "logps/real": -304.415283203125, "loss": 0.5346, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.7665412425994873, "rewards/margins": 7.186232566833496, "rewards/real": 5.419691562652588, "step": 760 }, { "epoch": 0.09259259259259259, "grad_norm": 277.6783880737456, "learning_rate": 4.627403846153846e-07, "logits/generated": -2.446922540664673, "logits/real": -2.4227371215820312, "logps/generated": -464.3440856933594, "logps/real": -328.65765380859375, "loss": 0.4893, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.3294190168380737, "rewards/margins": 8.597673416137695, "rewards/real": 7.26825475692749, "step": 770 }, { "epoch": 0.09379509379509379, "grad_norm": 177.72925006175979, "learning_rate": 4.6874999999999996e-07, "logits/generated": -2.445438861846924, "logits/real": -2.4165053367614746, "logps/generated": -412.7168884277344, "logps/real": -357.92266845703125, "loss": 0.5939, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 0.5890105962753296, "rewards/margins": 6.901103973388672, "rewards/real": 7.490114688873291, "step": 780 }, { "epoch": 0.094997594997595, "grad_norm": 498.9814928270047, "learning_rate": 4.7475961538461535e-07, "logits/generated": -2.508052110671997, "logits/real": -2.463560104370117, "logps/generated": -544.4752197265625, "logps/real": -421.33880615234375, "loss": 0.598, "rewards/accuracies": 0.875, "rewards/generated": -0.3339812755584717, "rewards/margins": 8.22835922241211, "rewards/real": 7.894377708435059, "step": 790 }, { "epoch": 0.0962000962000962, "grad_norm": 177.58519858258708, "learning_rate": 4.807692307692307e-07, "logits/generated": -2.529059886932373, "logits/real": -2.5021815299987793, "logps/generated": -638.894287109375, "logps/real": -456.08636474609375, "loss": 0.6459, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.0881636142730713, "rewards/margins": 9.593210220336914, "rewards/real": 8.505046844482422, "step": 800 }, { "epoch": 0.09740259740259741, "grad_norm": 289.1705831497986, "learning_rate": 4.867788461538461e-07, "logits/generated": -2.4944424629211426, "logits/real": -2.4910571575164795, "logps/generated": -389.064208984375, "logps/real": -338.09130859375, "loss": 0.6605, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.8759121894836426, "rewards/margins": 7.720027923583984, "rewards/real": 4.8441162109375, "step": 810 }, { "epoch": 0.0986050986050986, "grad_norm": 89.71763545171552, "learning_rate": 4.927884615384615e-07, "logits/generated": -2.4910922050476074, "logits/real": -2.512207508087158, "logps/generated": -429.01885986328125, "logps/real": -306.97186279296875, "loss": 0.4738, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.9805129766464233, "rewards/margins": 7.416163444519043, "rewards/real": 5.435650825500488, "step": 820 }, { "epoch": 0.0998075998075998, "grad_norm": 43.180380119194005, "learning_rate": 4.987980769230769e-07, "logits/generated": -2.541921854019165, "logits/real": -2.5123507976531982, "logps/generated": -642.5328979492188, "logps/real": -390.1651611328125, "loss": 0.6895, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.0395376682281494, "rewards/margins": 8.597407341003418, "rewards/real": 7.557869911193848, "step": 830 }, { "epoch": 0.10101010101010101, "grad_norm": 283.019728239531, "learning_rate": 4.994655264564404e-07, "logits/generated": -2.54461407661438, "logits/real": -2.5405166149139404, "logps/generated": -526.7052001953125, "logps/real": -341.8548278808594, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/generated": -1.519399642944336, "rewards/margins": 8.346105575561523, "rewards/real": 6.826704978942871, "step": 840 }, { "epoch": 0.10221260221260221, "grad_norm": 59.705205007363304, "learning_rate": 4.987974345269909e-07, "logits/generated": -2.5890445709228516, "logits/real": -2.5490384101867676, "logps/generated": -526.5341186523438, "logps/real": -378.7647399902344, "loss": 0.4837, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.410961866378784, "rewards/margins": 10.236207008361816, "rewards/real": 7.825244903564453, "step": 850 }, { "epoch": 0.10341510341510342, "grad_norm": 36.879840980165895, "learning_rate": 4.981293425975414e-07, "logits/generated": -2.510080337524414, "logits/real": -2.5209672451019287, "logps/generated": -557.8668212890625, "logps/real": -390.33087158203125, "loss": 0.7064, "rewards/accuracies": 0.824999988079071, "rewards/generated": -1.326934814453125, "rewards/margins": 9.699923515319824, "rewards/real": 8.372990608215332, "step": 860 }, { "epoch": 0.10461760461760462, "grad_norm": 44.31221900322654, "learning_rate": 4.974612506680919e-07, "logits/generated": -2.4854016304016113, "logits/real": -2.5063211917877197, "logps/generated": -390.99456787109375, "logps/real": -264.6806640625, "loss": 0.4735, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.6687812805175781, "rewards/margins": 7.6750898361206055, "rewards/real": 6.006308555603027, "step": 870 }, { "epoch": 0.10582010582010581, "grad_norm": 595.4513964717609, "learning_rate": 4.967931587386424e-07, "logits/generated": -2.4633870124816895, "logits/real": -2.4368672370910645, "logps/generated": -516.665771484375, "logps/real": -359.8690490722656, "loss": 0.8311, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.3331992626190186, "rewards/margins": 7.489238739013672, "rewards/real": 9.822439193725586, "step": 880 }, { "epoch": 0.10702260702260702, "grad_norm": 494.76580483961595, "learning_rate": 4.961250668091929e-07, "logits/generated": -2.3668341636657715, "logits/real": -2.4266374111175537, "logps/generated": -477.02947998046875, "logps/real": -399.01104736328125, "loss": 0.5306, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.31238502264022827, "rewards/margins": 8.317577362060547, "rewards/real": 8.005192756652832, "step": 890 }, { "epoch": 0.10822510822510822, "grad_norm": 227.26978506638594, "learning_rate": 4.954569748797434e-07, "logits/generated": -2.452258586883545, "logits/real": -2.409104108810425, "logps/generated": -344.12030029296875, "logps/real": -279.15472412109375, "loss": 0.4819, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.35809463262557983, "rewards/margins": 8.180088996887207, "rewards/real": 7.821993827819824, "step": 900 }, { "epoch": 0.10942760942760943, "grad_norm": 408.53064920970644, "learning_rate": 4.947888829502939e-07, "logits/generated": -2.5109660625457764, "logits/real": -2.5140318870544434, "logps/generated": -387.4598693847656, "logps/real": -274.79962158203125, "loss": 0.637, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.21372707188129425, "rewards/margins": 9.280416488647461, "rewards/real": 9.494144439697266, "step": 910 }, { "epoch": 0.11063011063011063, "grad_norm": 627.8064977379107, "learning_rate": 4.941207910208444e-07, "logits/generated": -2.43747615814209, "logits/real": -2.4163014888763428, "logps/generated": -411.9353942871094, "logps/real": -286.6072082519531, "loss": 0.6341, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.19150570034980774, "rewards/margins": 7.435213565826416, "rewards/real": 7.6267194747924805, "step": 920 }, { "epoch": 0.11183261183261184, "grad_norm": 129.83205422064785, "learning_rate": 4.93452699091395e-07, "logits/generated": -2.452383279800415, "logits/real": -2.4608356952667236, "logps/generated": -479.47430419921875, "logps/real": -348.01751708984375, "loss": 0.9279, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 3.176835536956787, "rewards/margins": 6.142772674560547, "rewards/real": 9.319608688354492, "step": 930 }, { "epoch": 0.11303511303511303, "grad_norm": 60.642646580294546, "learning_rate": 4.927846071619454e-07, "logits/generated": -2.526930332183838, "logits/real": -2.535792112350464, "logps/generated": -442.809814453125, "logps/real": -283.3125915527344, "loss": 0.4962, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.3951802253723145, "rewards/margins": 11.033866882324219, "rewards/real": 8.638687133789062, "step": 940 }, { "epoch": 0.11423761423761424, "grad_norm": 82.19446737393146, "learning_rate": 4.921165152324959e-07, "logits/generated": -2.5252230167388916, "logits/real": -2.4797539710998535, "logps/generated": -443.1474609375, "logps/real": -261.7357177734375, "loss": 0.7189, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.12446584552526474, "rewards/margins": 7.716545104980469, "rewards/real": 7.841011047363281, "step": 950 }, { "epoch": 0.11544011544011544, "grad_norm": 375.0730409211993, "learning_rate": 4.914484233030464e-07, "logits/generated": -2.524353504180908, "logits/real": -2.4667043685913086, "logps/generated": -524.8092041015625, "logps/real": -352.5014343261719, "loss": 0.8018, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.1281962394714355, "rewards/margins": 7.834773063659668, "rewards/real": 5.706576347351074, "step": 960 }, { "epoch": 0.11664261664261664, "grad_norm": 52.860984681162265, "learning_rate": 4.907803313735969e-07, "logits/generated": -2.5741899013519287, "logits/real": -2.5507147312164307, "logps/generated": -561.8126831054688, "logps/real": -427.07537841796875, "loss": 0.5758, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.574278473854065, "rewards/margins": 10.812956809997559, "rewards/real": 12.387235641479492, "step": 970 }, { "epoch": 0.11784511784511785, "grad_norm": 350.4347742596602, "learning_rate": 4.901122394441475e-07, "logits/generated": -2.5642387866973877, "logits/real": -2.538931131362915, "logps/generated": -459.7955017089844, "logps/real": -340.40509033203125, "loss": 0.5898, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.2329527586698532, "rewards/margins": 11.073533058166504, "rewards/real": 10.840580940246582, "step": 980 }, { "epoch": 0.11904761904761904, "grad_norm": 132.98066032528084, "learning_rate": 4.89444147514698e-07, "logits/generated": -2.4642674922943115, "logits/real": -2.47807240486145, "logps/generated": -505.66522216796875, "logps/real": -333.0113220214844, "loss": 0.6519, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.3832284212112427, "rewards/margins": 9.659668922424316, "rewards/real": 11.042898178100586, "step": 990 }, { "epoch": 0.12025012025012025, "grad_norm": 242.28515976706066, "learning_rate": 4.887760555852485e-07, "logits/generated": -2.436150074005127, "logits/real": -2.366624355316162, "logps/generated": -610.8570556640625, "logps/real": -486.4978942871094, "loss": 0.595, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.764488697052002, "rewards/margins": 11.187214851379395, "rewards/real": 13.951703071594238, "step": 1000 }, { "epoch": 0.12025012025012025, "eval_logits/generated": -2.478092908859253, "eval_logits/real": -2.4647560119628906, "eval_logps/generated": -473.17388916015625, "eval_logps/real": -358.1131896972656, "eval_loss": 0.5124213099479675, "eval_rewards/accuracies": 0.9464285969734192, "eval_rewards/generated": 1.6607673168182373, "eval_rewards/margins": 9.65501880645752, "eval_rewards/real": 11.31578540802002, "eval_runtime": 161.422, "eval_samples_per_second": 6.195, "eval_steps_per_second": 0.52, "step": 1000 }, { "epoch": 0.12145262145262145, "grad_norm": 33.429061743895765, "learning_rate": 4.881079636557991e-07, "logits/generated": -2.500575542449951, "logits/real": -2.4536807537078857, "logps/generated": -388.9678955078125, "logps/real": -366.090576171875, "loss": 0.5454, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.6237608194351196, "rewards/margins": 10.379239082336426, "rewards/real": 9.755477905273438, "step": 1010 }, { "epoch": 0.12265512265512266, "grad_norm": 50.69127825558814, "learning_rate": 4.874398717263496e-07, "logits/generated": -2.435478687286377, "logits/real": -2.4292519092559814, "logps/generated": -407.613037109375, "logps/real": -299.08050537109375, "loss": 0.6124, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.8360468745231628, "rewards/margins": 9.333776473999023, "rewards/real": 8.49772834777832, "step": 1020 }, { "epoch": 0.12385762385762386, "grad_norm": 101.56606026574721, "learning_rate": 4.867717797969001e-07, "logits/generated": -2.3880202770233154, "logits/real": -2.4442012310028076, "logps/generated": -592.423095703125, "logps/real": -419.3992614746094, "loss": 0.5554, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.1970345973968506, "rewards/margins": 14.139823913574219, "rewards/real": 12.942790031433105, "step": 1030 }, { "epoch": 0.12506012506012507, "grad_norm": 356.7800204621778, "learning_rate": 4.861036878674505e-07, "logits/generated": -2.432394504547119, "logits/real": -2.4268717765808105, "logps/generated": -515.2378540039062, "logps/real": -355.1988830566406, "loss": 0.5997, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.030477285385132, "rewards/margins": 11.90191650390625, "rewards/real": 9.871438980102539, "step": 1040 }, { "epoch": 0.12626262626262627, "grad_norm": 81.28241781969085, "learning_rate": 4.85435595938001e-07, "logits/generated": -2.480248212814331, "logits/real": -2.490931987762451, "logps/generated": -531.4738159179688, "logps/real": -348.71099853515625, "loss": 0.5742, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 3.440899610519409, "rewards/margins": 9.678221702575684, "rewards/real": 13.119122505187988, "step": 1050 }, { "epoch": 0.12746512746512748, "grad_norm": 209.72715801690197, "learning_rate": 4.847675040085515e-07, "logits/generated": -2.4914391040802, "logits/real": -2.524989366531372, "logps/generated": -524.429931640625, "logps/real": -325.39263916015625, "loss": 0.6769, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.5767424702644348, "rewards/margins": 11.284751892089844, "rewards/real": 10.708008766174316, "step": 1060 }, { "epoch": 0.12866762866762868, "grad_norm": 119.62875271914363, "learning_rate": 4.84099412079102e-07, "logits/generated": -2.477612018585205, "logits/real": -2.476527452468872, "logps/generated": -435.0867614746094, "logps/real": -314.85333251953125, "loss": 0.6891, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.22685666382312775, "rewards/margins": 10.923724174499512, "rewards/real": 11.150581359863281, "step": 1070 }, { "epoch": 0.12987012987012986, "grad_norm": 22.269697003690823, "learning_rate": 4.834313201496525e-07, "logits/generated": -2.3847670555114746, "logits/real": -2.4150912761688232, "logps/generated": -539.1434326171875, "logps/real": -399.27154541015625, "loss": 0.5388, "rewards/accuracies": 0.949999988079071, "rewards/generated": 2.7289445400238037, "rewards/margins": 11.612767219543457, "rewards/real": 14.341712951660156, "step": 1080 }, { "epoch": 0.13107263107263106, "grad_norm": 130.4099114252631, "learning_rate": 4.827632282202031e-07, "logits/generated": -2.412956953048706, "logits/real": -2.4364120960235596, "logps/generated": -473.836181640625, "logps/real": -327.67926025390625, "loss": 0.7132, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.9893986582756042, "rewards/margins": 13.284649848937988, "rewards/real": 14.2740478515625, "step": 1090 }, { "epoch": 0.13227513227513227, "grad_norm": 214.17196740614938, "learning_rate": 4.820951362907536e-07, "logits/generated": -2.3645195960998535, "logits/real": -2.411306619644165, "logps/generated": -611.4638061523438, "logps/real": -375.39239501953125, "loss": 0.5831, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.057896614074707, "rewards/margins": 13.181696891784668, "rewards/real": 11.123800277709961, "step": 1100 }, { "epoch": 0.13347763347763347, "grad_norm": 585.7892384302444, "learning_rate": 4.814270443613041e-07, "logits/generated": -2.468086004257202, "logits/real": -2.4742960929870605, "logps/generated": -520.3275756835938, "logps/real": -408.871337890625, "loss": 0.6295, "rewards/accuracies": 0.824999988079071, "rewards/generated": 3.207319736480713, "rewards/margins": 12.730597496032715, "rewards/real": 15.937917709350586, "step": 1110 }, { "epoch": 0.13468013468013468, "grad_norm": 39.50209435074947, "learning_rate": 4.807589524318546e-07, "logits/generated": -2.515010356903076, "logits/real": -2.5143516063690186, "logps/generated": -592.912841796875, "logps/real": -406.6743469238281, "loss": 0.6573, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.7588653564453125, "rewards/margins": 15.239355087280273, "rewards/real": 13.480488777160645, "step": 1120 }, { "epoch": 0.13588263588263588, "grad_norm": 20.336657571766736, "learning_rate": 4.800908605024052e-07, "logits/generated": -2.453594923019409, "logits/real": -2.508441925048828, "logps/generated": -562.1450805664062, "logps/real": -344.488525390625, "loss": 0.6144, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.14070944488048553, "rewards/margins": 15.271081924438477, "rewards/real": 15.130373001098633, "step": 1130 }, { "epoch": 0.1370851370851371, "grad_norm": 78.19434756439111, "learning_rate": 4.794227685729556e-07, "logits/generated": -2.454686403274536, "logits/real": -2.470715284347534, "logps/generated": -460.20465087890625, "logps/real": -363.54034423828125, "loss": 0.7764, "rewards/accuracies": 0.925000011920929, "rewards/generated": 5.291678428649902, "rewards/margins": 9.904863357543945, "rewards/real": 15.196542739868164, "step": 1140 }, { "epoch": 0.1382876382876383, "grad_norm": 247.0342336683836, "learning_rate": 4.787546766435061e-07, "logits/generated": -2.302990198135376, "logits/real": -2.364873170852661, "logps/generated": -505.3990173339844, "logps/real": -331.7662048339844, "loss": 0.5708, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.2684504985809326, "rewards/margins": 13.13066291809082, "rewards/real": 9.862212181091309, "step": 1150 }, { "epoch": 0.1394901394901395, "grad_norm": 33.81672212166084, "learning_rate": 4.780865847140566e-07, "logits/generated": -2.3383965492248535, "logits/real": -2.3328897953033447, "logps/generated": -513.9251708984375, "logps/real": -321.45458984375, "loss": 0.6679, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.1338083744049072, "rewards/margins": 12.928006172180176, "rewards/real": 10.794198036193848, "step": 1160 }, { "epoch": 0.1406926406926407, "grad_norm": 140.00271640988865, "learning_rate": 4.774184927846072e-07, "logits/generated": -2.2809154987335205, "logits/real": -2.2834174633026123, "logps/generated": -473.13873291015625, "logps/real": -264.2456359863281, "loss": 0.7149, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.4780304431915283, "rewards/margins": 12.65080738067627, "rewards/real": 10.17277717590332, "step": 1170 }, { "epoch": 0.1418951418951419, "grad_norm": 144.07305793972205, "learning_rate": 4.767504008551577e-07, "logits/generated": -2.2845511436462402, "logits/real": -2.341881513595581, "logps/generated": -494.26715087890625, "logps/real": -308.30810546875, "loss": 0.4847, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.6808719635009766, "rewards/margins": 14.3323335647583, "rewards/real": 10.651460647583008, "step": 1180 }, { "epoch": 0.14309764309764308, "grad_norm": 39.01401304181334, "learning_rate": 4.7608230892570814e-07, "logits/generated": -2.3511962890625, "logits/real": -2.347599983215332, "logps/generated": -477.8648986816406, "logps/real": -268.33685302734375, "loss": 0.6017, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.041712760925293, "rewards/margins": 13.663518905639648, "rewards/real": 8.621807098388672, "step": 1190 }, { "epoch": 0.1443001443001443, "grad_norm": 105.7004920268303, "learning_rate": 4.7541421699625865e-07, "logits/generated": -2.4286952018737793, "logits/real": -2.4655444622039795, "logps/generated": -515.7615966796875, "logps/real": -323.18011474609375, "loss": 0.5312, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.503037691116333, "rewards/margins": 17.060775756835938, "rewards/real": 13.557737350463867, "step": 1200 }, { "epoch": 0.1455026455026455, "grad_norm": 240.55346316701133, "learning_rate": 4.7474612506680915e-07, "logits/generated": -2.3684866428375244, "logits/real": -2.4068591594696045, "logps/generated": -432.70452880859375, "logps/real": -303.25897216796875, "loss": 0.5141, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.26573410630226135, "rewards/margins": 12.277440071105957, "rewards/real": 12.011706352233887, "step": 1210 }, { "epoch": 0.1467051467051467, "grad_norm": 119.35160235135942, "learning_rate": 4.7407803313735966e-07, "logits/generated": -2.324099063873291, "logits/real": -2.3842215538024902, "logps/generated": -676.953857421875, "logps/real": -430.06683349609375, "loss": 0.5652, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.1948742866516113, "rewards/margins": 18.48761749267578, "rewards/real": 15.292742729187012, "step": 1220 }, { "epoch": 0.1479076479076479, "grad_norm": 47.52295374620309, "learning_rate": 4.7340994120791017e-07, "logits/generated": -2.368788242340088, "logits/real": -2.3172709941864014, "logps/generated": -395.877197265625, "logps/real": -179.94818115234375, "loss": 0.5762, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.3225250244140625, "rewards/margins": 10.550348281860352, "rewards/real": 7.2278242111206055, "step": 1230 }, { "epoch": 0.1491101491101491, "grad_norm": 9.593658350756252, "learning_rate": 4.7274184927846067e-07, "logits/generated": -2.19208025932312, "logits/real": -2.206171989440918, "logps/generated": -471.58355712890625, "logps/real": -359.2139892578125, "loss": 0.5611, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 0.07689180225133896, "rewards/margins": 11.707413673400879, "rewards/real": 11.78430461883545, "step": 1240 }, { "epoch": 0.15031265031265031, "grad_norm": 635.9445131020764, "learning_rate": 4.7207375734901123e-07, "logits/generated": -2.27813982963562, "logits/real": -2.3700928688049316, "logps/generated": -531.0651245117188, "logps/real": -286.7881164550781, "loss": 0.957, "rewards/accuracies": 0.875, "rewards/generated": -1.789415955543518, "rewards/margins": 13.39814567565918, "rewards/real": 11.608728408813477, "step": 1250 }, { "epoch": 0.15151515151515152, "grad_norm": 418.0975051018659, "learning_rate": 4.7140566541956174e-07, "logits/generated": -2.273059368133545, "logits/real": -2.3243207931518555, "logps/generated": -449.553466796875, "logps/real": -281.80035400390625, "loss": 0.5858, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.2391071766614914, "rewards/margins": 13.100481986999512, "rewards/real": 13.339590072631836, "step": 1260 }, { "epoch": 0.15271765271765272, "grad_norm": 89.7497286321816, "learning_rate": 4.7073757349011224e-07, "logits/generated": -2.2853188514709473, "logits/real": -2.298370361328125, "logps/generated": -496.208251953125, "logps/real": -326.5314025878906, "loss": 0.5482, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.892930269241333, "rewards/margins": 12.869303703308105, "rewards/real": 10.976374626159668, "step": 1270 }, { "epoch": 0.15392015392015393, "grad_norm": 142.91378486261908, "learning_rate": 4.7006948156066275e-07, "logits/generated": -2.383915901184082, "logits/real": -2.3850746154785156, "logps/generated": -463.39715576171875, "logps/real": -257.097900390625, "loss": 0.4108, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.065344333648682, "rewards/margins": 15.007519721984863, "rewards/real": 8.942174911499023, "step": 1280 }, { "epoch": 0.15512265512265513, "grad_norm": 9.179187293755389, "learning_rate": 4.694013896312132e-07, "logits/generated": -2.3176121711730957, "logits/real": -2.351137638092041, "logps/generated": -428.81890869140625, "logps/real": -280.2019348144531, "loss": 0.6041, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 0.25497040152549744, "rewards/margins": 9.969922065734863, "rewards/real": 10.224891662597656, "step": 1290 }, { "epoch": 0.1563251563251563, "grad_norm": 58.845435926340926, "learning_rate": 4.687332977017637e-07, "logits/generated": -2.3036844730377197, "logits/real": -2.3555920124053955, "logps/generated": -525.302978515625, "logps/real": -343.0516052246094, "loss": 0.5772, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.0582842826843262, "rewards/margins": 13.692483901977539, "rewards/real": 12.634200096130371, "step": 1300 }, { "epoch": 0.15752765752765752, "grad_norm": 116.29820907535108, "learning_rate": 4.680652057723142e-07, "logits/generated": -2.2145986557006836, "logits/real": -2.2836480140686035, "logps/generated": -561.1383056640625, "logps/real": -343.1160583496094, "loss": 0.6855, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.1828441619873047, "rewards/margins": 14.4840726852417, "rewards/real": 11.301229476928711, "step": 1310 }, { "epoch": 0.15873015873015872, "grad_norm": 91.38327282645358, "learning_rate": 4.673971138428647e-07, "logits/generated": -2.2995400428771973, "logits/real": -2.321096181869507, "logps/generated": -475.58349609375, "logps/real": -314.55731201171875, "loss": 0.4606, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.775874376296997, "rewards/margins": 16.249086380004883, "rewards/real": 12.473210334777832, "step": 1320 }, { "epoch": 0.15993265993265993, "grad_norm": 17.17921537999581, "learning_rate": 4.667290219134153e-07, "logits/generated": -2.3252789974212646, "logits/real": -2.320345401763916, "logps/generated": -482.9417419433594, "logps/real": -281.4524841308594, "loss": 0.3806, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.5373896360397339, "rewards/margins": 14.353311538696289, "rewards/real": 12.81592082977295, "step": 1330 }, { "epoch": 0.16113516113516113, "grad_norm": 11.28736641110847, "learning_rate": 4.660609299839658e-07, "logits/generated": -2.3551905155181885, "logits/real": -2.399311065673828, "logps/generated": -479.5247497558594, "logps/real": -274.640380859375, "loss": 0.5912, "rewards/accuracies": 0.949999988079071, "rewards/generated": 2.5349957942962646, "rewards/margins": 11.847904205322266, "rewards/real": 14.382901191711426, "step": 1340 }, { "epoch": 0.16233766233766234, "grad_norm": 60.62685743997119, "learning_rate": 4.653928380545163e-07, "logits/generated": -2.457043170928955, "logits/real": -2.4052481651306152, "logps/generated": -495.21026611328125, "logps/real": -284.0331115722656, "loss": 0.7891, "rewards/accuracies": 0.875, "rewards/generated": 1.9937740564346313, "rewards/margins": 11.762030601501465, "rewards/real": 13.755805969238281, "step": 1350 }, { "epoch": 0.16354016354016354, "grad_norm": 371.39808284910134, "learning_rate": 4.647247461250668e-07, "logits/generated": -2.3548593521118164, "logits/real": -2.326577663421631, "logps/generated": -516.9511108398438, "logps/real": -317.8005065917969, "loss": 0.8024, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.0962762832641602, "rewards/margins": 13.388837814331055, "rewards/real": 12.292562484741211, "step": 1360 }, { "epoch": 0.16474266474266475, "grad_norm": 373.7942551502674, "learning_rate": 4.640566541956173e-07, "logits/generated": -2.395211696624756, "logits/real": -2.429844856262207, "logps/generated": -471.0773010253906, "logps/real": -293.4932861328125, "loss": 0.4601, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.73394775390625, "rewards/margins": 14.701550483703613, "rewards/real": 11.967602729797363, "step": 1370 }, { "epoch": 0.16594516594516595, "grad_norm": 74.08532767749598, "learning_rate": 4.633885622661678e-07, "logits/generated": -2.399756669998169, "logits/real": -2.3803422451019287, "logps/generated": -574.4180297851562, "logps/real": -323.24102783203125, "loss": 0.6522, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.6571858525276184, "rewards/margins": 13.190020561218262, "rewards/real": 12.532835006713867, "step": 1380 }, { "epoch": 0.16714766714766716, "grad_norm": 267.6385400497943, "learning_rate": 4.6272047033671827e-07, "logits/generated": -2.4607558250427246, "logits/real": -2.4937150478363037, "logps/generated": -533.2379150390625, "logps/real": -404.29168701171875, "loss": 0.8347, "rewards/accuracies": 0.925000011920929, "rewards/generated": 6.728078365325928, "rewards/margins": 14.371792793273926, "rewards/real": 21.099870681762695, "step": 1390 }, { "epoch": 0.16835016835016836, "grad_norm": 308.7151767120102, "learning_rate": 4.620523784072688e-07, "logits/generated": -2.536503314971924, "logits/real": -2.4976582527160645, "logps/generated": -545.3577880859375, "logps/real": -431.7715759277344, "loss": 0.6409, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 3.8251757621765137, "rewards/margins": 13.433639526367188, "rewards/real": 17.25881576538086, "step": 1400 }, { "epoch": 0.16955266955266957, "grad_norm": 306.179494954948, "learning_rate": 4.6138428647781933e-07, "logits/generated": -2.4538426399230957, "logits/real": -2.4141111373901367, "logps/generated": -478.7513122558594, "logps/real": -396.04547119140625, "loss": 0.5104, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.485698699951172, "rewards/margins": 13.679600715637207, "rewards/real": 16.165298461914062, "step": 1410 }, { "epoch": 0.17075517075517074, "grad_norm": 496.13565710543406, "learning_rate": 4.6071619454836984e-07, "logits/generated": -2.386244773864746, "logits/real": -2.398043632507324, "logps/generated": -482.45526123046875, "logps/real": -345.71978759765625, "loss": 0.5504, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 3.6164448261260986, "rewards/margins": 8.972063064575195, "rewards/real": 12.588507652282715, "step": 1420 }, { "epoch": 0.17195767195767195, "grad_norm": 7.012010018130877, "learning_rate": 4.6004810261892035e-07, "logits/generated": -2.3383326530456543, "logits/real": -2.3502814769744873, "logps/generated": -401.1824951171875, "logps/real": -345.9866943359375, "loss": 0.6135, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.4917720854282379, "rewards/margins": 10.542943954467773, "rewards/real": 11.03471565246582, "step": 1430 }, { "epoch": 0.17316017316017315, "grad_norm": 893.0870988980764, "learning_rate": 4.5938001068947085e-07, "logits/generated": -2.4274826049804688, "logits/real": -2.432913303375244, "logps/generated": -517.9597778320312, "logps/real": -382.17681884765625, "loss": 0.779, "rewards/accuracies": 0.824999988079071, "rewards/generated": 4.715818881988525, "rewards/margins": 8.692477226257324, "rewards/real": 13.408296585083008, "step": 1440 }, { "epoch": 0.17436267436267436, "grad_norm": 428.45989043129157, "learning_rate": 4.5871191876002136e-07, "logits/generated": -2.4645206928253174, "logits/real": -2.4560036659240723, "logps/generated": -493.51397705078125, "logps/real": -310.2508239746094, "loss": 0.5581, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.16840839385986328, "rewards/margins": 15.603052139282227, "rewards/real": 15.434643745422363, "step": 1450 }, { "epoch": 0.17556517556517556, "grad_norm": 50.85431689497447, "learning_rate": 4.5804382683057187e-07, "logits/generated": -2.435072660446167, "logits/real": -2.456373929977417, "logps/generated": -469.07269287109375, "logps/real": -414.0575256347656, "loss": 0.7517, "rewards/accuracies": 0.925000011920929, "rewards/generated": 4.06419563293457, "rewards/margins": 12.056317329406738, "rewards/real": 16.120512008666992, "step": 1460 }, { "epoch": 0.17676767676767677, "grad_norm": 14.883022568834077, "learning_rate": 4.573757349011224e-07, "logits/generated": -2.333134889602661, "logits/real": -2.330747604370117, "logps/generated": -464.7652282714844, "logps/real": -231.93667602539062, "loss": 0.5558, "rewards/accuracies": 0.875, "rewards/generated": 0.14628782868385315, "rewards/margins": 11.229342460632324, "rewards/real": 11.375631332397461, "step": 1470 }, { "epoch": 0.17797017797017797, "grad_norm": 196.79652566238005, "learning_rate": 4.567076429716729e-07, "logits/generated": -2.3200600147247314, "logits/real": -2.396704912185669, "logps/generated": -432.6995544433594, "logps/real": -338.466064453125, "loss": 0.6584, "rewards/accuracies": 0.949999988079071, "rewards/generated": 1.0237276554107666, "rewards/margins": 13.672329902648926, "rewards/real": 14.69605827331543, "step": 1480 }, { "epoch": 0.17917267917267918, "grad_norm": 589.2611389956969, "learning_rate": 4.5603955104222344e-07, "logits/generated": -2.3034911155700684, "logits/real": -2.2840018272399902, "logps/generated": -410.0265197753906, "logps/real": -288.2999267578125, "loss": 0.84, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.6751577854156494, "rewards/margins": 11.684983253479004, "rewards/real": 11.009824752807617, "step": 1490 }, { "epoch": 0.18037518037518038, "grad_norm": 25.104466466994783, "learning_rate": 4.553714591127739e-07, "logits/generated": -2.250640869140625, "logits/real": -2.3452303409576416, "logps/generated": -366.4850158691406, "logps/real": -303.73175048828125, "loss": 0.6159, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.013589382171631, "rewards/margins": 16.16395378112793, "rewards/real": 12.15036392211914, "step": 1500 }, { "epoch": 0.1815776815776816, "grad_norm": 55.12333155338489, "learning_rate": 4.547033671833244e-07, "logits/generated": -2.3714919090270996, "logits/real": -2.378178119659424, "logps/generated": -513.4163818359375, "logps/real": -332.26312255859375, "loss": 0.6123, "rewards/accuracies": 0.875, "rewards/generated": 3.348322629928589, "rewards/margins": 12.414596557617188, "rewards/real": 15.762918472290039, "step": 1510 }, { "epoch": 0.1827801827801828, "grad_norm": 768.6271488170355, "learning_rate": 4.540352752538749e-07, "logits/generated": -2.3710010051727295, "logits/real": -2.3883090019226074, "logps/generated": -571.8932495117188, "logps/real": -419.20172119140625, "loss": 0.4591, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.354491710662842, "rewards/margins": 16.200727462768555, "rewards/real": 18.555217742919922, "step": 1520 }, { "epoch": 0.18398268398268397, "grad_norm": 314.7097682368379, "learning_rate": 4.533671833244254e-07, "logits/generated": -2.379059076309204, "logits/real": -2.423083782196045, "logps/generated": -562.8613891601562, "logps/real": -310.7023620605469, "loss": 0.5682, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 3.716447353363037, "rewards/margins": 12.8453369140625, "rewards/real": 16.561786651611328, "step": 1530 }, { "epoch": 0.18518518518518517, "grad_norm": 41.10200461760403, "learning_rate": 4.526990913949759e-07, "logits/generated": -2.3485469818115234, "logits/real": -2.346954822540283, "logps/generated": -541.9116821289062, "logps/real": -284.74383544921875, "loss": 0.6131, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 0.33158206939697266, "rewards/margins": 12.98466682434082, "rewards/real": 13.316247940063477, "step": 1540 }, { "epoch": 0.18638768638768638, "grad_norm": 497.39061404702255, "learning_rate": 4.520309994655264e-07, "logits/generated": -2.3814096450805664, "logits/real": -2.3579282760620117, "logps/generated": -413.19970703125, "logps/real": -272.5608215332031, "loss": 0.5324, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.7178661823272705, "rewards/margins": 11.779719352722168, "rewards/real": 14.497584342956543, "step": 1550 }, { "epoch": 0.18759018759018758, "grad_norm": 429.25026650701125, "learning_rate": 4.5136290753607693e-07, "logits/generated": -2.4310972690582275, "logits/real": -2.4227535724639893, "logps/generated": -419.1544494628906, "logps/real": -263.32415771484375, "loss": 0.671, "rewards/accuracies": 0.824999988079071, "rewards/generated": 3.222504138946533, "rewards/margins": 11.070513725280762, "rewards/real": 14.29301643371582, "step": 1560 }, { "epoch": 0.1887926887926888, "grad_norm": 49.50677819796209, "learning_rate": 4.506948156066275e-07, "logits/generated": -2.4903347492218018, "logits/real": -2.4775195121765137, "logps/generated": -466.87957763671875, "logps/real": -298.753173828125, "loss": 0.8774, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.608281135559082, "rewards/margins": 12.107540130615234, "rewards/real": 14.715822219848633, "step": 1570 }, { "epoch": 0.18999518999519, "grad_norm": 519.2203790711026, "learning_rate": 4.50026723677178e-07, "logits/generated": -2.4701426029205322, "logits/real": -2.4812369346618652, "logps/generated": -461.80181884765625, "logps/real": -288.8653564453125, "loss": 0.5857, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.4484314918518066, "rewards/margins": 12.156876564025879, "rewards/real": 14.605308532714844, "step": 1580 }, { "epoch": 0.1911976911976912, "grad_norm": 25.36251599233081, "learning_rate": 4.493586317477285e-07, "logits/generated": -2.4882984161376953, "logits/real": -2.5147907733917236, "logps/generated": -571.2619018554688, "logps/real": -326.4120178222656, "loss": 0.5151, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.17659726738929749, "rewards/margins": 16.33140754699707, "rewards/real": 16.50800323486328, "step": 1590 }, { "epoch": 0.1924001924001924, "grad_norm": 24.19702687293738, "learning_rate": 4.4869053981827896e-07, "logits/generated": -2.4557952880859375, "logits/real": -2.5277934074401855, "logps/generated": -498.01849365234375, "logps/real": -229.16592407226562, "loss": 0.4103, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.9251322746276855, "rewards/margins": 15.357412338256836, "rewards/real": 11.432281494140625, "step": 1600 }, { "epoch": 0.1936026936026936, "grad_norm": 763.9680269490647, "learning_rate": 4.4802244788882946e-07, "logits/generated": -2.4854238033294678, "logits/real": -2.4586923122406006, "logps/generated": -394.8902282714844, "logps/real": -201.8122100830078, "loss": 0.6123, "rewards/accuracies": 0.824999988079071, "rewards/generated": 2.92989444732666, "rewards/margins": 9.340131759643555, "rewards/real": 12.270027160644531, "step": 1610 }, { "epoch": 0.19480519480519481, "grad_norm": 96.0981867685106, "learning_rate": 4.4735435595937997e-07, "logits/generated": -2.4385573863983154, "logits/real": -2.4143004417419434, "logps/generated": -377.39617919921875, "logps/real": -227.68789672851562, "loss": 0.5976, "rewards/accuracies": 0.824999988079071, "rewards/generated": -0.8079854846000671, "rewards/margins": 11.13033676147461, "rewards/real": 10.322351455688477, "step": 1620 }, { "epoch": 0.19600769600769602, "grad_norm": 113.30707926581165, "learning_rate": 4.466862640299305e-07, "logits/generated": -2.4770843982696533, "logits/real": -2.4692885875701904, "logps/generated": -481.06878662109375, "logps/real": -274.21343994140625, "loss": 0.4434, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.1712679862976074, "rewards/margins": 11.480100631713867, "rewards/real": 13.651369094848633, "step": 1630 }, { "epoch": 0.1972101972101972, "grad_norm": 87.07106570030153, "learning_rate": 4.46018172100481e-07, "logits/generated": -2.4080023765563965, "logits/real": -2.4215290546417236, "logps/generated": -470.46026611328125, "logps/real": -295.76507568359375, "loss": 0.648, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 4.108889102935791, "rewards/margins": 12.166716575622559, "rewards/real": 16.275604248046875, "step": 1640 }, { "epoch": 0.1984126984126984, "grad_norm": 41.39049774073744, "learning_rate": 4.4535008017103154e-07, "logits/generated": -2.455228328704834, "logits/real": -2.4480817317962646, "logps/generated": -411.90057373046875, "logps/real": -256.37957763671875, "loss": 0.6725, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.38176360726356506, "rewards/margins": 12.624078750610352, "rewards/real": 13.005842208862305, "step": 1650 }, { "epoch": 0.1996151996151996, "grad_norm": 74.85093751786184, "learning_rate": 4.4468198824158205e-07, "logits/generated": -2.4727237224578857, "logits/real": -2.4917914867401123, "logps/generated": -484.3727111816406, "logps/real": -301.02545166015625, "loss": 0.611, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.7268470525741577, "rewards/margins": 12.012619018554688, "rewards/real": 13.739466667175293, "step": 1660 }, { "epoch": 0.2008177008177008, "grad_norm": 73.04979535106028, "learning_rate": 4.4401389631213256e-07, "logits/generated": -2.4775261878967285, "logits/real": -2.4395952224731445, "logps/generated": -456.0423278808594, "logps/real": -281.8722839355469, "loss": 0.6198, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.6114019155502319, "rewards/margins": 16.019010543823242, "rewards/real": 15.407610893249512, "step": 1670 }, { "epoch": 0.20202020202020202, "grad_norm": 74.86241887073881, "learning_rate": 4.4334580438268306e-07, "logits/generated": -2.4175937175750732, "logits/real": -2.3863775730133057, "logps/generated": -425.3651428222656, "logps/real": -240.4586944580078, "loss": 0.4477, "rewards/accuracies": 1.0, "rewards/generated": -0.2837601602077484, "rewards/margins": 14.158716201782227, "rewards/real": 13.874957084655762, "step": 1680 }, { "epoch": 0.20322270322270322, "grad_norm": 572.3239272700722, "learning_rate": 4.4267771245323357e-07, "logits/generated": -2.423804759979248, "logits/real": -2.4141461849212646, "logps/generated": -392.3048095703125, "logps/real": -300.6315002441406, "loss": 0.583, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.03982771560549736, "rewards/margins": 13.115669250488281, "rewards/real": 13.155497550964355, "step": 1690 }, { "epoch": 0.20442520442520443, "grad_norm": 228.8164207304763, "learning_rate": 4.42009620523784e-07, "logits/generated": -2.3679518699645996, "logits/real": -2.4284584522247314, "logps/generated": -534.503173828125, "logps/real": -310.5823974609375, "loss": 0.6521, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 2.6827309131622314, "rewards/margins": 12.820253372192383, "rewards/real": 15.502985000610352, "step": 1700 }, { "epoch": 0.20562770562770563, "grad_norm": 217.21778042822632, "learning_rate": 4.4134152859433453e-07, "logits/generated": -2.241626739501953, "logits/real": -2.2955174446105957, "logps/generated": -435.6639709472656, "logps/real": -250.5611572265625, "loss": 0.6091, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.5427447557449341, "rewards/margins": 12.22032642364502, "rewards/real": 11.677579879760742, "step": 1710 }, { "epoch": 0.20683020683020684, "grad_norm": 71.02987427794781, "learning_rate": 4.4067343666488503e-07, "logits/generated": -2.2914438247680664, "logits/real": -2.3038229942321777, "logps/generated": -504.0199279785156, "logps/real": -340.43194580078125, "loss": 0.4417, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.6536057591438293, "rewards/margins": 16.69736099243164, "rewards/real": 17.35096549987793, "step": 1720 }, { "epoch": 0.20803270803270804, "grad_norm": 245.7445291713791, "learning_rate": 4.400053447354356e-07, "logits/generated": -2.354158401489258, "logits/real": -2.3235185146331787, "logps/generated": -542.0653076171875, "logps/real": -333.35601806640625, "loss": 0.4537, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.0518426895141602, "rewards/margins": 16.70272445678711, "rewards/real": 15.65088176727295, "step": 1730 }, { "epoch": 0.20923520923520925, "grad_norm": 331.9096392613902, "learning_rate": 4.393372528059861e-07, "logits/generated": -2.260397434234619, "logits/real": -2.321648359298706, "logps/generated": -601.6236572265625, "logps/real": -353.7864990234375, "loss": 0.5179, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.30995121598243713, "rewards/margins": 15.769620895385742, "rewards/real": 15.459671020507812, "step": 1740 }, { "epoch": 0.21043771043771045, "grad_norm": 21.281642905190886, "learning_rate": 4.386691608765366e-07, "logits/generated": -2.2410879135131836, "logits/real": -2.2713370323181152, "logps/generated": -468.92987060546875, "logps/real": -259.9978332519531, "loss": 0.4654, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.6135820150375366, "rewards/margins": 13.631518363952637, "rewards/real": 12.017935752868652, "step": 1750 }, { "epoch": 0.21164021164021163, "grad_norm": 320.62046619878, "learning_rate": 4.380010689470871e-07, "logits/generated": -2.295548677444458, "logits/real": -2.3581936359405518, "logps/generated": -517.8612060546875, "logps/real": -268.38616943359375, "loss": 0.4489, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.6024210453033447, "rewards/margins": 16.50004005432129, "rewards/real": 13.897619247436523, "step": 1760 }, { "epoch": 0.21284271284271283, "grad_norm": 233.75139428508683, "learning_rate": 4.373329770176376e-07, "logits/generated": -2.3198766708374023, "logits/real": -2.3995108604431152, "logps/generated": -511.88720703125, "logps/real": -391.0545349121094, "loss": 0.5041, "rewards/accuracies": 0.949999988079071, "rewards/generated": 5.2740559577941895, "rewards/margins": 16.216516494750977, "rewards/real": 21.49057388305664, "step": 1770 }, { "epoch": 0.21404521404521404, "grad_norm": 18.52303496346386, "learning_rate": 4.3666488508818813e-07, "logits/generated": -2.1872031688690186, "logits/real": -2.221298933029175, "logps/generated": -482.6136779785156, "logps/real": -300.1002197265625, "loss": 1.0096, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.4327448010444641, "rewards/margins": 12.359970092773438, "rewards/real": 11.927223205566406, "step": 1780 }, { "epoch": 0.21524771524771524, "grad_norm": 157.91598537412438, "learning_rate": 4.3599679315873863e-07, "logits/generated": -2.2624454498291016, "logits/real": -2.293370246887207, "logps/generated": -544.6771850585938, "logps/real": -346.1015625, "loss": 0.4924, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.7577619552612305, "rewards/margins": 14.797775268554688, "rewards/real": 15.555536270141602, "step": 1790 }, { "epoch": 0.21645021645021645, "grad_norm": 25.268422973955733, "learning_rate": 4.353287012292891e-07, "logits/generated": -2.3029236793518066, "logits/real": -2.311941146850586, "logps/generated": -491.8525390625, "logps/real": -256.67156982421875, "loss": 0.4484, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.2190592288970947, "rewards/margins": 15.085932731628418, "rewards/real": 12.86687183380127, "step": 1800 }, { "epoch": 0.21765271765271765, "grad_norm": 380.27991069500246, "learning_rate": 4.3466060929983965e-07, "logits/generated": -2.2758796215057373, "logits/real": -2.3383190631866455, "logps/generated": -425.18389892578125, "logps/real": -309.69171142578125, "loss": 0.6675, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.514988660812378, "rewards/margins": 14.734872817993164, "rewards/real": 17.249860763549805, "step": 1810 }, { "epoch": 0.21885521885521886, "grad_norm": 12.522879331918343, "learning_rate": 4.3399251737039015e-07, "logits/generated": -2.2408108711242676, "logits/real": -2.2868175506591797, "logps/generated": -391.3513488769531, "logps/real": -247.9014892578125, "loss": 0.5494, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.6192407608032227, "rewards/margins": 14.945060729980469, "rewards/real": 12.325821876525879, "step": 1820 }, { "epoch": 0.22005772005772006, "grad_norm": 430.19118319914594, "learning_rate": 4.3332442544094066e-07, "logits/generated": -2.343614101409912, "logits/real": -2.356734275817871, "logps/generated": -483.753173828125, "logps/real": -279.4593200683594, "loss": 0.7199, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.6338332891464233, "rewards/margins": 14.713136672973633, "rewards/real": 16.346969604492188, "step": 1830 }, { "epoch": 0.22126022126022127, "grad_norm": 17.347325299062064, "learning_rate": 4.3265633351149117e-07, "logits/generated": -2.308915853500366, "logits/real": -2.350360631942749, "logps/generated": -478.40423583984375, "logps/real": -358.20611572265625, "loss": 0.649, "rewards/accuracies": 1.0, "rewards/generated": 4.020327091217041, "rewards/margins": 16.09906578063965, "rewards/real": 20.1193904876709, "step": 1840 }, { "epoch": 0.22246272246272247, "grad_norm": 407.4563009923528, "learning_rate": 4.3198824158204167e-07, "logits/generated": -2.3142600059509277, "logits/real": -2.300481081008911, "logps/generated": -461.16650390625, "logps/real": -245.1242218017578, "loss": 0.5186, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.1521506309509277, "rewards/margins": 14.651391983032227, "rewards/real": 11.499241828918457, "step": 1850 }, { "epoch": 0.22366522366522368, "grad_norm": 696.4123601790546, "learning_rate": 4.313201496525922e-07, "logits/generated": -2.2123773097991943, "logits/real": -2.3065268993377686, "logps/generated": -435.8077087402344, "logps/real": -245.79104614257812, "loss": 0.8668, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.2253907918930054, "rewards/margins": 12.305874824523926, "rewards/real": 13.53126335144043, "step": 1860 }, { "epoch": 0.22486772486772486, "grad_norm": 79.30742150180409, "learning_rate": 4.306520577231427e-07, "logits/generated": -2.1671016216278076, "logits/real": -2.1830508708953857, "logps/generated": -525.7474365234375, "logps/real": -286.37158203125, "loss": 0.508, "rewards/accuracies": 0.875, "rewards/generated": -2.8406291007995605, "rewards/margins": 16.074737548828125, "rewards/real": 13.234106063842773, "step": 1870 }, { "epoch": 0.22607022607022606, "grad_norm": 85.42810821033221, "learning_rate": 4.299839657936932e-07, "logits/generated": -2.221487045288086, "logits/real": -2.212695837020874, "logps/generated": -532.984130859375, "logps/real": -257.0509033203125, "loss": 0.5246, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.5567172169685364, "rewards/margins": 14.513051986694336, "rewards/real": 13.956336975097656, "step": 1880 }, { "epoch": 0.22727272727272727, "grad_norm": 64.49347505838848, "learning_rate": 4.2931587386424375e-07, "logits/generated": -2.329141855239868, "logits/real": -2.3000168800354004, "logps/generated": -464.63287353515625, "logps/real": -294.87652587890625, "loss": 0.5232, "rewards/accuracies": 0.949999988079071, "rewards/generated": 4.820662021636963, "rewards/margins": 13.737935066223145, "rewards/real": 18.558597564697266, "step": 1890 }, { "epoch": 0.22847522847522847, "grad_norm": 657.9420294202405, "learning_rate": 4.286477819347942e-07, "logits/generated": -2.2628135681152344, "logits/real": -2.280303955078125, "logps/generated": -361.05511474609375, "logps/real": -195.9595947265625, "loss": 0.9089, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.6073031425476074, "rewards/margins": 12.64094352722168, "rewards/real": 12.03364086151123, "step": 1900 }, { "epoch": 0.22967772967772968, "grad_norm": 329.61710458929105, "learning_rate": 4.279796900053447e-07, "logits/generated": -2.2633259296417236, "logits/real": -2.3366036415100098, "logps/generated": -435.8863830566406, "logps/real": -282.08587646484375, "loss": 0.7455, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.6685137748718262, "rewards/margins": 15.345560073852539, "rewards/real": 16.01407241821289, "step": 1910 }, { "epoch": 0.23088023088023088, "grad_norm": 10.871694392692595, "learning_rate": 4.273115980758952e-07, "logits/generated": -2.227038860321045, "logits/real": -2.2521004676818848, "logps/generated": -513.7752685546875, "logps/real": -307.07830810546875, "loss": 0.6906, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.7155650854110718, "rewards/margins": 15.317960739135742, "rewards/real": 17.033527374267578, "step": 1920 }, { "epoch": 0.23208273208273208, "grad_norm": 34.03969667999832, "learning_rate": 4.266435061464457e-07, "logits/generated": -2.1800999641418457, "logits/real": -2.2404844760894775, "logps/generated": -559.51123046875, "logps/real": -282.93316650390625, "loss": 0.3882, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.813613176345825, "rewards/margins": 18.474063873291016, "rewards/real": 14.660449028015137, "step": 1930 }, { "epoch": 0.2332852332852333, "grad_norm": 580.2052114089759, "learning_rate": 4.2597541421699623e-07, "logits/generated": -2.2422163486480713, "logits/real": -2.31141996383667, "logps/generated": -539.1412353515625, "logps/real": -268.7283630371094, "loss": 0.5868, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.737396240234375, "rewards/margins": 13.86515998840332, "rewards/real": 15.602557182312012, "step": 1940 }, { "epoch": 0.2344877344877345, "grad_norm": 15.896789915736347, "learning_rate": 4.2530732228754674e-07, "logits/generated": -2.2258827686309814, "logits/real": -2.1926958560943604, "logps/generated": -467.52166748046875, "logps/real": -212.1752471923828, "loss": 0.7112, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.5906529426574707, "rewards/margins": 13.30827808380127, "rewards/real": 10.717625617980957, "step": 1950 }, { "epoch": 0.2356902356902357, "grad_norm": 106.83104345911345, "learning_rate": 4.2463923035809724e-07, "logits/generated": -2.2099413871765137, "logits/real": -2.250890016555786, "logps/generated": -452.8672790527344, "logps/real": -339.02288818359375, "loss": 0.4297, "rewards/accuracies": 0.875, "rewards/generated": 5.544894695281982, "rewards/margins": 11.915275573730469, "rewards/real": 17.46017074584961, "step": 1960 }, { "epoch": 0.2368927368927369, "grad_norm": 498.3981378201728, "learning_rate": 4.239711384286478e-07, "logits/generated": -2.246563196182251, "logits/real": -2.2958579063415527, "logps/generated": -578.3214111328125, "logps/real": -343.76971435546875, "loss": 0.5417, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 3.9790565967559814, "rewards/margins": 16.92190170288086, "rewards/real": 20.900959014892578, "step": 1970 }, { "epoch": 0.23809523809523808, "grad_norm": 30.15345699954383, "learning_rate": 4.233030464991983e-07, "logits/generated": -2.2259361743927, "logits/real": -2.294032573699951, "logps/generated": -525.3324584960938, "logps/real": -358.31097412109375, "loss": 0.6253, "rewards/accuracies": 1.0, "rewards/generated": 1.466997742652893, "rewards/margins": 17.65773582458496, "rewards/real": 19.12473487854004, "step": 1980 }, { "epoch": 0.2392977392977393, "grad_norm": 53.102967891588364, "learning_rate": 4.226349545697488e-07, "logits/generated": -2.211698055267334, "logits/real": -2.2531678676605225, "logps/generated": -521.4146118164062, "logps/real": -284.21527099609375, "loss": 0.5967, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.092949628829956, "rewards/margins": 16.75921630859375, "rewards/real": 14.666265487670898, "step": 1990 }, { "epoch": 0.2405002405002405, "grad_norm": 22.240501548749336, "learning_rate": 4.2196686264029927e-07, "logits/generated": -2.1727306842803955, "logits/real": -2.192847490310669, "logps/generated": -502.5591735839844, "logps/real": -300.94757080078125, "loss": 0.6451, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.1510093212127686, "rewards/margins": 16.0479679107666, "rewards/real": 14.89695930480957, "step": 2000 }, { "epoch": 0.2405002405002405, "eval_logits/generated": -2.2453532218933105, "eval_logits/real": -2.289402961730957, "eval_logps/generated": -467.9886169433594, "eval_logps/real": -300.9576110839844, "eval_loss": 0.4696206748485565, "eval_rewards/accuracies": 0.961309552192688, "eval_rewards/generated": 2.1792919635772705, "eval_rewards/margins": 14.852046012878418, "eval_rewards/real": 17.031339645385742, "eval_runtime": 159.2924, "eval_samples_per_second": 6.278, "eval_steps_per_second": 0.527, "step": 2000 }, { "epoch": 0.2417027417027417, "grad_norm": 498.25699198056566, "learning_rate": 4.212987707108498e-07, "logits/generated": -2.235767364501953, "logits/real": -2.296876907348633, "logps/generated": -465.62615966796875, "logps/real": -289.1161804199219, "loss": 0.6497, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 4.580079555511475, "rewards/margins": 13.837247848510742, "rewards/real": 18.417327880859375, "step": 2010 }, { "epoch": 0.2429052429052429, "grad_norm": 108.4784891447063, "learning_rate": 4.206306787814003e-07, "logits/generated": -2.1960015296936035, "logits/real": -2.2037336826324463, "logps/generated": -411.76739501953125, "logps/real": -199.02542114257812, "loss": 0.4661, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.699652910232544, "rewards/margins": 13.909741401672363, "rewards/real": 12.210088729858398, "step": 2020 }, { "epoch": 0.2441077441077441, "grad_norm": 20.078751268669887, "learning_rate": 4.199625868519508e-07, "logits/generated": -2.1672139167785645, "logits/real": -2.26566481590271, "logps/generated": -466.12799072265625, "logps/real": -307.26123046875, "loss": 0.6044, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.63902747631073, "rewards/margins": 16.171306610107422, "rewards/real": 16.810333251953125, "step": 2030 }, { "epoch": 0.2453102453102453, "grad_norm": 397.60746174814227, "learning_rate": 4.192944949225013e-07, "logits/generated": -2.186650276184082, "logits/real": -2.2666990756988525, "logps/generated": -478.31884765625, "logps/real": -299.16778564453125, "loss": 0.6022, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.120175361633301, "rewards/margins": 17.342470169067383, "rewards/real": 19.462646484375, "step": 2040 }, { "epoch": 0.24651274651274652, "grad_norm": 7.458043613749171, "learning_rate": 4.1862640299305185e-07, "logits/generated": -2.1063156127929688, "logits/real": -2.2582225799560547, "logps/generated": -453.861328125, "logps/real": -266.7242126464844, "loss": 0.4791, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.4714561700820923, "rewards/margins": 16.04201889038086, "rewards/real": 14.570562362670898, "step": 2050 }, { "epoch": 0.24771524771524772, "grad_norm": 22.85137487751545, "learning_rate": 4.1795831106360236e-07, "logits/generated": -2.164400815963745, "logits/real": -2.239060163497925, "logps/generated": -549.2992553710938, "logps/real": -300.09600830078125, "loss": 0.4671, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.7945098876953125, "rewards/margins": 19.4163761138916, "rewards/real": 15.621866226196289, "step": 2060 }, { "epoch": 0.24891774891774893, "grad_norm": 158.87412496144168, "learning_rate": 4.1729021913415287e-07, "logits/generated": -2.260401964187622, "logits/real": -2.3259756565093994, "logps/generated": -485.48614501953125, "logps/real": -280.7210693359375, "loss": 0.8281, "rewards/accuracies": 0.925000011920929, "rewards/generated": 4.852581977844238, "rewards/margins": 12.515386581420898, "rewards/real": 17.367969512939453, "step": 2070 }, { "epoch": 0.25012025012025013, "grad_norm": 488.5382133493573, "learning_rate": 4.1662212720470337e-07, "logits/generated": -2.1770377159118652, "logits/real": -2.2206311225891113, "logps/generated": -531.7283935546875, "logps/real": -331.6108703613281, "loss": 0.4937, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.05897097662091255, "rewards/margins": 16.59484100341797, "rewards/real": 16.653812408447266, "step": 2080 }, { "epoch": 0.25132275132275134, "grad_norm": 78.167027168952, "learning_rate": 4.159540352752539e-07, "logits/generated": -2.1663780212402344, "logits/real": -2.2352824211120605, "logps/generated": -455.36163330078125, "logps/real": -214.59616088867188, "loss": 0.3855, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.8598149418830872, "rewards/margins": 14.810043334960938, "rewards/real": 13.950228691101074, "step": 2090 }, { "epoch": 0.25252525252525254, "grad_norm": 982.4700543733867, "learning_rate": 4.1528594334580433e-07, "logits/generated": -2.289685010910034, "logits/real": -2.3377137184143066, "logps/generated": -469.47088623046875, "logps/real": -281.41229248046875, "loss": 0.6261, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.8475968241691589, "rewards/margins": 16.942211151123047, "rewards/real": 17.78980827331543, "step": 2100 }, { "epoch": 0.25372775372775375, "grad_norm": 306.0092872930662, "learning_rate": 4.1461785141635484e-07, "logits/generated": -2.2969069480895996, "logits/real": -2.3424816131591797, "logps/generated": -445.5846252441406, "logps/real": -279.9046325683594, "loss": 0.5521, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.848059892654419, "rewards/margins": 14.768579483032227, "rewards/real": 16.61663818359375, "step": 2110 }, { "epoch": 0.25493025493025495, "grad_norm": 167.629245540347, "learning_rate": 4.1394975948690535e-07, "logits/generated": -2.26259183883667, "logits/real": -2.282552719116211, "logps/generated": -396.5188903808594, "logps/real": -249.4000701904297, "loss": 0.6949, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.0532415397465229, "rewards/margins": 14.1654634475708, "rewards/real": 14.112218856811523, "step": 2120 }, { "epoch": 0.25613275613275616, "grad_norm": 27.881230266471583, "learning_rate": 4.1328166755745585e-07, "logits/generated": -2.260697841644287, "logits/real": -2.3000593185424805, "logps/generated": -442.90057373046875, "logps/real": -238.9254913330078, "loss": 0.4759, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.20674467086792, "rewards/margins": 16.25245475769043, "rewards/real": 15.045707702636719, "step": 2130 }, { "epoch": 0.25733525733525736, "grad_norm": 404.60303723706244, "learning_rate": 4.126135756280064e-07, "logits/generated": -2.3268682956695557, "logits/real": -2.3540945053100586, "logps/generated": -486.217041015625, "logps/real": -275.4425354003906, "loss": 0.5313, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 1.3409477472305298, "rewards/margins": 15.182576179504395, "rewards/real": 16.523523330688477, "step": 2140 }, { "epoch": 0.2585377585377585, "grad_norm": 1205.735848922581, "learning_rate": 4.119454836985569e-07, "logits/generated": -2.343628406524658, "logits/real": -2.4291324615478516, "logps/generated": -587.6064453125, "logps/real": -310.84063720703125, "loss": 0.9305, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.1078176498413086, "rewards/margins": 20.106380462646484, "rewards/real": 18.998563766479492, "step": 2150 }, { "epoch": 0.2597402597402597, "grad_norm": 764.8068203898779, "learning_rate": 4.112773917691074e-07, "logits/generated": -2.2855494022369385, "logits/real": -2.4112563133239746, "logps/generated": -488.3350524902344, "logps/real": -305.74322509765625, "loss": 0.8736, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.6400399208068848, "rewards/margins": 18.86920738220215, "rewards/real": 21.509246826171875, "step": 2160 }, { "epoch": 0.2609427609427609, "grad_norm": 17.96023916811693, "learning_rate": 4.1060929983965793e-07, "logits/generated": -2.3181021213531494, "logits/real": -2.3859965801239014, "logps/generated": -427.2066345214844, "logps/real": -247.71176147460938, "loss": 0.3734, "rewards/accuracies": 1.0, "rewards/generated": -2.1131749153137207, "rewards/margins": 17.140527725219727, "rewards/real": 15.02735424041748, "step": 2170 }, { "epoch": 0.2621452621452621, "grad_norm": 21.958852114576167, "learning_rate": 4.0994120791020844e-07, "logits/generated": -2.329206943511963, "logits/real": -2.402454376220703, "logps/generated": -445.8583068847656, "logps/real": -263.9056701660156, "loss": 0.591, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.5780588388442993, "rewards/margins": 17.21750259399414, "rewards/real": 15.639444351196289, "step": 2180 }, { "epoch": 0.26334776334776333, "grad_norm": 67.24513864053671, "learning_rate": 4.0927311598075894e-07, "logits/generated": -2.4399991035461426, "logits/real": -2.4571475982666016, "logps/generated": -386.81072998046875, "logps/real": -234.23135375976562, "loss": 0.5481, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.7285892963409424, "rewards/margins": 12.364453315734863, "rewards/real": 13.093042373657227, "step": 2190 }, { "epoch": 0.26455026455026454, "grad_norm": 10.961551304341183, "learning_rate": 4.086050240513094e-07, "logits/generated": -2.319725513458252, "logits/real": -2.3283464908599854, "logps/generated": -494.9991149902344, "logps/real": -259.45501708984375, "loss": 0.7363, "rewards/accuracies": 0.824999988079071, "rewards/generated": 0.5694778561592102, "rewards/margins": 11.519054412841797, "rewards/real": 12.088532447814941, "step": 2200 }, { "epoch": 0.26575276575276574, "grad_norm": 493.7000639760628, "learning_rate": 4.079369321218599e-07, "logits/generated": -2.3618216514587402, "logits/real": -2.383780002593994, "logps/generated": -494.12664794921875, "logps/real": -261.7317810058594, "loss": 0.7859, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.7946144342422485, "rewards/margins": 16.871675491333008, "rewards/real": 15.077061653137207, "step": 2210 }, { "epoch": 0.26695526695526695, "grad_norm": 9.674714438495831, "learning_rate": 4.0726884019241046e-07, "logits/generated": -2.4149184226989746, "logits/real": -2.4204447269439697, "logps/generated": -478.0731506347656, "logps/real": -283.4941711425781, "loss": 0.61, "rewards/accuracies": 0.949999988079071, "rewards/generated": 3.0854523181915283, "rewards/margins": 15.088220596313477, "rewards/real": 18.173673629760742, "step": 2220 }, { "epoch": 0.26815776815776815, "grad_norm": 164.81520791608986, "learning_rate": 4.0660074826296097e-07, "logits/generated": -2.35487699508667, "logits/real": -2.3857338428497314, "logps/generated": -493.53387451171875, "logps/real": -264.8500671386719, "loss": 0.6175, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.235954999923706, "rewards/margins": 17.718080520629883, "rewards/real": 16.48212242126465, "step": 2230 }, { "epoch": 0.26936026936026936, "grad_norm": 29.11290451077836, "learning_rate": 4.059326563335115e-07, "logits/generated": -2.385585308074951, "logits/real": -2.386427879333496, "logps/generated": -455.605224609375, "logps/real": -305.58050537109375, "loss": 0.5898, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.353062391281128, "rewards/margins": 17.456607818603516, "rewards/real": 18.80967140197754, "step": 2240 }, { "epoch": 0.27056277056277056, "grad_norm": 219.00150654471975, "learning_rate": 4.05264564404062e-07, "logits/generated": -2.370087146759033, "logits/real": -2.320514440536499, "logps/generated": -458.58367919921875, "logps/real": -232.2949676513672, "loss": 0.7588, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.4099538326263428, "rewards/margins": 13.649721145629883, "rewards/real": 15.059674263000488, "step": 2250 }, { "epoch": 0.27176527176527177, "grad_norm": 10.529133930047273, "learning_rate": 4.045964724746125e-07, "logits/generated": -2.3615708351135254, "logits/real": -2.385770559310913, "logps/generated": -643.0159912109375, "logps/real": -347.21728515625, "loss": 0.5589, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.890958786010742, "rewards/margins": 20.825693130493164, "rewards/real": 23.716650009155273, "step": 2260 }, { "epoch": 0.27296777296777297, "grad_norm": 150.54104577841935, "learning_rate": 4.03928380545163e-07, "logits/generated": -2.192412853240967, "logits/real": -2.2333662509918213, "logps/generated": -463.63201904296875, "logps/real": -251.89990234375, "loss": 0.6744, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.015620231628418, "rewards/margins": 16.744342803955078, "rewards/real": 14.728724479675293, "step": 2270 }, { "epoch": 0.2741702741702742, "grad_norm": 566.8994714944832, "learning_rate": 4.032602886157135e-07, "logits/generated": -2.1026272773742676, "logits/real": -2.1895911693573, "logps/generated": -478.6298828125, "logps/real": -265.9862365722656, "loss": 0.5446, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.028159642592072487, "rewards/margins": 14.760887145996094, "rewards/real": 14.73272705078125, "step": 2280 }, { "epoch": 0.2753727753727754, "grad_norm": 10.64695627907612, "learning_rate": 4.02592196686264e-07, "logits/generated": -2.061657428741455, "logits/real": -2.1556432247161865, "logps/generated": -545.6156616210938, "logps/real": -286.1421813964844, "loss": 0.6071, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.486213684082031, "rewards/margins": 22.017555236816406, "rewards/real": 16.531341552734375, "step": 2290 }, { "epoch": 0.2765752765752766, "grad_norm": 612.2821400497011, "learning_rate": 4.0192410475681457e-07, "logits/generated": -2.1123647689819336, "logits/real": -2.19512677192688, "logps/generated": -528.1289672851562, "logps/real": -288.802001953125, "loss": 0.9276, "rewards/accuracies": 0.875, "rewards/generated": 1.6563570499420166, "rewards/margins": 15.107965469360352, "rewards/real": 16.76432228088379, "step": 2300 }, { "epoch": 0.2777777777777778, "grad_norm": 7.313056170151692, "learning_rate": 4.01256012827365e-07, "logits/generated": -2.117569923400879, "logits/real": -2.1547975540161133, "logps/generated": -495.6553649902344, "logps/real": -248.65133666992188, "loss": 0.3803, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.843641757965088, "rewards/margins": 19.134342193603516, "rewards/real": 15.290698051452637, "step": 2310 }, { "epoch": 0.278980278980279, "grad_norm": 7.7175479001943526, "learning_rate": 4.0058792089791553e-07, "logits/generated": -2.1462583541870117, "logits/real": -2.2426555156707764, "logps/generated": -585.8822631835938, "logps/real": -360.5639953613281, "loss": 0.7028, "rewards/accuracies": 0.925000011920929, "rewards/generated": 5.030735969543457, "rewards/margins": 17.73818588256836, "rewards/real": 22.768924713134766, "step": 2320 }, { "epoch": 0.2801827801827802, "grad_norm": 227.33141818933976, "learning_rate": 3.9991982896846603e-07, "logits/generated": -2.140292167663574, "logits/real": -2.1454365253448486, "logps/generated": -486.5482482910156, "logps/real": -244.6797332763672, "loss": 0.5421, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.1532176733016968, "rewards/margins": 17.947900772094727, "rewards/real": 16.794681549072266, "step": 2330 }, { "epoch": 0.2813852813852814, "grad_norm": 228.19243583306417, "learning_rate": 3.9925173703901654e-07, "logits/generated": -2.126603603363037, "logits/real": -2.230055332183838, "logps/generated": -532.1044921875, "logps/real": -248.2303009033203, "loss": 0.456, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.700369834899902, "rewards/margins": 19.924604415893555, "rewards/real": 13.224233627319336, "step": 2340 }, { "epoch": 0.2825877825877826, "grad_norm": 102.87004216018481, "learning_rate": 3.9858364510956705e-07, "logits/generated": -2.1835708618164062, "logits/real": -2.2770071029663086, "logps/generated": -556.3807373046875, "logps/real": -299.39337158203125, "loss": 0.5245, "rewards/accuracies": 0.875, "rewards/generated": 1.4072721004486084, "rewards/margins": 18.639461517333984, "rewards/real": 20.046733856201172, "step": 2350 }, { "epoch": 0.2837902837902838, "grad_norm": 52.076633582830944, "learning_rate": 3.9791555318011755e-07, "logits/generated": -2.0553860664367676, "logits/real": -2.3055174350738525, "logps/generated": -595.501220703125, "logps/real": -365.21728515625, "loss": 0.4418, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.978679895401001, "rewards/margins": 21.45098876953125, "rewards/real": 19.472309112548828, "step": 2360 }, { "epoch": 0.284992784992785, "grad_norm": 43.265739821635215, "learning_rate": 3.9724746125066806e-07, "logits/generated": -2.211268901824951, "logits/real": -2.2506141662597656, "logps/generated": -569.6729736328125, "logps/real": -267.82177734375, "loss": 0.6833, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -3.011526584625244, "rewards/margins": 20.3045711517334, "rewards/real": 17.293045043945312, "step": 2370 }, { "epoch": 0.28619528619528617, "grad_norm": 5.432566620710611, "learning_rate": 3.965793693212186e-07, "logits/generated": -2.2058141231536865, "logits/real": -2.2846221923828125, "logps/generated": -429.84002685546875, "logps/real": -262.2298278808594, "loss": 0.5481, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.326233386993408, "rewards/margins": 13.523542404174805, "rewards/real": 15.849777221679688, "step": 2380 }, { "epoch": 0.2873977873977874, "grad_norm": 4.096190479648606, "learning_rate": 3.959112773917691e-07, "logits/generated": -2.231595039367676, "logits/real": -2.326998233795166, "logps/generated": -464.07147216796875, "logps/real": -293.7169189453125, "loss": 0.5556, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.1314868927001953, "rewards/margins": 20.272600173950195, "rewards/real": 20.141111373901367, "step": 2390 }, { "epoch": 0.2886002886002886, "grad_norm": 671.8046709052743, "learning_rate": 3.9524318546231963e-07, "logits/generated": -2.17610764503479, "logits/real": -2.2699453830718994, "logps/generated": -587.4531860351562, "logps/real": -281.1423645019531, "loss": 0.8125, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.4140739440917969, "rewards/margins": 19.855026245117188, "rewards/real": 21.269100189208984, "step": 2400 }, { "epoch": 0.2898027898027898, "grad_norm": 45.617477855374005, "learning_rate": 3.945750935328701e-07, "logits/generated": -2.201483726501465, "logits/real": -2.280503988265991, "logps/generated": -544.9918823242188, "logps/real": -300.0970764160156, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/generated": 2.2972793579101562, "rewards/margins": 18.853200912475586, "rewards/real": 21.150482177734375, "step": 2410 }, { "epoch": 0.291005291005291, "grad_norm": 201.9185668294254, "learning_rate": 3.939070016034206e-07, "logits/generated": -2.1289939880371094, "logits/real": -2.170576572418213, "logps/generated": -533.7762451171875, "logps/real": -260.8277587890625, "loss": 0.5091, "rewards/accuracies": 0.875, "rewards/generated": -0.015506362542510033, "rewards/margins": 14.969365119934082, "rewards/real": 14.953857421875, "step": 2420 }, { "epoch": 0.2922077922077922, "grad_norm": 12.401245180803466, "learning_rate": 3.932389096739711e-07, "logits/generated": -2.1283884048461914, "logits/real": -2.213829755783081, "logps/generated": -542.0123291015625, "logps/real": -326.3026428222656, "loss": 0.3686, "rewards/accuracies": 1.0, "rewards/generated": -1.4629404544830322, "rewards/margins": 21.855056762695312, "rewards/real": 20.39211654663086, "step": 2430 }, { "epoch": 0.2934102934102934, "grad_norm": 469.35001799084023, "learning_rate": 3.925708177445216e-07, "logits/generated": -2.1530420780181885, "logits/real": -2.2408835887908936, "logps/generated": -553.5878295898438, "logps/real": -266.2428283691406, "loss": 0.6252, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.5921292304992676, "rewards/margins": 17.0020751953125, "rewards/real": 18.594202041625977, "step": 2440 }, { "epoch": 0.2946127946127946, "grad_norm": 377.2950116445638, "learning_rate": 3.919027258150721e-07, "logits/generated": -2.202223300933838, "logits/real": -2.2388205528259277, "logps/generated": -468.335693359375, "logps/real": -286.7129211425781, "loss": 1.0598, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 6.400108337402344, "rewards/margins": 11.34289836883545, "rewards/real": 17.74300765991211, "step": 2450 }, { "epoch": 0.2958152958152958, "grad_norm": 7.359700932395771, "learning_rate": 3.9123463388562267e-07, "logits/generated": -2.1311264038085938, "logits/real": -2.2204508781433105, "logps/generated": -551.81640625, "logps/real": -360.0440368652344, "loss": 0.7343, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.7086261510849, "rewards/margins": 20.443758010864258, "rewards/real": 22.15238380432129, "step": 2460 }, { "epoch": 0.297017797017797, "grad_norm": 10.964991581489983, "learning_rate": 3.905665419561732e-07, "logits/generated": -1.9595201015472412, "logits/real": -2.1721019744873047, "logps/generated": -380.84161376953125, "logps/real": -230.2252960205078, "loss": 0.3808, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.458247184753418, "rewards/margins": 17.87284278869629, "rewards/real": 13.414594650268555, "step": 2470 }, { "epoch": 0.2982202982202982, "grad_norm": 57.25792791041731, "learning_rate": 3.898984500267237e-07, "logits/generated": -2.2138562202453613, "logits/real": -2.1700243949890137, "logps/generated": -398.2755126953125, "logps/real": -151.51303100585938, "loss": 0.5028, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.947381496429443, "rewards/margins": 14.118861198425293, "rewards/real": 9.171480178833008, "step": 2480 }, { "epoch": 0.2994227994227994, "grad_norm": 5.578683663434324, "learning_rate": 3.892303580972742e-07, "logits/generated": -2.1187691688537598, "logits/real": -2.168670654296875, "logps/generated": -448.775146484375, "logps/real": -240.85302734375, "loss": 0.4702, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.1800984889268875, "rewards/margins": 16.530004501342773, "rewards/real": 16.34990882873535, "step": 2490 }, { "epoch": 0.30062530062530063, "grad_norm": 184.652793423236, "learning_rate": 3.885622661678247e-07, "logits/generated": -2.1732075214385986, "logits/real": -2.1834158897399902, "logps/generated": -470.95208740234375, "logps/real": -230.28964233398438, "loss": 0.5405, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.23640525341033936, "rewards/margins": 14.332746505737305, "rewards/real": 14.096341133117676, "step": 2500 }, { "epoch": 0.30182780182780183, "grad_norm": 271.53536021611694, "learning_rate": 3.8789417423837515e-07, "logits/generated": -2.0926122665405273, "logits/real": -2.2409567832946777, "logps/generated": -471.1053161621094, "logps/real": -273.2291564941406, "loss": 0.4105, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.34327155351638794, "rewards/margins": 17.26302719116211, "rewards/real": 17.606298446655273, "step": 2510 }, { "epoch": 0.30303030303030304, "grad_norm": 20.766863064045708, "learning_rate": 3.8722608230892566e-07, "logits/generated": -2.234311580657959, "logits/real": -2.247994899749756, "logps/generated": -408.84271240234375, "logps/real": -258.6989440917969, "loss": 0.7622, "rewards/accuracies": 0.875, "rewards/generated": 4.932229518890381, "rewards/margins": 13.865893363952637, "rewards/real": 18.79812240600586, "step": 2520 }, { "epoch": 0.30423280423280424, "grad_norm": 99.42766781589214, "learning_rate": 3.8655799037947616e-07, "logits/generated": -2.18027925491333, "logits/real": -2.226388931274414, "logps/generated": -498.34967041015625, "logps/real": -246.7676544189453, "loss": 0.5463, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.9954167604446411, "rewards/margins": 16.997676849365234, "rewards/real": 16.00226402282715, "step": 2530 }, { "epoch": 0.30543530543530545, "grad_norm": 350.754875419416, "learning_rate": 3.858898984500267e-07, "logits/generated": -2.1765637397766113, "logits/real": -2.2366700172424316, "logps/generated": -606.4658813476562, "logps/real": -316.66510009765625, "loss": 0.5861, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.058906078338623, "rewards/margins": 23.236309051513672, "rewards/real": 21.17740249633789, "step": 2540 }, { "epoch": 0.30663780663780665, "grad_norm": 1191.7895875915897, "learning_rate": 3.8522180652057723e-07, "logits/generated": -2.1629891395568848, "logits/real": -2.2447993755340576, "logps/generated": -498.3642578125, "logps/real": -253.2069091796875, "loss": 0.6364, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.32391619682312, "rewards/margins": 17.841588973999023, "rewards/real": 15.517674446105957, "step": 2550 }, { "epoch": 0.30784030784030786, "grad_norm": 490.4584889577419, "learning_rate": 3.8455371459112774e-07, "logits/generated": -2.1314988136291504, "logits/real": -2.1876564025878906, "logps/generated": -520.8911743164062, "logps/real": -293.00384521484375, "loss": 0.6943, "rewards/accuracies": 0.875, "rewards/generated": -1.5374441146850586, "rewards/margins": 14.27092456817627, "rewards/real": 12.733478546142578, "step": 2560 }, { "epoch": 0.30904280904280906, "grad_norm": 403.6397986457478, "learning_rate": 3.8388562266167824e-07, "logits/generated": -2.19177508354187, "logits/real": -2.3153634071350098, "logps/generated": -577.9305419921875, "logps/real": -342.81182861328125, "loss": 0.6109, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.02606072463095188, "rewards/margins": 22.08085060119629, "rewards/real": 22.054790496826172, "step": 2570 }, { "epoch": 0.31024531024531027, "grad_norm": 14.546934589605003, "learning_rate": 3.8321753073222875e-07, "logits/generated": -2.313136577606201, "logits/real": -2.2953858375549316, "logps/generated": -461.09619140625, "logps/real": -271.5863342285156, "loss": 0.4425, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.7134145498275757, "rewards/margins": 15.884051322937012, "rewards/real": 16.597463607788086, "step": 2580 }, { "epoch": 0.3114478114478115, "grad_norm": 195.04118296937162, "learning_rate": 3.8254943880277925e-07, "logits/generated": -2.257913827896118, "logits/real": -2.2643375396728516, "logps/generated": -447.4485778808594, "logps/real": -231.215576171875, "loss": 0.5602, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.1901184618473053, "rewards/margins": 15.335597038269043, "rewards/real": 15.145477294921875, "step": 2590 }, { "epoch": 0.3126503126503126, "grad_norm": 122.41843973159715, "learning_rate": 3.8188134687332976e-07, "logits/generated": -2.2918972969055176, "logits/real": -2.2709531784057617, "logps/generated": -514.5028076171875, "logps/real": -259.34429931640625, "loss": 0.7289, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.576934337615967, "rewards/margins": 13.527132987976074, "rewards/real": 16.10406494140625, "step": 2600 }, { "epoch": 0.31385281385281383, "grad_norm": 42.42941538698551, "learning_rate": 3.812132549438802e-07, "logits/generated": -2.2534289360046387, "logits/real": -2.339850902557373, "logps/generated": -518.5089721679688, "logps/real": -317.2063903808594, "loss": 0.7398, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 5.504884243011475, "rewards/margins": 13.643224716186523, "rewards/real": 19.148107528686523, "step": 2610 }, { "epoch": 0.31505531505531503, "grad_norm": 66.50875144317763, "learning_rate": 3.805451630144308e-07, "logits/generated": -2.2641537189483643, "logits/real": -2.3424103260040283, "logps/generated": -462.82977294921875, "logps/real": -228.4934539794922, "loss": 0.5007, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.8506364822387695, "rewards/margins": 18.461023330688477, "rewards/real": 15.610386848449707, "step": 2620 }, { "epoch": 0.31625781625781624, "grad_norm": 121.3224524860916, "learning_rate": 3.798770710849813e-07, "logits/generated": -2.261126756668091, "logits/real": -2.3151824474334717, "logps/generated": -564.8231811523438, "logps/real": -308.75213623046875, "loss": 0.5121, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.0477707386016846, "rewards/margins": 19.658065795898438, "rewards/real": 17.610294342041016, "step": 2630 }, { "epoch": 0.31746031746031744, "grad_norm": 286.4977922209483, "learning_rate": 3.792089791555318e-07, "logits/generated": -2.219849109649658, "logits/real": -2.258054494857788, "logps/generated": -412.97625732421875, "logps/real": -201.11192321777344, "loss": 0.5705, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -3.6303048133850098, "rewards/margins": 14.927179336547852, "rewards/real": 11.296873092651367, "step": 2640 }, { "epoch": 0.31866281866281865, "grad_norm": 27.11392858172599, "learning_rate": 3.785408872260823e-07, "logits/generated": -2.2176125049591064, "logits/real": -2.3149025440216064, "logps/generated": -447.37939453125, "logps/real": -258.38006591796875, "loss": 0.4621, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.2688522338867188, "rewards/margins": 17.308168411254883, "rewards/real": 15.039314270019531, "step": 2650 }, { "epoch": 0.31986531986531985, "grad_norm": 28.25488178532129, "learning_rate": 3.778727952966328e-07, "logits/generated": -2.250487804412842, "logits/real": -2.281437397003174, "logps/generated": -509.69677734375, "logps/real": -250.135009765625, "loss": 0.544, "rewards/accuracies": 1.0, "rewards/generated": -1.2124731540679932, "rewards/margins": 18.824953079223633, "rewards/real": 17.61248016357422, "step": 2660 }, { "epoch": 0.32106782106782106, "grad_norm": 8.3833092801713, "learning_rate": 3.772047033671833e-07, "logits/generated": -2.2615928649902344, "logits/real": -2.317018508911133, "logps/generated": -514.9195556640625, "logps/real": -277.6170959472656, "loss": 0.6451, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.9076513051986694, "rewards/margins": 19.119131088256836, "rewards/real": 18.211477279663086, "step": 2670 }, { "epoch": 0.32227032227032226, "grad_norm": 399.2622493885436, "learning_rate": 3.765366114377338e-07, "logits/generated": -2.310234785079956, "logits/real": -2.391126871109009, "logps/generated": -455.75616455078125, "logps/real": -275.1095275878906, "loss": 0.6719, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.8847066164016724, "rewards/margins": 18.197498321533203, "rewards/real": 17.312789916992188, "step": 2680 }, { "epoch": 0.32347282347282347, "grad_norm": 134.0020971242362, "learning_rate": 3.758685195082843e-07, "logits/generated": -2.3624823093414307, "logits/real": -2.3429665565490723, "logps/generated": -418.9403381347656, "logps/real": -284.3204650878906, "loss": 0.8075, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.12245519459247589, "rewards/margins": 15.674652099609375, "rewards/real": 15.552197456359863, "step": 2690 }, { "epoch": 0.3246753246753247, "grad_norm": 213.3160271893409, "learning_rate": 3.752004275788349e-07, "logits/generated": -2.3580338954925537, "logits/real": -2.403287172317505, "logps/generated": -536.4251098632812, "logps/real": -341.7716064453125, "loss": 0.4892, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.7240122556686401, "rewards/margins": 21.984424591064453, "rewards/real": 22.708433151245117, "step": 2700 }, { "epoch": 0.3258778258778259, "grad_norm": 21.235970633452414, "learning_rate": 3.7453233564938533e-07, "logits/generated": -2.3828284740448, "logits/real": -2.3824453353881836, "logps/generated": -540.6604614257812, "logps/real": -266.5406188964844, "loss": 0.4202, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.8464826345443726, "rewards/margins": 16.177722930908203, "rewards/real": 18.024206161499023, "step": 2710 }, { "epoch": 0.3270803270803271, "grad_norm": 5.13194445661882, "learning_rate": 3.7386424371993584e-07, "logits/generated": -2.217475414276123, "logits/real": -2.157270908355713, "logps/generated": -496.35400390625, "logps/real": -193.39320373535156, "loss": 0.4943, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.373010635375977, "rewards/margins": 19.458637237548828, "rewards/real": 11.085625648498535, "step": 2720 }, { "epoch": 0.3282828282828283, "grad_norm": 190.2068058722336, "learning_rate": 3.7319615179048635e-07, "logits/generated": -2.3180150985717773, "logits/real": -2.3551723957061768, "logps/generated": -534.9052124023438, "logps/real": -277.12054443359375, "loss": 0.5609, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.518371343612671, "rewards/margins": 19.505908966064453, "rewards/real": 21.024280548095703, "step": 2730 }, { "epoch": 0.3294853294853295, "grad_norm": 74.78807710626396, "learning_rate": 3.7252805986103685e-07, "logits/generated": -2.2545883655548096, "logits/real": -2.2242777347564697, "logps/generated": -422.09429931640625, "logps/real": -195.44094848632812, "loss": 0.3884, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.2318847179412842, "rewards/margins": 16.44162940979004, "rewards/real": 15.209744453430176, "step": 2740 }, { "epoch": 0.3306878306878307, "grad_norm": 8.74676748820064, "learning_rate": 3.7185996793158736e-07, "logits/generated": -2.246854066848755, "logits/real": -2.326922655105591, "logps/generated": -534.0845947265625, "logps/real": -248.7612762451172, "loss": 0.3652, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.486065626144409, "rewards/margins": 18.387165069580078, "rewards/real": 14.901100158691406, "step": 2750 }, { "epoch": 0.3318903318903319, "grad_norm": 138.41521218871162, "learning_rate": 3.7119187600213786e-07, "logits/generated": -2.213275671005249, "logits/real": -2.2868189811706543, "logps/generated": -475.4173278808594, "logps/real": -214.8863067626953, "loss": 0.4147, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.996474504470825, "rewards/margins": 18.990652084350586, "rewards/real": 15.994176864624023, "step": 2760 }, { "epoch": 0.3330928330928331, "grad_norm": 112.81032486998156, "learning_rate": 3.7052378407268837e-07, "logits/generated": -2.3418517112731934, "logits/real": -2.3174288272857666, "logps/generated": -459.78277587890625, "logps/real": -346.34503173828125, "loss": 0.5185, "rewards/accuracies": 0.925000011920929, "rewards/generated": 3.563931703567505, "rewards/margins": 16.707632064819336, "rewards/real": 20.271564483642578, "step": 2770 }, { "epoch": 0.3342953342953343, "grad_norm": 6.766426332696746, "learning_rate": 3.6985569214323893e-07, "logits/generated": -2.3402276039123535, "logits/real": -2.3462252616882324, "logps/generated": -412.19647216796875, "logps/real": -202.32833862304688, "loss": 0.401, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.25340956449508667, "rewards/margins": 14.352015495300293, "rewards/real": 14.098605155944824, "step": 2780 }, { "epoch": 0.3354978354978355, "grad_norm": 511.5833964031441, "learning_rate": 3.6918760021378944e-07, "logits/generated": -2.279498815536499, "logits/real": -2.3270673751831055, "logps/generated": -442.42388916015625, "logps/real": -239.3588409423828, "loss": 0.4917, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 3.2044434547424316, "rewards/margins": 14.670681953430176, "rewards/real": 17.875125885009766, "step": 2790 }, { "epoch": 0.3367003367003367, "grad_norm": 103.30030241970397, "learning_rate": 3.6851950828433994e-07, "logits/generated": -2.2594730854034424, "logits/real": -2.360800266265869, "logps/generated": -434.021728515625, "logps/real": -238.7564239501953, "loss": 0.9014, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 0.7155309915542603, "rewards/margins": 13.3612642288208, "rewards/real": 14.07679557800293, "step": 2800 }, { "epoch": 0.3379028379028379, "grad_norm": 426.4962354296236, "learning_rate": 3.678514163548904e-07, "logits/generated": -2.308851718902588, "logits/real": -2.311568260192871, "logps/generated": -421.3448181152344, "logps/real": -218.58200073242188, "loss": 0.7486, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.2538965344429016, "rewards/margins": 15.328129768371582, "rewards/real": 15.074234008789062, "step": 2810 }, { "epoch": 0.33910533910533913, "grad_norm": 5.335246883158649, "learning_rate": 3.671833244254409e-07, "logits/generated": -2.271623134613037, "logits/real": -2.2808804512023926, "logps/generated": -475.78912353515625, "logps/real": -257.80328369140625, "loss": 0.4559, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.2881863117218018, "rewards/margins": 17.663087844848633, "rewards/real": 15.374898910522461, "step": 2820 }, { "epoch": 0.3403078403078403, "grad_norm": 373.64227720589076, "learning_rate": 3.665152324959914e-07, "logits/generated": -2.30668044090271, "logits/real": -2.340928316116333, "logps/generated": -518.8795166015625, "logps/real": -283.543701171875, "loss": 0.6693, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.5370599627494812, "rewards/margins": 19.455291748046875, "rewards/real": 18.918231964111328, "step": 2830 }, { "epoch": 0.3415103415103415, "grad_norm": 521.2894272426375, "learning_rate": 3.658471405665419e-07, "logits/generated": -2.273470401763916, "logits/real": -2.3501219749450684, "logps/generated": -533.2444458007812, "logps/real": -338.16253662109375, "loss": 0.6353, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.6802558898925781, "rewards/margins": 21.118061065673828, "rewards/real": 20.43780517578125, "step": 2840 }, { "epoch": 0.3427128427128427, "grad_norm": 90.07582210277268, "learning_rate": 3.651790486370924e-07, "logits/generated": -2.1684927940368652, "logits/real": -2.295745372772217, "logps/generated": -419.84173583984375, "logps/real": -246.21926879882812, "loss": 0.4569, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.7380359172821045, "rewards/margins": 16.973033905029297, "rewards/real": 14.235000610351562, "step": 2850 }, { "epoch": 0.3439153439153439, "grad_norm": 302.42665547984643, "learning_rate": 3.64510956707643e-07, "logits/generated": -2.2570419311523438, "logits/real": -2.329324245452881, "logps/generated": -430.212890625, "logps/real": -216.98428344726562, "loss": 0.6666, "rewards/accuracies": 1.0, "rewards/generated": -1.2797664403915405, "rewards/margins": 16.35479736328125, "rewards/real": 15.075032234191895, "step": 2860 }, { "epoch": 0.3451178451178451, "grad_norm": 16.668042765363705, "learning_rate": 3.638428647781935e-07, "logits/generated": -2.326631784439087, "logits/real": -2.373183012008667, "logps/generated": -486.47064208984375, "logps/real": -265.8896484375, "loss": 0.4662, "rewards/accuracies": 0.949999988079071, "rewards/generated": 1.3753122091293335, "rewards/margins": 18.226367950439453, "rewards/real": 19.601680755615234, "step": 2870 }, { "epoch": 0.3463203463203463, "grad_norm": 7.364150510742759, "learning_rate": 3.63174772848744e-07, "logits/generated": -2.1694772243499756, "logits/real": -2.2114944458007812, "logps/generated": -531.44677734375, "logps/real": -232.7603759765625, "loss": 0.4505, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.588564395904541, "rewards/margins": 22.947429656982422, "rewards/real": 15.358866691589355, "step": 2880 }, { "epoch": 0.3475228475228475, "grad_norm": 11.610271194094071, "learning_rate": 3.625066809192945e-07, "logits/generated": -2.179028034210205, "logits/real": -2.2378478050231934, "logps/generated": -528.3477172851562, "logps/real": -231.15243530273438, "loss": 0.3434, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.327719688415527, "rewards/margins": 22.12078857421875, "rewards/real": 15.793066024780273, "step": 2890 }, { "epoch": 0.3487253487253487, "grad_norm": 37.74433267090721, "learning_rate": 3.61838588989845e-07, "logits/generated": -2.1644222736358643, "logits/real": -2.2317419052124023, "logps/generated": -469.208251953125, "logps/real": -276.16094970703125, "loss": 0.8129, "rewards/accuracies": 1.0, "rewards/generated": -3.7847771644592285, "rewards/margins": 21.81374740600586, "rewards/real": 18.028968811035156, "step": 2900 }, { "epoch": 0.3499278499278499, "grad_norm": 19.454792292634934, "learning_rate": 3.6117049706039546e-07, "logits/generated": -2.2520663738250732, "logits/real": -2.3081467151641846, "logps/generated": -530.2596435546875, "logps/real": -294.5622863769531, "loss": 0.4308, "rewards/accuracies": 1.0, "rewards/generated": -0.6475614309310913, "rewards/margins": 22.570125579833984, "rewards/real": 21.922565460205078, "step": 2910 }, { "epoch": 0.3511303511303511, "grad_norm": 20.555559351118088, "learning_rate": 3.6050240513094597e-07, "logits/generated": -2.2065482139587402, "logits/real": -2.285953998565674, "logps/generated": -590.97265625, "logps/real": -328.4667053222656, "loss": 0.56, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.960893154144287, "rewards/margins": 20.918859481811523, "rewards/real": 23.8797550201416, "step": 2920 }, { "epoch": 0.35233285233285233, "grad_norm": 407.4218369458025, "learning_rate": 3.598343132014965e-07, "logits/generated": -2.1608686447143555, "logits/real": -2.2716238498687744, "logps/generated": -632.4462890625, "logps/real": -391.6826477050781, "loss": 1.1456, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.0389041900634766, "rewards/margins": 20.525333404541016, "rewards/real": 18.486431121826172, "step": 2930 }, { "epoch": 0.35353535353535354, "grad_norm": 16.446471253966664, "learning_rate": 3.5916622127204703e-07, "logits/generated": -2.1633334159851074, "logits/real": -2.236440420150757, "logps/generated": -626.8197631835938, "logps/real": -301.2580871582031, "loss": 0.3489, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.856711387634277, "rewards/margins": 25.98929214477539, "rewards/real": 21.132579803466797, "step": 2940 }, { "epoch": 0.35473785473785474, "grad_norm": 21.781590947073084, "learning_rate": 3.5849812934259754e-07, "logits/generated": -2.2256908416748047, "logits/real": -2.3398449420928955, "logps/generated": -580.8104858398438, "logps/real": -310.58636474609375, "loss": 0.4497, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.1089565753936768, "rewards/margins": 21.500940322875977, "rewards/real": 19.391984939575195, "step": 2950 }, { "epoch": 0.35594035594035595, "grad_norm": 163.62698125544455, "learning_rate": 3.5783003741314805e-07, "logits/generated": -2.313955545425415, "logits/real": -2.364743947982788, "logps/generated": -484.18524169921875, "logps/real": -272.0729675292969, "loss": 0.3679, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.4989825189113617, "rewards/margins": 18.99362564086914, "rewards/real": 18.494644165039062, "step": 2960 }, { "epoch": 0.35714285714285715, "grad_norm": 499.66094153881966, "learning_rate": 3.5716194548369855e-07, "logits/generated": -2.255885601043701, "logits/real": -2.2675743103027344, "logps/generated": -501.8041076660156, "logps/real": -234.0949249267578, "loss": 0.4604, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.058472633361816, "rewards/margins": 20.395231246948242, "rewards/real": 15.336758613586426, "step": 2970 }, { "epoch": 0.35834535834535836, "grad_norm": 13.313462414204784, "learning_rate": 3.5649385355424906e-07, "logits/generated": -2.2967865467071533, "logits/real": -2.273264169692993, "logps/generated": -485.79437255859375, "logps/real": -256.3107604980469, "loss": 0.5895, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.5996122360229492, "rewards/margins": 16.557743072509766, "rewards/real": 14.9581298828125, "step": 2980 }, { "epoch": 0.35954785954785956, "grad_norm": 377.56450844050454, "learning_rate": 3.5582576162479957e-07, "logits/generated": -2.265778064727783, "logits/real": -2.3358829021453857, "logps/generated": -606.2882080078125, "logps/real": -299.10772705078125, "loss": 0.6658, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.9930524826049805, "rewards/margins": 22.922954559326172, "rewards/real": 19.929899215698242, "step": 2990 }, { "epoch": 0.36075036075036077, "grad_norm": 13.260550088610207, "learning_rate": 3.5515766969535007e-07, "logits/generated": -2.252068281173706, "logits/real": -2.2762959003448486, "logps/generated": -563.1546630859375, "logps/real": -273.4827880859375, "loss": 0.6942, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.128468036651611, "rewards/margins": 21.493574142456055, "rewards/real": 15.365107536315918, "step": 3000 }, { "epoch": 0.36075036075036077, "eval_logits/generated": -2.2503185272216797, "eval_logits/real": -2.3049116134643555, "eval_logps/generated": -529.7844848632812, "eval_logps/real": -290.2620544433594, "eval_loss": 0.40316224098205566, "eval_rewards/accuracies": 0.9732142686843872, "eval_rewards/generated": -4.000301837921143, "eval_rewards/margins": 22.101200103759766, "eval_rewards/real": 18.10089683532715, "eval_runtime": 158.3944, "eval_samples_per_second": 6.313, "eval_steps_per_second": 0.53, "step": 3000 }, { "epoch": 0.36195286195286197, "grad_norm": 122.75740828308409, "learning_rate": 3.544895777659005e-07, "logits/generated": -2.2664377689361572, "logits/real": -2.3279268741607666, "logps/generated": -516.6077880859375, "logps/real": -307.4842224121094, "loss": 0.625, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.284638404846191, "rewards/margins": 25.697412490844727, "rewards/real": 21.412769317626953, "step": 3010 }, { "epoch": 0.3631553631553632, "grad_norm": 5.912793075206404, "learning_rate": 3.538214858364511e-07, "logits/generated": -2.2338626384735107, "logits/real": -2.282067060470581, "logps/generated": -522.6149291992188, "logps/real": -242.1405792236328, "loss": 0.5807, "rewards/accuracies": 1.0, "rewards/generated": -5.8531646728515625, "rewards/margins": 20.41141128540039, "rewards/real": 14.558245658874512, "step": 3020 }, { "epoch": 0.3643578643578644, "grad_norm": 5.770911163059995, "learning_rate": 3.531533939070016e-07, "logits/generated": -2.222487688064575, "logits/real": -2.2802271842956543, "logps/generated": -552.416015625, "logps/real": -246.77639770507812, "loss": 0.3684, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.396537780761719, "rewards/margins": 21.888214111328125, "rewards/real": 15.491678237915039, "step": 3030 }, { "epoch": 0.3655603655603656, "grad_norm": 14.534338940850242, "learning_rate": 3.524853019775521e-07, "logits/generated": -2.1972241401672363, "logits/real": -2.318814754486084, "logps/generated": -456.2215270996094, "logps/real": -282.9758605957031, "loss": 0.3602, "rewards/accuracies": 1.0, "rewards/generated": -2.8818116188049316, "rewards/margins": 21.660917282104492, "rewards/real": 18.77910614013672, "step": 3040 }, { "epoch": 0.3667628667628668, "grad_norm": 141.71431274136296, "learning_rate": 3.518172100481026e-07, "logits/generated": -2.285266876220703, "logits/real": -2.3525002002716064, "logps/generated": -436.36767578125, "logps/real": -301.61981201171875, "loss": 0.7586, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.640789270401001, "rewards/margins": 19.90084457397461, "rewards/real": 19.260055541992188, "step": 3050 }, { "epoch": 0.36796536796536794, "grad_norm": 285.17578435724823, "learning_rate": 3.511491181186531e-07, "logits/generated": -2.2331862449645996, "logits/real": -2.3540260791778564, "logps/generated": -692.9502563476562, "logps/real": -376.41363525390625, "loss": 0.4003, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.579779624938965, "rewards/margins": 33.797027587890625, "rewards/real": 29.21724510192871, "step": 3060 }, { "epoch": 0.36916786916786914, "grad_norm": 262.9206989868409, "learning_rate": 3.504810261892036e-07, "logits/generated": -2.192718267440796, "logits/real": -2.2729651927948, "logps/generated": -509.4129943847656, "logps/real": -258.4654846191406, "loss": 0.4083, "rewards/accuracies": 1.0, "rewards/generated": -4.632937431335449, "rewards/margins": 20.950496673583984, "rewards/real": 16.317562103271484, "step": 3070 }, { "epoch": 0.37037037037037035, "grad_norm": 131.60497647826585, "learning_rate": 3.498129342597541e-07, "logits/generated": -2.1637587547302246, "logits/real": -2.2644407749176025, "logps/generated": -459.63836669921875, "logps/real": -248.57723999023438, "loss": 0.5513, "rewards/accuracies": 1.0, "rewards/generated": -3.667370319366455, "rewards/margins": 19.44435691833496, "rewards/real": 15.776989936828613, "step": 3080 }, { "epoch": 0.37157287157287155, "grad_norm": 10.670283284540384, "learning_rate": 3.4914484233030463e-07, "logits/generated": -2.1456921100616455, "logits/real": -2.2160375118255615, "logps/generated": -447.6417541503906, "logps/real": -242.09207153320312, "loss": 0.5443, "rewards/accuracies": 1.0, "rewards/generated": -3.274768829345703, "rewards/margins": 21.649639129638672, "rewards/real": 18.374868392944336, "step": 3090 }, { "epoch": 0.37277537277537276, "grad_norm": 284.86059767065285, "learning_rate": 3.484767504008552e-07, "logits/generated": -2.2293851375579834, "logits/real": -2.2408015727996826, "logps/generated": -485.41876220703125, "logps/real": -248.82473754882812, "loss": 0.4494, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.447257041931152, "rewards/margins": 22.267175674438477, "rewards/real": 16.81991958618164, "step": 3100 }, { "epoch": 0.37397787397787396, "grad_norm": 673.3401086177072, "learning_rate": 3.4780865847140564e-07, "logits/generated": -2.3030428886413574, "logits/real": -2.338392734527588, "logps/generated": -577.5540771484375, "logps/real": -294.89654541015625, "loss": 0.5323, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.794123888015747, "rewards/margins": 20.030696868896484, "rewards/real": 22.824819564819336, "step": 3110 }, { "epoch": 0.37518037518037517, "grad_norm": 270.9194607638238, "learning_rate": 3.4714056654195615e-07, "logits/generated": -2.2071359157562256, "logits/real": -2.2805514335632324, "logps/generated": -456.8641052246094, "logps/real": -221.67684936523438, "loss": 0.6461, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.32338285446167, "rewards/margins": 16.610441207885742, "rewards/real": 14.28705883026123, "step": 3120 }, { "epoch": 0.3763828763828764, "grad_norm": 22.331175258402148, "learning_rate": 3.4647247461250666e-07, "logits/generated": -2.2179336547851562, "logits/real": -2.3464598655700684, "logps/generated": -469.17724609375, "logps/real": -246.83615112304688, "loss": 0.672, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.541749477386475, "rewards/margins": 21.040027618408203, "rewards/real": 16.498275756835938, "step": 3130 }, { "epoch": 0.3775853775853776, "grad_norm": 12.04281250897961, "learning_rate": 3.4580438268305716e-07, "logits/generated": -2.1902594566345215, "logits/real": -2.2416536808013916, "logps/generated": -529.442138671875, "logps/real": -232.1566619873047, "loss": 0.4831, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.105222225189209, "rewards/margins": 22.607030868530273, "rewards/real": 16.50181007385254, "step": 3140 }, { "epoch": 0.3787878787878788, "grad_norm": 486.14449671043985, "learning_rate": 3.4513629075360767e-07, "logits/generated": -2.187072515487671, "logits/real": -2.2189419269561768, "logps/generated": -664.7039184570312, "logps/real": -279.95855712890625, "loss": 0.8295, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -8.598974227905273, "rewards/margins": 24.43923568725586, "rewards/real": 15.840261459350586, "step": 3150 }, { "epoch": 0.37999037999038, "grad_norm": 429.9586352979677, "learning_rate": 3.444681988241582e-07, "logits/generated": -2.2088828086853027, "logits/real": -2.2571425437927246, "logps/generated": -555.0758666992188, "logps/real": -294.0585021972656, "loss": 0.399, "rewards/accuracies": 1.0, "rewards/generated": -5.015622138977051, "rewards/margins": 24.913909912109375, "rewards/real": 19.89828872680664, "step": 3160 }, { "epoch": 0.3811928811928812, "grad_norm": 14.595761956469604, "learning_rate": 3.438001068947087e-07, "logits/generated": -2.2068917751312256, "logits/real": -2.214226007461548, "logps/generated": -457.39678955078125, "logps/real": -240.8272247314453, "loss": 0.3143, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.3468408584594727, "rewards/margins": 18.06717300415039, "rewards/real": 16.720333099365234, "step": 3170 }, { "epoch": 0.3823953823953824, "grad_norm": 286.3107478758328, "learning_rate": 3.4313201496525924e-07, "logits/generated": -2.2519948482513428, "logits/real": -2.3029470443725586, "logps/generated": -553.2166748046875, "logps/real": -263.83782958984375, "loss": 0.5949, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.3181514739990234, "rewards/margins": 23.930526733398438, "rewards/real": 20.612375259399414, "step": 3180 }, { "epoch": 0.3835978835978836, "grad_norm": 491.5969995393527, "learning_rate": 3.4246392303580975e-07, "logits/generated": -2.2349514961242676, "logits/real": -2.319042682647705, "logps/generated": -727.5499877929688, "logps/real": -342.4107360839844, "loss": 0.5115, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.6486027240753174, "rewards/margins": 24.541975021362305, "rewards/real": 22.893375396728516, "step": 3190 }, { "epoch": 0.3848003848003848, "grad_norm": 6.759721578774117, "learning_rate": 3.4179583110636025e-07, "logits/generated": -2.2635293006896973, "logits/real": -2.302671194076538, "logps/generated": -543.232421875, "logps/real": -226.0625457763672, "loss": 0.3877, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.465426445007324, "rewards/margins": 21.946393966674805, "rewards/real": 16.480966567993164, "step": 3200 }, { "epoch": 0.386002886002886, "grad_norm": 36.139008726199734, "learning_rate": 3.411277391769107e-07, "logits/generated": -2.2435240745544434, "logits/real": -2.2899813652038574, "logps/generated": -417.7156677246094, "logps/real": -204.80642700195312, "loss": 0.4363, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.3878477215766907, "rewards/margins": 15.205143928527832, "rewards/real": 14.817296028137207, "step": 3210 }, { "epoch": 0.3872053872053872, "grad_norm": 11.071793312837038, "learning_rate": 3.404596472474612e-07, "logits/generated": -2.2331137657165527, "logits/real": -2.268756151199341, "logps/generated": -513.5505981445312, "logps/real": -249.2364501953125, "loss": 0.4172, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.7772376537322998, "rewards/margins": 17.442907333374023, "rewards/real": 19.22014617919922, "step": 3220 }, { "epoch": 0.3884078884078884, "grad_norm": 598.1872828227879, "learning_rate": 3.397915553180117e-07, "logits/generated": -2.2873480319976807, "logits/real": -2.2719578742980957, "logps/generated": -500.8384704589844, "logps/real": -266.7796325683594, "loss": 0.4875, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.352444589138031, "rewards/margins": 19.39059066772461, "rewards/real": 19.743032455444336, "step": 3230 }, { "epoch": 0.38961038961038963, "grad_norm": 54.846973770309745, "learning_rate": 3.3912346338856223e-07, "logits/generated": -2.2452523708343506, "logits/real": -2.267148494720459, "logps/generated": -624.7496948242188, "logps/real": -285.53485107421875, "loss": 0.6026, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.275235891342163, "rewards/margins": 20.758686065673828, "rewards/real": 18.48345184326172, "step": 3240 }, { "epoch": 0.39081289081289083, "grad_norm": 23.725353307695027, "learning_rate": 3.3845537145911273e-07, "logits/generated": -2.2604598999023438, "logits/real": -2.3512320518493652, "logps/generated": -546.6399536132812, "logps/real": -243.18753051757812, "loss": 0.4084, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.921527862548828, "rewards/margins": 23.10455322265625, "rewards/real": 18.183025360107422, "step": 3250 }, { "epoch": 0.39201539201539204, "grad_norm": 13.713306471172793, "learning_rate": 3.377872795296633e-07, "logits/generated": -2.2504663467407227, "logits/real": -2.2905116081237793, "logps/generated": -529.0084838867188, "logps/real": -229.9738006591797, "loss": 0.6297, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.74764883518219, "rewards/margins": 17.978008270263672, "rewards/real": 16.23036003112793, "step": 3260 }, { "epoch": 0.39321789321789324, "grad_norm": 12.980384198553198, "learning_rate": 3.371191876002138e-07, "logits/generated": -2.337036609649658, "logits/real": -2.3831160068511963, "logps/generated": -556.7229614257812, "logps/real": -341.72174072265625, "loss": 0.6133, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.2307429015636444, "rewards/margins": 21.993549346923828, "rewards/real": 21.762807846069336, "step": 3270 }, { "epoch": 0.3944203944203944, "grad_norm": 7.486687417030773, "learning_rate": 3.364510956707643e-07, "logits/generated": -2.2695746421813965, "logits/real": -2.278357982635498, "logps/generated": -452.6968688964844, "logps/real": -254.2688446044922, "loss": 0.4876, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.17638777196407318, "rewards/margins": 16.836870193481445, "rewards/real": 17.013256072998047, "step": 3280 }, { "epoch": 0.3956228956228956, "grad_norm": 941.8749408006992, "learning_rate": 3.357830037413148e-07, "logits/generated": -2.335817337036133, "logits/real": -2.361940860748291, "logps/generated": -453.5274353027344, "logps/real": -245.0332489013672, "loss": 0.5833, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.624830722808838, "rewards/margins": 21.15799331665039, "rewards/real": 18.53316307067871, "step": 3290 }, { "epoch": 0.3968253968253968, "grad_norm": 50.215082443599826, "learning_rate": 3.351149118118653e-07, "logits/generated": -2.192233085632324, "logits/real": -2.311274528503418, "logps/generated": -544.9821166992188, "logps/real": -276.02166748046875, "loss": 0.4282, "rewards/accuracies": 0.949999988079071, "rewards/generated": 1.2815845012664795, "rewards/margins": 18.4959716796875, "rewards/real": 19.77755355834961, "step": 3300 }, { "epoch": 0.398027898027898, "grad_norm": 224.4088605821121, "learning_rate": 3.3444681988241577e-07, "logits/generated": -2.214669704437256, "logits/real": -2.250783681869507, "logps/generated": -562.6416015625, "logps/real": -280.8820495605469, "loss": 0.7211, "rewards/accuracies": 0.875, "rewards/generated": -2.61222505569458, "rewards/margins": 20.133581161499023, "rewards/real": 17.521358489990234, "step": 3310 }, { "epoch": 0.3992303992303992, "grad_norm": 5.858248953022507, "learning_rate": 3.337787279529663e-07, "logits/generated": -2.1052157878875732, "logits/real": -2.248359203338623, "logps/generated": -491.052001953125, "logps/real": -229.3201904296875, "loss": 0.2902, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.617383003234863, "rewards/margins": 19.45993995666504, "rewards/real": 14.842556953430176, "step": 3320 }, { "epoch": 0.4004329004329004, "grad_norm": 22.85722751121272, "learning_rate": 3.331106360235168e-07, "logits/generated": -2.177408456802368, "logits/real": -2.2191379070281982, "logps/generated": -416.21435546875, "logps/real": -212.9326934814453, "loss": 0.4874, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.6096454858779907, "rewards/margins": 16.685667037963867, "rewards/real": 15.076022148132324, "step": 3330 }, { "epoch": 0.4016354016354016, "grad_norm": 703.2679079252424, "learning_rate": 3.324425440940673e-07, "logits/generated": -2.2595977783203125, "logits/real": -2.320112943649292, "logps/generated": -574.4844970703125, "logps/real": -316.2807312011719, "loss": 0.3953, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.1208622455596924, "rewards/margins": 22.697834014892578, "rewards/real": 20.57697105407715, "step": 3340 }, { "epoch": 0.40283790283790283, "grad_norm": 6.410638733708917, "learning_rate": 3.3177445216461785e-07, "logits/generated": -2.208864688873291, "logits/real": -2.218881130218506, "logps/generated": -558.22900390625, "logps/real": -275.82513427734375, "loss": 0.3995, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.4745880365371704, "rewards/margins": 19.607568740844727, "rewards/real": 18.132980346679688, "step": 3350 }, { "epoch": 0.40404040404040403, "grad_norm": 172.69703943025226, "learning_rate": 3.3110636023516836e-07, "logits/generated": -2.2709243297576904, "logits/real": -2.2835915088653564, "logps/generated": -462.6661071777344, "logps/real": -222.7592315673828, "loss": 0.4266, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.892196536064148, "rewards/margins": 17.225933074951172, "rewards/real": 18.11812973022461, "step": 3360 }, { "epoch": 0.40524290524290524, "grad_norm": 7.743710313480041, "learning_rate": 3.3043826830571886e-07, "logits/generated": -2.2780072689056396, "logits/real": -2.2864248752593994, "logps/generated": -475.69305419921875, "logps/real": -212.77346801757812, "loss": 0.58, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.44416481256484985, "rewards/margins": 17.00520896911621, "rewards/real": 16.561044692993164, "step": 3370 }, { "epoch": 0.40644540644540644, "grad_norm": 241.93884894636207, "learning_rate": 3.2977017637626937e-07, "logits/generated": -2.261087656021118, "logits/real": -2.26119327545166, "logps/generated": -523.0970458984375, "logps/real": -363.1900329589844, "loss": 0.493, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.402726650238037, "rewards/margins": 23.13311767578125, "rewards/real": 20.73039436340332, "step": 3380 }, { "epoch": 0.40764790764790765, "grad_norm": 18.122692990817526, "learning_rate": 3.291020844468199e-07, "logits/generated": -2.1860594749450684, "logits/real": -2.2090258598327637, "logps/generated": -588.9990844726562, "logps/real": -290.1547546386719, "loss": 0.4412, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.9298200607299805, "rewards/margins": 23.909421920776367, "rewards/real": 18.979602813720703, "step": 3390 }, { "epoch": 0.40885040885040885, "grad_norm": 8.537903796099052, "learning_rate": 3.284339925173704e-07, "logits/generated": -2.2259185314178467, "logits/real": -2.2582287788391113, "logps/generated": -458.043701171875, "logps/real": -262.12109375, "loss": 0.4163, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.679396629333496, "rewards/margins": 22.194400787353516, "rewards/real": 15.515007019042969, "step": 3400 }, { "epoch": 0.41005291005291006, "grad_norm": 5.8980934306255035, "learning_rate": 3.2776590058792084e-07, "logits/generated": -2.29022479057312, "logits/real": -2.3379244804382324, "logps/generated": -522.6433715820312, "logps/real": -284.0138244628906, "loss": 0.4203, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.9177383184432983, "rewards/margins": 22.41234016418457, "rewards/real": 20.49460220336914, "step": 3410 }, { "epoch": 0.41125541125541126, "grad_norm": 232.41753097775643, "learning_rate": 3.2709780865847134e-07, "logits/generated": -2.226773738861084, "logits/real": -2.224693536758423, "logps/generated": -480.4476623535156, "logps/real": -225.9335479736328, "loss": 0.6141, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.496685028076172, "rewards/margins": 20.988861083984375, "rewards/real": 16.492176055908203, "step": 3420 }, { "epoch": 0.41245791245791247, "grad_norm": 21.747758384241585, "learning_rate": 3.264297167290219e-07, "logits/generated": -2.2422385215759277, "logits/real": -2.2999186515808105, "logps/generated": -509.33416748046875, "logps/real": -268.3351135253906, "loss": 0.4256, "rewards/accuracies": 1.0, "rewards/generated": 0.6797200441360474, "rewards/margins": 20.25993537902832, "rewards/real": 20.939655303955078, "step": 3430 }, { "epoch": 0.4136604136604137, "grad_norm": 9.188563537103336, "learning_rate": 3.257616247995724e-07, "logits/generated": -2.200892448425293, "logits/real": -2.267993450164795, "logps/generated": -523.612548828125, "logps/real": -216.20706176757812, "loss": 0.585, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.4908108711242676, "rewards/margins": 17.959659576416016, "rewards/real": 15.468846321105957, "step": 3440 }, { "epoch": 0.4148629148629149, "grad_norm": 18.877840035858405, "learning_rate": 3.250935328701229e-07, "logits/generated": -2.1867141723632812, "logits/real": -2.2638654708862305, "logps/generated": -608.2249755859375, "logps/real": -263.34625244140625, "loss": 0.7761, "rewards/accuracies": 1.0, "rewards/generated": -2.450798511505127, "rewards/margins": 21.54878044128418, "rewards/real": 19.09798240661621, "step": 3450 }, { "epoch": 0.4160654160654161, "grad_norm": 38.45033161024908, "learning_rate": 3.244254409406734e-07, "logits/generated": -2.297166585922241, "logits/real": -2.295908212661743, "logps/generated": -513.9171752929688, "logps/real": -284.8858947753906, "loss": 0.4688, "rewards/accuracies": 1.0, "rewards/generated": -1.543438196182251, "rewards/margins": 23.487194061279297, "rewards/real": 21.943756103515625, "step": 3460 }, { "epoch": 0.4172679172679173, "grad_norm": 218.30264679323815, "learning_rate": 3.2375734901122393e-07, "logits/generated": -2.250934600830078, "logits/real": -2.301692247390747, "logps/generated": -411.31549072265625, "logps/real": -227.10400390625, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/generated": -3.8222877979278564, "rewards/margins": 20.231616973876953, "rewards/real": 16.409326553344727, "step": 3470 }, { "epoch": 0.4184704184704185, "grad_norm": 105.4180123617748, "learning_rate": 3.2308925708177443e-07, "logits/generated": -2.2137606143951416, "logits/real": -2.2348432540893555, "logps/generated": -608.3657836914062, "logps/real": -308.8596496582031, "loss": 0.9847, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.6461491584777832, "rewards/margins": 18.258991241455078, "rewards/real": 18.905139923095703, "step": 3480 }, { "epoch": 0.4196729196729197, "grad_norm": 5.056705023112445, "learning_rate": 3.2242116515232494e-07, "logits/generated": -2.2457330226898193, "logits/real": -2.2896833419799805, "logps/generated": -521.7301025390625, "logps/real": -285.7955017089844, "loss": 0.3993, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.4160698652267456, "rewards/margins": 20.736743927001953, "rewards/real": 22.152812957763672, "step": 3490 }, { "epoch": 0.4208754208754209, "grad_norm": 54.09032063669176, "learning_rate": 3.2175307322287545e-07, "logits/generated": -2.2124075889587402, "logits/real": -2.244499921798706, "logps/generated": -446.72430419921875, "logps/real": -201.4197540283203, "loss": 0.6846, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.36500120162963867, "rewards/margins": 15.436958312988281, "rewards/real": 15.071955680847168, "step": 3500 }, { "epoch": 0.42207792207792205, "grad_norm": 400.72282075452534, "learning_rate": 3.21084981293426e-07, "logits/generated": -2.237973690032959, "logits/real": -2.262101888656616, "logps/generated": -420.045654296875, "logps/real": -225.34780883789062, "loss": 0.4097, "rewards/accuracies": 1.0, "rewards/generated": -1.5613340139389038, "rewards/margins": 18.243972778320312, "rewards/real": 16.68263816833496, "step": 3510 }, { "epoch": 0.42328042328042326, "grad_norm": 3.719521776829346, "learning_rate": 3.2041688936397646e-07, "logits/generated": -2.217008113861084, "logits/real": -2.2482199668884277, "logps/generated": -564.4690551757812, "logps/real": -277.1367492675781, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/generated": -1.776555061340332, "rewards/margins": 22.30672836303711, "rewards/real": 20.53017234802246, "step": 3520 }, { "epoch": 0.42448292448292446, "grad_norm": 73.99862278530746, "learning_rate": 3.1974879743452697e-07, "logits/generated": -2.2452683448791504, "logits/real": -2.2988827228546143, "logps/generated": -420.21697998046875, "logps/real": -247.0130615234375, "loss": 0.6884, "rewards/accuracies": 0.925000011920929, "rewards/generated": 3.086369752883911, "rewards/margins": 16.89458465576172, "rewards/real": 19.980953216552734, "step": 3530 }, { "epoch": 0.42568542568542567, "grad_norm": 531.4862807469356, "learning_rate": 3.1908070550507747e-07, "logits/generated": -2.206820011138916, "logits/real": -2.348249912261963, "logps/generated": -545.2318725585938, "logps/real": -235.8459014892578, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/generated": -3.8593764305114746, "rewards/margins": 24.526968002319336, "rewards/real": 20.667591094970703, "step": 3540 }, { "epoch": 0.42688792688792687, "grad_norm": 17.662058717147133, "learning_rate": 3.18412613575628e-07, "logits/generated": -2.2284553050994873, "logits/real": -2.302344799041748, "logps/generated": -401.02239990234375, "logps/real": -237.73046875, "loss": 0.9592, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.375710129737854, "rewards/margins": 17.94776725769043, "rewards/real": 19.323474884033203, "step": 3550 }, { "epoch": 0.4280904280904281, "grad_norm": 74.29450182082346, "learning_rate": 3.177445216461785e-07, "logits/generated": -2.011335849761963, "logits/real": -2.1957554817199707, "logps/generated": -577.8146362304688, "logps/real": -302.3636474609375, "loss": 0.5901, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.170976161956787, "rewards/margins": 27.727657318115234, "rewards/real": 20.556682586669922, "step": 3560 }, { "epoch": 0.4292929292929293, "grad_norm": 54.82557002704048, "learning_rate": 3.17076429716729e-07, "logits/generated": -2.0680551528930664, "logits/real": -2.0925698280334473, "logps/generated": -490.8046875, "logps/real": -186.53109741210938, "loss": 0.4297, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.373655796051025, "rewards/margins": 19.686954498291016, "rewards/real": 12.313298225402832, "step": 3570 }, { "epoch": 0.4304954304954305, "grad_norm": 269.07808979589714, "learning_rate": 3.164083377872795e-07, "logits/generated": -2.0910630226135254, "logits/real": -2.130899667739868, "logps/generated": -534.9212036132812, "logps/real": -245.68911743164062, "loss": 0.8762, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.103806495666504, "rewards/margins": 20.604034423828125, "rewards/real": 15.500228881835938, "step": 3580 }, { "epoch": 0.4316979316979317, "grad_norm": 156.16286324114017, "learning_rate": 3.1574024585783006e-07, "logits/generated": -2.138777256011963, "logits/real": -2.227443218231201, "logps/generated": -550.2257080078125, "logps/real": -302.7833251953125, "loss": 0.4447, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.9097096920013428, "rewards/margins": 22.903656005859375, "rewards/real": 20.993946075439453, "step": 3590 }, { "epoch": 0.4329004329004329, "grad_norm": 316.686145759838, "learning_rate": 3.1507215392838057e-07, "logits/generated": -2.1031007766723633, "logits/real": -2.177794933319092, "logps/generated": -618.0108642578125, "logps/real": -300.5312194824219, "loss": 0.4535, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.1155319213867188, "rewards/margins": 24.038068771362305, "rewards/real": 22.92254066467285, "step": 3600 }, { "epoch": 0.4341029341029341, "grad_norm": 13.746064790512671, "learning_rate": 3.1440406199893107e-07, "logits/generated": -2.04689884185791, "logits/real": -2.219418525695801, "logps/generated": -584.58935546875, "logps/real": -309.8716735839844, "loss": 0.585, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.4922397136688232, "rewards/margins": 26.610736846923828, "rewards/real": 23.118497848510742, "step": 3610 }, { "epoch": 0.4353054353054353, "grad_norm": 19.192953699515723, "learning_rate": 3.137359700694815e-07, "logits/generated": -2.0190205574035645, "logits/real": -2.170254945755005, "logps/generated": -444.9599609375, "logps/real": -289.107421875, "loss": 0.5915, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.893264055252075, "rewards/margins": 22.015300750732422, "rewards/real": 18.12203598022461, "step": 3620 }, { "epoch": 0.4365079365079365, "grad_norm": 117.16286334872296, "learning_rate": 3.1306787814003203e-07, "logits/generated": -2.17712140083313, "logits/real": -2.260840892791748, "logps/generated": -581.529541015625, "logps/real": -312.60406494140625, "loss": 0.5468, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.3088008165359497, "rewards/margins": 20.99437141418457, "rewards/real": 19.685569763183594, "step": 3630 }, { "epoch": 0.4377104377104377, "grad_norm": 84.84598138327165, "learning_rate": 3.1239978621058254e-07, "logits/generated": -2.0735323429107666, "logits/real": -2.177410125732422, "logps/generated": -622.9395751953125, "logps/real": -268.78997802734375, "loss": 0.4778, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.399700164794922, "rewards/margins": 24.45461082458496, "rewards/real": 17.054908752441406, "step": 3640 }, { "epoch": 0.4389129389129389, "grad_norm": 211.7406557120953, "learning_rate": 3.1173169428113304e-07, "logits/generated": -2.113731861114502, "logits/real": -2.238961696624756, "logps/generated": -516.693115234375, "logps/real": -262.43408203125, "loss": 0.4155, "rewards/accuracies": 1.0, "rewards/generated": -3.0317835807800293, "rewards/margins": 22.840307235717773, "rewards/real": 19.808523178100586, "step": 3650 }, { "epoch": 0.4401154401154401, "grad_norm": 55.42098247030718, "learning_rate": 3.1106360235168355e-07, "logits/generated": -2.196582078933716, "logits/real": -2.285003423690796, "logps/generated": -487.26318359375, "logps/real": -242.1387481689453, "loss": 0.469, "rewards/accuracies": 1.0, "rewards/generated": -3.281810760498047, "rewards/margins": 19.729379653930664, "rewards/real": 16.44757080078125, "step": 3660 }, { "epoch": 0.44131794131794133, "grad_norm": 391.66853943369136, "learning_rate": 3.103955104222341e-07, "logits/generated": -2.206817388534546, "logits/real": -2.2829651832580566, "logps/generated": -533.7303466796875, "logps/real": -289.9284973144531, "loss": 0.8139, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.014866828918457, "rewards/margins": 23.939260482788086, "rewards/real": 19.924394607543945, "step": 3670 }, { "epoch": 0.44252044252044254, "grad_norm": 6.573225525685875, "learning_rate": 3.097274184927846e-07, "logits/generated": -2.1602535247802734, "logits/real": -2.2709264755249023, "logps/generated": -467.51837158203125, "logps/real": -187.49595642089844, "loss": 0.3035, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.115992069244385, "rewards/margins": 21.21573829650879, "rewards/real": 14.099746704101562, "step": 3680 }, { "epoch": 0.44372294372294374, "grad_norm": 7.0052738704130375, "learning_rate": 3.090593265633351e-07, "logits/generated": -2.2725062370300293, "logits/real": -2.316488742828369, "logps/generated": -489.68359375, "logps/real": -240.08065795898438, "loss": 0.5854, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.560263991355896, "rewards/margins": 19.86050796508789, "rewards/real": 21.420772552490234, "step": 3690 }, { "epoch": 0.44492544492544495, "grad_norm": 2.4838609997369243, "learning_rate": 3.0839123463388563e-07, "logits/generated": -2.1661388874053955, "logits/real": -2.229221820831299, "logps/generated": -693.8267822265625, "logps/real": -265.43084716796875, "loss": 0.647, "rewards/accuracies": 1.0, "rewards/generated": -6.291512489318848, "rewards/margins": 24.833316802978516, "rewards/real": 18.541805267333984, "step": 3700 }, { "epoch": 0.44612794612794615, "grad_norm": 21.20920717884601, "learning_rate": 3.0772314270443614e-07, "logits/generated": -2.2226345539093018, "logits/real": -2.341580867767334, "logps/generated": -478.16461181640625, "logps/real": -295.2587585449219, "loss": 0.5038, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.17221125960350037, "rewards/margins": 24.30082893371582, "rewards/real": 24.128616333007812, "step": 3710 }, { "epoch": 0.44733044733044736, "grad_norm": 173.4538878199435, "learning_rate": 3.070550507749866e-07, "logits/generated": -2.0368051528930664, "logits/real": -2.2117326259613037, "logps/generated": -567.9580078125, "logps/real": -254.87460327148438, "loss": 0.5571, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.8097100257873535, "rewards/margins": 26.1510066986084, "rewards/real": 19.341297149658203, "step": 3720 }, { "epoch": 0.4485329485329485, "grad_norm": 7.3825012679693165, "learning_rate": 3.063869588455371e-07, "logits/generated": -2.0971477031707764, "logits/real": -2.150735855102539, "logps/generated": -513.3101806640625, "logps/real": -224.2945556640625, "loss": 0.5287, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.508622646331787, "rewards/margins": 23.58926773071289, "rewards/real": 17.080646514892578, "step": 3730 }, { "epoch": 0.4497354497354497, "grad_norm": 5.180936608677949, "learning_rate": 3.057188669160876e-07, "logits/generated": -2.1248841285705566, "logits/real": -2.1839470863342285, "logps/generated": -437.79107666015625, "logps/real": -202.60720825195312, "loss": 0.6053, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.073121547698975, "rewards/margins": 22.3405704498291, "rewards/real": 16.26744842529297, "step": 3740 }, { "epoch": 0.4509379509379509, "grad_norm": 864.6075584973929, "learning_rate": 3.0505077498663816e-07, "logits/generated": -2.140291690826416, "logits/real": -2.2139744758605957, "logps/generated": -533.6538696289062, "logps/real": -227.28787231445312, "loss": 0.6704, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.38225793838501, "rewards/margins": 23.71435546875, "rewards/real": 18.332096099853516, "step": 3750 }, { "epoch": 0.4521404521404521, "grad_norm": 216.3524497920051, "learning_rate": 3.0438268305718867e-07, "logits/generated": -2.2790138721466064, "logits/real": -2.357851028442383, "logps/generated": -640.957763671875, "logps/real": -423.6236267089844, "loss": 0.489, "rewards/accuracies": 0.949999988079071, "rewards/generated": 4.965917110443115, "rewards/margins": 26.667720794677734, "rewards/real": 31.63364028930664, "step": 3760 }, { "epoch": 0.4533429533429533, "grad_norm": 159.65228346706937, "learning_rate": 3.037145911277392e-07, "logits/generated": -2.1005051136016846, "logits/real": -2.237699031829834, "logps/generated": -390.77569580078125, "logps/real": -202.80323791503906, "loss": 0.4214, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.24750280380249, "rewards/margins": 22.559703826904297, "rewards/real": 15.312200546264648, "step": 3770 }, { "epoch": 0.45454545454545453, "grad_norm": 25.65213333469863, "learning_rate": 3.030464991982897e-07, "logits/generated": -2.240879535675049, "logits/real": -2.274664878845215, "logps/generated": -534.3191528320312, "logps/real": -239.8083038330078, "loss": 0.6036, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.094273090362549, "rewards/margins": 20.56509017944336, "rewards/real": 18.470813751220703, "step": 3780 }, { "epoch": 0.45574795574795574, "grad_norm": 12.046752007252833, "learning_rate": 3.023784072688402e-07, "logits/generated": -2.2010159492492676, "logits/real": -2.255561590194702, "logps/generated": -407.95745849609375, "logps/real": -196.4077911376953, "loss": 0.3797, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.5161245465278625, "rewards/margins": 14.831266403198242, "rewards/real": 14.315142631530762, "step": 3790 }, { "epoch": 0.45695045695045694, "grad_norm": 37.34126795951205, "learning_rate": 3.017103153393907e-07, "logits/generated": -2.237163782119751, "logits/real": -2.309365749359131, "logps/generated": -512.1398315429688, "logps/real": -200.3365478515625, "loss": 0.6084, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 0.10469992458820343, "rewards/margins": 15.552648544311523, "rewards/real": 15.6573486328125, "step": 3800 }, { "epoch": 0.45815295815295815, "grad_norm": 397.40898206986094, "learning_rate": 3.010422234099412e-07, "logits/generated": -2.2593846321105957, "logits/real": -2.2778258323669434, "logps/generated": -539.22998046875, "logps/real": -222.98300170898438, "loss": 0.3289, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.4914422035217285, "rewards/margins": 21.29345703125, "rewards/real": 17.802013397216797, "step": 3810 }, { "epoch": 0.45935545935545935, "grad_norm": 156.24867212632122, "learning_rate": 3.0037413148049165e-07, "logits/generated": -2.2712087631225586, "logits/real": -2.3158586025238037, "logps/generated": -462.0484924316406, "logps/real": -261.26666259765625, "loss": 0.531, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 3.1764886379241943, "rewards/margins": 17.392261505126953, "rewards/real": 20.56875228881836, "step": 3820 }, { "epoch": 0.46055796055796056, "grad_norm": 283.6040019447855, "learning_rate": 2.997060395510422e-07, "logits/generated": -2.2225520610809326, "logits/real": -2.327397346496582, "logps/generated": -682.3387451171875, "logps/real": -340.1170959472656, "loss": 0.4905, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.2472844123840332, "rewards/margins": 24.164981842041016, "rewards/real": 24.412263870239258, "step": 3830 }, { "epoch": 0.46176046176046176, "grad_norm": 20.895431585751865, "learning_rate": 2.990379476215927e-07, "logits/generated": -2.271763801574707, "logits/real": -2.362795352935791, "logps/generated": -500.9606018066406, "logps/real": -245.93502807617188, "loss": 0.586, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.9510433673858643, "rewards/margins": 17.889434814453125, "rewards/real": 20.84048080444336, "step": 3840 }, { "epoch": 0.46296296296296297, "grad_norm": 3.300174115179123, "learning_rate": 2.983698556921432e-07, "logits/generated": -2.1928889751434326, "logits/real": -2.233339786529541, "logps/generated": -487.408447265625, "logps/real": -275.59490966796875, "loss": 0.4336, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.8288896679878235, "rewards/margins": 20.417415618896484, "rewards/real": 19.58852767944336, "step": 3850 }, { "epoch": 0.46416546416546417, "grad_norm": 391.80759150042627, "learning_rate": 2.9770176376269373e-07, "logits/generated": -2.252769947052002, "logits/real": -2.323683977127075, "logps/generated": -461.66595458984375, "logps/real": -276.63360595703125, "loss": 0.4622, "rewards/accuracies": 0.875, "rewards/generated": 0.5805469751358032, "rewards/margins": 18.976581573486328, "rewards/real": 19.55712890625, "step": 3860 }, { "epoch": 0.4653679653679654, "grad_norm": 510.6974364883727, "learning_rate": 2.9703367183324424e-07, "logits/generated": -2.3645882606506348, "logits/real": -2.3484604358673096, "logps/generated": -489.74993896484375, "logps/real": -312.33343505859375, "loss": 0.617, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.48322534561157227, "rewards/margins": 19.076953887939453, "rewards/real": 19.560176849365234, "step": 3870 }, { "epoch": 0.4665704665704666, "grad_norm": 488.6204227927367, "learning_rate": 2.9636557990379475e-07, "logits/generated": -2.288896083831787, "logits/real": -2.309587240219116, "logps/generated": -503.6145935058594, "logps/real": -268.85858154296875, "loss": 0.7618, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.37723970413208, "rewards/margins": 19.37235450744629, "rewards/real": 21.74959373474121, "step": 3880 }, { "epoch": 0.4677729677729678, "grad_norm": 13.171927751582817, "learning_rate": 2.9569748797434525e-07, "logits/generated": -2.223548173904419, "logits/real": -2.239373207092285, "logps/generated": -525.7351684570312, "logps/real": -271.37725830078125, "loss": 0.4788, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.04958176612854, "rewards/margins": 18.80972671508789, "rewards/real": 17.760143280029297, "step": 3890 }, { "epoch": 0.468975468975469, "grad_norm": 167.68241170986718, "learning_rate": 2.9502939604489576e-07, "logits/generated": -2.185739040374756, "logits/real": -2.288041591644287, "logps/generated": -701.4237060546875, "logps/real": -303.81219482421875, "loss": 0.5354, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.0823731422424316, "rewards/margins": 26.959665298461914, "rewards/real": 25.877288818359375, "step": 3900 }, { "epoch": 0.4701779701779702, "grad_norm": 4.075691179085272, "learning_rate": 2.943613041154463e-07, "logits/generated": -2.1385841369628906, "logits/real": -2.1470489501953125, "logps/generated": -504.74237060546875, "logps/real": -223.2626190185547, "loss": 0.4364, "rewards/accuracies": 1.0, "rewards/generated": -6.160588264465332, "rewards/margins": 24.97075843811035, "rewards/real": 18.810171127319336, "step": 3910 }, { "epoch": 0.4713804713804714, "grad_norm": 204.0725652594611, "learning_rate": 2.9369321218599677e-07, "logits/generated": -2.1229546070098877, "logits/real": -2.2415804862976074, "logps/generated": -464.67620849609375, "logps/real": -272.47393798828125, "loss": 0.4992, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.2768571376800537, "rewards/margins": 23.420005798339844, "rewards/real": 21.143152236938477, "step": 3920 }, { "epoch": 0.4725829725829726, "grad_norm": 3.7067046281291276, "learning_rate": 2.930251202565473e-07, "logits/generated": -2.0180985927581787, "logits/real": -2.087268590927124, "logps/generated": -467.0703125, "logps/real": -187.01832580566406, "loss": 0.3222, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.516222953796387, "rewards/margins": 18.26113510131836, "rewards/real": 12.744909286499023, "step": 3930 }, { "epoch": 0.4737854737854738, "grad_norm": 50.38328578695344, "learning_rate": 2.923570283270978e-07, "logits/generated": -2.1050660610198975, "logits/real": -2.141570568084717, "logps/generated": -579.7655029296875, "logps/real": -259.5626525878906, "loss": 0.5136, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.849890232086182, "rewards/margins": 25.93613624572754, "rewards/real": 20.086244583129883, "step": 3940 }, { "epoch": 0.474987974987975, "grad_norm": 16.680861667054387, "learning_rate": 2.916889363976483e-07, "logits/generated": -2.0800085067749023, "logits/real": -2.2065744400024414, "logps/generated": -606.1420288085938, "logps/real": -282.26123046875, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/generated": -5.822685718536377, "rewards/margins": 26.468358993530273, "rewards/real": 20.645671844482422, "step": 3950 }, { "epoch": 0.47619047619047616, "grad_norm": 54.59701336045353, "learning_rate": 2.910208444681988e-07, "logits/generated": -2.2042269706726074, "logits/real": -2.208150625228882, "logps/generated": -578.5890502929688, "logps/real": -277.19866943359375, "loss": 0.4799, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.8450977206230164, "rewards/margins": 21.83233070373535, "rewards/real": 22.677427291870117, "step": 3960 }, { "epoch": 0.47739297739297737, "grad_norm": 4.805880140636131, "learning_rate": 2.903527525387493e-07, "logits/generated": -2.2389702796936035, "logits/real": -2.255767583847046, "logps/generated": -519.0675659179688, "logps/real": -219.37637329101562, "loss": 0.4427, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.1465516090393066, "rewards/margins": 19.872838973999023, "rewards/real": 16.726289749145508, "step": 3970 }, { "epoch": 0.4785954785954786, "grad_norm": 64.9203979827319, "learning_rate": 2.896846606092998e-07, "logits/generated": -2.1840157508850098, "logits/real": -2.27323317527771, "logps/generated": -549.7556762695312, "logps/real": -334.9892272949219, "loss": 0.6059, "rewards/accuracies": 0.875, "rewards/generated": 0.023430729284882545, "rewards/margins": 22.822460174560547, "rewards/real": 22.845895767211914, "step": 3980 }, { "epoch": 0.4797979797979798, "grad_norm": 382.27072011663375, "learning_rate": 2.8901656867985037e-07, "logits/generated": -2.1771812438964844, "logits/real": -2.262188196182251, "logps/generated": -599.5274658203125, "logps/real": -259.5793762207031, "loss": 0.4793, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.041812896728516, "rewards/margins": 28.17879867553711, "rewards/real": 20.13698959350586, "step": 3990 }, { "epoch": 0.481000481000481, "grad_norm": 10.010252032118698, "learning_rate": 2.883484767504009e-07, "logits/generated": -2.1737263202667236, "logits/real": -2.2469887733459473, "logps/generated": -543.7032470703125, "logps/real": -244.39498901367188, "loss": 0.5971, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.1522727012634277, "rewards/margins": 21.82972526550293, "rewards/real": 18.677452087402344, "step": 4000 }, { "epoch": 0.481000481000481, "eval_logits/generated": -2.240758180618286, "eval_logits/real": -2.287383556365967, "eval_logps/generated": -494.74700927734375, "eval_logps/real": -266.41497802734375, "eval_loss": 0.43489810824394226, "eval_rewards/accuracies": 0.9553571343421936, "eval_rewards/generated": -0.49654555320739746, "eval_rewards/margins": 20.982145309448242, "eval_rewards/real": 20.485599517822266, "eval_runtime": 159.3593, "eval_samples_per_second": 6.275, "eval_steps_per_second": 0.527, "step": 4000 }, { "epoch": 0.4822029822029822, "grad_norm": 6.445596093537904, "learning_rate": 2.876803848209514e-07, "logits/generated": -2.178842067718506, "logits/real": -2.245429277420044, "logps/generated": -537.5115966796875, "logps/real": -223.5152130126953, "loss": 0.2816, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.587291717529297, "rewards/margins": 20.84734535217285, "rewards/real": 15.260052680969238, "step": 4010 }, { "epoch": 0.4834054834054834, "grad_norm": 181.80066698601422, "learning_rate": 2.8701229289150184e-07, "logits/generated": -2.1754794120788574, "logits/real": -2.26991605758667, "logps/generated": -446.3489685058594, "logps/real": -302.1445007324219, "loss": 0.8848, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.6614997386932373, "rewards/margins": 19.420612335205078, "rewards/real": 21.08211326599121, "step": 4020 }, { "epoch": 0.4846079846079846, "grad_norm": 10.999344866462478, "learning_rate": 2.8634420096205234e-07, "logits/generated": -2.2095115184783936, "logits/real": -2.3496994972229004, "logps/generated": -639.6976318359375, "logps/real": -293.78436279296875, "loss": 0.6402, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 4.128434181213379, "rewards/margins": 23.227733612060547, "rewards/real": 27.356164932250977, "step": 4030 }, { "epoch": 0.4858104858104858, "grad_norm": 49.49335315176659, "learning_rate": 2.8567610903260285e-07, "logits/generated": -2.1898367404937744, "logits/real": -2.2683277130126953, "logps/generated": -567.0438232421875, "logps/real": -238.8550262451172, "loss": 0.6797, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.44430685043335, "rewards/margins": 25.625652313232422, "rewards/real": 21.181346893310547, "step": 4040 }, { "epoch": 0.487012987012987, "grad_norm": 1352.6411990032718, "learning_rate": 2.8500801710315336e-07, "logits/generated": -2.189768075942993, "logits/real": -2.24078106880188, "logps/generated": -406.25115966796875, "logps/real": -223.9019317626953, "loss": 0.7146, "rewards/accuracies": 0.875, "rewards/generated": -4.249040126800537, "rewards/margins": 18.82101058959961, "rewards/real": 14.571968078613281, "step": 4050 }, { "epoch": 0.4882154882154882, "grad_norm": 9.88780533710644, "learning_rate": 2.8433992517370386e-07, "logits/generated": -2.208832263946533, "logits/real": -2.219021797180176, "logps/generated": -428.3487243652344, "logps/real": -189.39505004882812, "loss": 0.4363, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.900750637054443, "rewards/margins": 19.817745208740234, "rewards/real": 13.916994094848633, "step": 4060 }, { "epoch": 0.4894179894179894, "grad_norm": 141.58807713316912, "learning_rate": 2.836718332442544e-07, "logits/generated": -2.2209315299987793, "logits/real": -2.262233257293701, "logps/generated": -545.9734497070312, "logps/real": -252.4385528564453, "loss": 0.441, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.508567810058594, "rewards/margins": 24.501379013061523, "rewards/real": 19.992813110351562, "step": 4070 }, { "epoch": 0.4906204906204906, "grad_norm": 9.10219444841313, "learning_rate": 2.8300374131480493e-07, "logits/generated": -2.22090744972229, "logits/real": -2.2742881774902344, "logps/generated": -505.38427734375, "logps/real": -247.7169189453125, "loss": 0.4201, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.9506261348724365, "rewards/margins": 20.888425827026367, "rewards/real": 18.937801361083984, "step": 4080 }, { "epoch": 0.49182299182299183, "grad_norm": 2.509799430542211, "learning_rate": 2.8233564938535543e-07, "logits/generated": -2.089916229248047, "logits/real": -2.238445997238159, "logps/generated": -484.5831604003906, "logps/real": -222.5631866455078, "loss": 0.4783, "rewards/accuracies": 1.0, "rewards/generated": -7.338548183441162, "rewards/margins": 24.402645111083984, "rewards/real": 17.064098358154297, "step": 4090 }, { "epoch": 0.49302549302549303, "grad_norm": 801.1080653021605, "learning_rate": 2.8166755745590594e-07, "logits/generated": -2.205252170562744, "logits/real": -2.2826685905456543, "logps/generated": -569.2166748046875, "logps/real": -209.69137573242188, "loss": 0.5325, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.447823524475098, "rewards/margins": 21.286632537841797, "rewards/real": 15.838808059692383, "step": 4100 }, { "epoch": 0.49422799422799424, "grad_norm": 13.449589197643624, "learning_rate": 2.8099946552645645e-07, "logits/generated": -2.1758921146392822, "logits/real": -2.2592873573303223, "logps/generated": -606.7257690429688, "logps/real": -278.8661804199219, "loss": 0.5223, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.077622413635254, "rewards/margins": 24.696016311645508, "rewards/real": 19.61839485168457, "step": 4110 }, { "epoch": 0.49543049543049544, "grad_norm": 41.94418584186682, "learning_rate": 2.803313735970069e-07, "logits/generated": -2.265984058380127, "logits/real": -2.2622601985931396, "logps/generated": -510.1703186035156, "logps/real": -228.3834228515625, "loss": 0.4545, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.8092894554138184, "rewards/margins": 22.863197326660156, "rewards/real": 19.053909301757812, "step": 4120 }, { "epoch": 0.49663299663299665, "grad_norm": 76.52416670658293, "learning_rate": 2.796632816675574e-07, "logits/generated": -2.324951648712158, "logits/real": -2.3724920749664307, "logps/generated": -561.64892578125, "logps/real": -283.00494384765625, "loss": 1.0526, "rewards/accuracies": 0.875, "rewards/generated": 3.573587417602539, "rewards/margins": 20.410449981689453, "rewards/real": 23.984039306640625, "step": 4130 }, { "epoch": 0.49783549783549785, "grad_norm": 241.36928343336322, "learning_rate": 2.789951897381079e-07, "logits/generated": -2.2231245040893555, "logits/real": -2.29709529876709, "logps/generated": -488.70989990234375, "logps/real": -226.192138671875, "loss": 0.4705, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -1.9012588262557983, "rewards/margins": 20.033512115478516, "rewards/real": 18.132253646850586, "step": 4140 }, { "epoch": 0.49903799903799906, "grad_norm": 489.43241464733893, "learning_rate": 2.7832709780865847e-07, "logits/generated": -2.2443854808807373, "logits/real": -2.2525110244750977, "logps/generated": -504.917724609375, "logps/real": -257.2982177734375, "loss": 0.6455, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.7884670495986938, "rewards/margins": 17.46521759033203, "rewards/real": 19.25368309020996, "step": 4150 }, { "epoch": 0.5002405002405003, "grad_norm": 12.460598594612183, "learning_rate": 2.77659005879209e-07, "logits/generated": -2.1264469623565674, "logits/real": -2.263762950897217, "logps/generated": -571.4362182617188, "logps/real": -244.6605987548828, "loss": 0.356, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.415511131286621, "rewards/margins": 23.671628952026367, "rewards/real": 18.256114959716797, "step": 4160 }, { "epoch": 0.5014430014430015, "grad_norm": 103.6520781489663, "learning_rate": 2.769909139497595e-07, "logits/generated": -2.121946334838867, "logits/real": -2.2184207439422607, "logps/generated": -561.1491088867188, "logps/real": -255.83102416992188, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/generated": -0.4852834641933441, "rewards/margins": 22.426822662353516, "rewards/real": 21.941539764404297, "step": 4170 }, { "epoch": 0.5026455026455027, "grad_norm": 6.917698758779508, "learning_rate": 2.7632282202031e-07, "logits/generated": -2.1365036964416504, "logits/real": -2.297579526901245, "logps/generated": -497.52410888671875, "logps/real": -276.71240234375, "loss": 0.3082, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -2.8837666511535645, "rewards/margins": 25.9765682220459, "rewards/real": 23.092803955078125, "step": 4180 }, { "epoch": 0.5038480038480039, "grad_norm": 35.29472337904465, "learning_rate": 2.756547300908605e-07, "logits/generated": -2.231517791748047, "logits/real": -2.3270859718322754, "logps/generated": -512.0706176757812, "logps/real": -302.6792297363281, "loss": 0.5751, "rewards/accuracies": 0.875, "rewards/generated": 3.048867702484131, "rewards/margins": 19.952205657958984, "rewards/real": 23.00107192993164, "step": 4190 }, { "epoch": 0.5050505050505051, "grad_norm": 5.824501053955008, "learning_rate": 2.74986638161411e-07, "logits/generated": -2.1285107135772705, "logits/real": -2.1955060958862305, "logps/generated": -407.77099609375, "logps/real": -229.5274200439453, "loss": 0.336, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.16152045130729675, "rewards/margins": 18.308820724487305, "rewards/real": 18.470340728759766, "step": 4200 }, { "epoch": 0.5062530062530063, "grad_norm": 425.6269134016861, "learning_rate": 2.743185462319615e-07, "logits/generated": -2.1789069175720215, "logits/real": -2.305410385131836, "logps/generated": -587.2059326171875, "logps/real": -291.29583740234375, "loss": 0.6028, "rewards/accuracies": 0.925000011920929, "rewards/generated": 4.074764251708984, "rewards/margins": 20.289831161499023, "rewards/real": 24.36459732055664, "step": 4210 }, { "epoch": 0.5074555074555075, "grad_norm": 17.987909722350626, "learning_rate": 2.7365045430251197e-07, "logits/generated": -2.1587955951690674, "logits/real": -2.255331516265869, "logps/generated": -565.9151000976562, "logps/real": -317.25189208984375, "loss": 0.8092, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.0316405296325684, "rewards/margins": 25.426677703857422, "rewards/real": 23.395034790039062, "step": 4220 }, { "epoch": 0.5086580086580087, "grad_norm": 17.09561538762422, "learning_rate": 2.729823623730625e-07, "logits/generated": -2.098085641860962, "logits/real": -2.2109179496765137, "logps/generated": -608.9376831054688, "logps/real": -244.25595092773438, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/generated": -7.205714225769043, "rewards/margins": 24.17291831970215, "rewards/real": 16.96720314025879, "step": 4230 }, { "epoch": 0.5098605098605099, "grad_norm": 4.895647233741109, "learning_rate": 2.7231427044361303e-07, "logits/generated": -2.158693790435791, "logits/real": -2.3043694496154785, "logps/generated": -489.5899353027344, "logps/real": -231.9427032470703, "loss": 0.4309, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.786470890045166, "rewards/margins": 23.3662052154541, "rewards/real": 20.579734802246094, "step": 4240 }, { "epoch": 0.5110630110630111, "grad_norm": 61.37309275152032, "learning_rate": 2.7164617851416354e-07, "logits/generated": -2.2396254539489746, "logits/real": -2.316911220550537, "logps/generated": -544.9541625976562, "logps/real": -302.1190185546875, "loss": 0.7248, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 2.833289623260498, "rewards/margins": 22.39664077758789, "rewards/real": 25.229928970336914, "step": 4250 }, { "epoch": 0.5122655122655123, "grad_norm": 16.05886857939807, "learning_rate": 2.7097808658471404e-07, "logits/generated": -2.2014529705047607, "logits/real": -2.3276166915893555, "logps/generated": -540.337158203125, "logps/real": -325.4220886230469, "loss": 0.5844, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.1857694387435913, "rewards/margins": 27.976749420166016, "rewards/real": 26.79098129272461, "step": 4260 }, { "epoch": 0.5134680134680135, "grad_norm": 154.12658595517766, "learning_rate": 2.7030999465526455e-07, "logits/generated": -2.1820855140686035, "logits/real": -2.2535789012908936, "logps/generated": -452.01446533203125, "logps/real": -194.50796508789062, "loss": 0.5686, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.419161319732666, "rewards/margins": 18.813814163208008, "rewards/real": 16.3946533203125, "step": 4270 }, { "epoch": 0.5146705146705147, "grad_norm": 388.34247577646323, "learning_rate": 2.6964190272581506e-07, "logits/generated": -2.1291770935058594, "logits/real": -2.2273473739624023, "logps/generated": -515.061767578125, "logps/real": -171.399658203125, "loss": 0.7451, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -7.2243475914001465, "rewards/margins": 21.68121910095215, "rewards/real": 14.456872940063477, "step": 4280 }, { "epoch": 0.5158730158730159, "grad_norm": 9.629887855782778, "learning_rate": 2.6897381079636556e-07, "logits/generated": -2.1529221534729004, "logits/real": -2.2144412994384766, "logps/generated": -592.399169921875, "logps/real": -262.12939453125, "loss": 0.8383, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.442110776901245, "rewards/margins": 22.367305755615234, "rewards/real": 19.925195693969727, "step": 4290 }, { "epoch": 0.517075517075517, "grad_norm": 37.95328601503405, "learning_rate": 2.6830571886691607e-07, "logits/generated": -2.1695070266723633, "logits/real": -2.266334056854248, "logps/generated": -548.8338623046875, "logps/real": -218.3822479248047, "loss": 0.4624, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.688658237457275, "rewards/margins": 21.59425163269043, "rewards/real": 16.905593872070312, "step": 4300 }, { "epoch": 0.5182780182780182, "grad_norm": 462.2514751934306, "learning_rate": 2.6763762693746663e-07, "logits/generated": -2.2629504203796387, "logits/real": -2.304380416870117, "logps/generated": -492.2303161621094, "logps/real": -231.40292358398438, "loss": 0.9736, "rewards/accuracies": 0.875, "rewards/generated": 0.7846146821975708, "rewards/margins": 18.682939529418945, "rewards/real": 19.467554092407227, "step": 4310 }, { "epoch": 0.5194805194805194, "grad_norm": 642.0571785403265, "learning_rate": 2.6696953500801714e-07, "logits/generated": -2.064579486846924, "logits/real": -2.1611812114715576, "logps/generated": -471.14105224609375, "logps/real": -195.31710815429688, "loss": 0.5209, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.035947799682617, "rewards/margins": 20.410633087158203, "rewards/real": 14.37468433380127, "step": 4320 }, { "epoch": 0.5206830206830206, "grad_norm": 4.065203395958874, "learning_rate": 2.663014430785676e-07, "logits/generated": -2.0566446781158447, "logits/real": -2.177860975265503, "logps/generated": -618.2171630859375, "logps/real": -298.42803955078125, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/generated": -10.0460844039917, "rewards/margins": 29.6387996673584, "rewards/real": 19.59271812438965, "step": 4330 }, { "epoch": 0.5218855218855218, "grad_norm": 425.4068108907742, "learning_rate": 2.656333511491181e-07, "logits/generated": -2.126441478729248, "logits/real": -2.2806851863861084, "logps/generated": -589.7638549804688, "logps/real": -284.98175048828125, "loss": 0.3274, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.0182390213012695, "rewards/margins": 29.235637664794922, "rewards/real": 22.21739959716797, "step": 4340 }, { "epoch": 0.523088023088023, "grad_norm": 63.78711095544259, "learning_rate": 2.649652592196686e-07, "logits/generated": -2.158569097518921, "logits/real": -2.268720865249634, "logps/generated": -575.865478515625, "logps/real": -278.2831115722656, "loss": 0.5682, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.7679094076156616, "rewards/margins": 24.413375854492188, "rewards/real": 22.645465850830078, "step": 4350 }, { "epoch": 0.5242905242905243, "grad_norm": 32.803578430575214, "learning_rate": 2.642971672902191e-07, "logits/generated": -2.1932380199432373, "logits/real": -2.247445821762085, "logps/generated": -623.6664428710938, "logps/real": -294.74267578125, "loss": 0.4304, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.904718399047852, "rewards/margins": 29.348217010498047, "rewards/real": 24.443500518798828, "step": 4360 }, { "epoch": 0.5254930254930255, "grad_norm": 13.421504285193922, "learning_rate": 2.636290753607696e-07, "logits/generated": -2.2893805503845215, "logits/real": -2.3213396072387695, "logps/generated": -472.1160583496094, "logps/real": -254.22085571289062, "loss": 0.3534, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.9855934977531433, "rewards/margins": 19.296947479248047, "rewards/real": 20.28253746032715, "step": 4370 }, { "epoch": 0.5266955266955267, "grad_norm": 296.78305518652917, "learning_rate": 2.629609834313201e-07, "logits/generated": -2.2980878353118896, "logits/real": -2.361016273498535, "logps/generated": -516.3341674804688, "logps/real": -303.3874206542969, "loss": 0.5903, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 4.377728462219238, "rewards/margins": 19.445972442626953, "rewards/real": 23.823699951171875, "step": 4380 }, { "epoch": 0.5278980278980279, "grad_norm": 233.02334201767775, "learning_rate": 2.622928915018707e-07, "logits/generated": -2.1487505435943604, "logits/real": -2.2552433013916016, "logps/generated": -536.1646118164062, "logps/real": -255.44229125976562, "loss": 0.4646, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -5.993595123291016, "rewards/margins": 23.187070846557617, "rewards/real": 17.1934757232666, "step": 4390 }, { "epoch": 0.5291005291005291, "grad_norm": 28.309176553575497, "learning_rate": 2.616247995724212e-07, "logits/generated": -2.1555910110473633, "logits/real": -2.258888006210327, "logps/generated": -531.6666259765625, "logps/real": -224.35879516601562, "loss": 0.5644, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.229693412780762, "rewards/margins": 24.458274841308594, "rewards/real": 19.228580474853516, "step": 4400 }, { "epoch": 0.5303030303030303, "grad_norm": 8.738143579495391, "learning_rate": 2.609567076429717e-07, "logits/generated": -2.0264885425567627, "logits/real": -2.2401506900787354, "logps/generated": -559.1680908203125, "logps/real": -238.16748046875, "loss": 0.2895, "rewards/accuracies": 1.0, "rewards/generated": -11.519235610961914, "rewards/margins": 30.78560447692871, "rewards/real": 19.266366958618164, "step": 4410 }, { "epoch": 0.5315055315055315, "grad_norm": 45.80025387344876, "learning_rate": 2.602886157135222e-07, "logits/generated": -2.1714773178100586, "logits/real": -2.288045883178711, "logps/generated": -480.1744079589844, "logps/real": -183.60899353027344, "loss": 0.642, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.770855903625488, "rewards/margins": 20.459308624267578, "rewards/real": 15.688453674316406, "step": 4420 }, { "epoch": 0.5327080327080327, "grad_norm": 315.0925130759843, "learning_rate": 2.5962052378407265e-07, "logits/generated": -2.2238011360168457, "logits/real": -2.3348443508148193, "logps/generated": -574.4429931640625, "logps/real": -252.9526824951172, "loss": 0.5518, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.0077054500579834, "rewards/margins": 22.653966903686523, "rewards/real": 21.646263122558594, "step": 4430 }, { "epoch": 0.5339105339105339, "grad_norm": 3.65988090515442, "learning_rate": 2.5895243185462316e-07, "logits/generated": -2.117257595062256, "logits/real": -2.20717191696167, "logps/generated": -528.3685913085938, "logps/real": -202.49948120117188, "loss": 0.472, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.200154304504395, "rewards/margins": 27.355255126953125, "rewards/real": 17.155101776123047, "step": 4440 }, { "epoch": 0.5351130351130351, "grad_norm": 27.958395736363162, "learning_rate": 2.5828433992517367e-07, "logits/generated": -2.157768726348877, "logits/real": -2.287290096282959, "logps/generated": -614.3516845703125, "logps/real": -249.3728790283203, "loss": 0.507, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.969240188598633, "rewards/margins": 28.98443603515625, "rewards/real": 18.015193939208984, "step": 4450 }, { "epoch": 0.5363155363155363, "grad_norm": 145.6163028300804, "learning_rate": 2.5761624799572417e-07, "logits/generated": -2.1811509132385254, "logits/real": -2.272597074508667, "logps/generated": -468.0287170410156, "logps/real": -245.0470733642578, "loss": 0.8015, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 0.17038825154304504, "rewards/margins": 21.22028160095215, "rewards/real": 21.390670776367188, "step": 4460 }, { "epoch": 0.5375180375180375, "grad_norm": 326.0530242436695, "learning_rate": 2.5694815606627473e-07, "logits/generated": -2.125600576400757, "logits/real": -2.271019697189331, "logps/generated": -618.333251953125, "logps/real": -288.1407165527344, "loss": 0.4697, "rewards/accuracies": 0.875, "rewards/generated": -10.342113494873047, "rewards/margins": 30.96488380432129, "rewards/real": 20.622770309448242, "step": 4470 }, { "epoch": 0.5387205387205387, "grad_norm": 8.59878987701069, "learning_rate": 2.5628006413682524e-07, "logits/generated": -2.0594630241394043, "logits/real": -2.1491947174072266, "logps/generated": -571.4081420898438, "logps/real": -204.54397583007812, "loss": 0.6242, "rewards/accuracies": 0.949999988079071, "rewards/generated": -15.420013427734375, "rewards/margins": 30.255401611328125, "rewards/real": 14.8353910446167, "step": 4480 }, { "epoch": 0.5399230399230399, "grad_norm": 18.588685320751384, "learning_rate": 2.5561197220737575e-07, "logits/generated": -2.122663974761963, "logits/real": -2.1763739585876465, "logps/generated": -630.9954223632812, "logps/real": -239.72048950195312, "loss": 0.7294, "rewards/accuracies": 0.925000011920929, "rewards/generated": -9.249320030212402, "rewards/margins": 27.802181243896484, "rewards/real": 18.552860260009766, "step": 4490 }, { "epoch": 0.5411255411255411, "grad_norm": 7.443956790286945, "learning_rate": 2.5494388027792625e-07, "logits/generated": -2.0731163024902344, "logits/real": -2.243588447570801, "logps/generated": -573.4722900390625, "logps/real": -237.854248046875, "loss": 0.56, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.057721138000488, "rewards/margins": 27.704166412353516, "rewards/real": 18.646442413330078, "step": 4500 }, { "epoch": 0.5423280423280423, "grad_norm": 590.8963263743082, "learning_rate": 2.5427578834847676e-07, "logits/generated": -2.1537926197052, "logits/real": -2.278892993927002, "logps/generated": -554.9658203125, "logps/real": -272.32745361328125, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/generated": -8.246979713439941, "rewards/margins": 30.182857513427734, "rewards/real": 21.935882568359375, "step": 4510 }, { "epoch": 0.5435305435305435, "grad_norm": 9.00170587382552, "learning_rate": 2.5360769641902726e-07, "logits/generated": -2.189779281616211, "logits/real": -2.1937317848205566, "logps/generated": -582.6541748046875, "logps/real": -279.0462646484375, "loss": 0.75, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -4.090191841125488, "rewards/margins": 24.9362850189209, "rewards/real": 20.84609603881836, "step": 4520 }, { "epoch": 0.5447330447330447, "grad_norm": 7.287777276537829, "learning_rate": 2.529396044895777e-07, "logits/generated": -2.156224012374878, "logits/real": -2.2385141849517822, "logps/generated": -508.9384765625, "logps/real": -226.91232299804688, "loss": 0.4841, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.3534417152404785, "rewards/margins": 22.22745704650879, "rewards/real": 17.874013900756836, "step": 4530 }, { "epoch": 0.5459355459355459, "grad_norm": 9.560588297044037, "learning_rate": 2.522715125601282e-07, "logits/generated": -2.1847052574157715, "logits/real": -2.221968412399292, "logps/generated": -565.5045166015625, "logps/real": -307.8379211425781, "loss": 0.3461, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.35024070739746094, "rewards/margins": 23.976543426513672, "rewards/real": 23.626300811767578, "step": 4540 }, { "epoch": 0.5471380471380471, "grad_norm": 855.0784316315297, "learning_rate": 2.516034206306788e-07, "logits/generated": -2.2074856758117676, "logits/real": -2.2566702365875244, "logps/generated": -506.1853942871094, "logps/real": -253.6111297607422, "loss": 0.6346, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 0.37258845567703247, "rewards/margins": 21.079408645629883, "rewards/real": 21.451997756958008, "step": 4550 }, { "epoch": 0.5483405483405484, "grad_norm": 269.6334200133016, "learning_rate": 2.509353287012293e-07, "logits/generated": -2.172344207763672, "logits/real": -2.2206785678863525, "logps/generated": -537.7301025390625, "logps/real": -306.47161865234375, "loss": 0.8162, "rewards/accuracies": 0.875, "rewards/generated": -1.3855597972869873, "rewards/margins": 20.728174209594727, "rewards/real": 19.342613220214844, "step": 4560 }, { "epoch": 0.5495430495430496, "grad_norm": 16.075733740721557, "learning_rate": 2.502672367717798e-07, "logits/generated": -2.1734321117401123, "logits/real": -2.187448740005493, "logps/generated": -473.0311584472656, "logps/real": -198.8009796142578, "loss": 0.66, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.780452728271484, "rewards/margins": 21.286962509155273, "rewards/real": 16.506511688232422, "step": 4570 }, { "epoch": 0.5507455507455508, "grad_norm": 138.58626667433194, "learning_rate": 2.495991448423303e-07, "logits/generated": -2.16571307182312, "logits/real": -2.2767515182495117, "logps/generated": -563.0849609375, "logps/real": -245.07321166992188, "loss": 0.5309, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.1942694187164307, "rewards/margins": 19.43132209777832, "rewards/real": 18.237051010131836, "step": 4580 }, { "epoch": 0.551948051948052, "grad_norm": 648.1946419644537, "learning_rate": 2.489310529128808e-07, "logits/generated": -2.1995575428009033, "logits/real": -2.2965829372406006, "logps/generated": -555.7725830078125, "logps/real": -311.797607421875, "loss": 0.8279, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.440613269805908, "rewards/margins": 24.914764404296875, "rewards/real": 22.474153518676758, "step": 4590 }, { "epoch": 0.5531505531505532, "grad_norm": 647.4498063120639, "learning_rate": 2.482629609834313e-07, "logits/generated": -2.211308002471924, "logits/real": -2.269942045211792, "logps/generated": -506.86395263671875, "logps/real": -261.13433837890625, "loss": 0.4098, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.497762441635132, "rewards/margins": 24.93470001220703, "rewards/real": 22.436935424804688, "step": 4600 }, { "epoch": 0.5543530543530544, "grad_norm": 200.72127308793227, "learning_rate": 2.475948690539818e-07, "logits/generated": -2.075261116027832, "logits/real": -2.1782639026641846, "logps/generated": -457.989990234375, "logps/real": -176.88552856445312, "loss": 0.5318, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.84760856628418, "rewards/margins": 23.295930862426758, "rewards/real": 14.448323249816895, "step": 4610 }, { "epoch": 0.5555555555555556, "grad_norm": 27.45296345983844, "learning_rate": 2.4692677712453233e-07, "logits/generated": -2.098384380340576, "logits/real": -2.219517469406128, "logps/generated": -512.6092529296875, "logps/real": -205.8456268310547, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/generated": -9.539118766784668, "rewards/margins": 27.148761749267578, "rewards/real": 17.609642028808594, "step": 4620 }, { "epoch": 0.5567580567580568, "grad_norm": 104.32724521467392, "learning_rate": 2.4625868519508284e-07, "logits/generated": -2.1575980186462402, "logits/real": -2.1771247386932373, "logps/generated": -608.8951416015625, "logps/real": -234.56838989257812, "loss": 0.7239, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.182953834533691, "rewards/margins": 24.645427703857422, "rewards/real": 18.462474822998047, "step": 4630 }, { "epoch": 0.557960557960558, "grad_norm": 358.8584874258517, "learning_rate": 2.4559059326563334e-07, "logits/generated": -2.117887496948242, "logits/real": -2.1815619468688965, "logps/generated": -480.5862731933594, "logps/real": -256.3266906738281, "loss": 0.4473, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.405491828918457, "rewards/margins": 24.457073211669922, "rewards/real": 18.051584243774414, "step": 4640 }, { "epoch": 0.5591630591630592, "grad_norm": 530.6903562987329, "learning_rate": 2.4492250133618385e-07, "logits/generated": -2.153294563293457, "logits/real": -2.2766127586364746, "logps/generated": -532.4159545898438, "logps/real": -253.4622802734375, "loss": 0.5976, "rewards/accuracies": 0.875, "rewards/generated": -3.4828033447265625, "rewards/margins": 24.550065994262695, "rewards/real": 21.067264556884766, "step": 4650 }, { "epoch": 0.5603655603655604, "grad_norm": 114.18689365535329, "learning_rate": 2.4425440940673435e-07, "logits/generated": -2.113819122314453, "logits/real": -2.2033469676971436, "logps/generated": -414.91937255859375, "logps/real": -219.30322265625, "loss": 0.6139, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.890634536743164, "rewards/margins": 19.421606063842773, "rewards/real": 15.530969619750977, "step": 4660 }, { "epoch": 0.5615680615680616, "grad_norm": 6.1113228792604755, "learning_rate": 2.4358631747728486e-07, "logits/generated": -2.1934733390808105, "logits/real": -2.259469985961914, "logps/generated": -512.7033081054688, "logps/real": -256.7181701660156, "loss": 0.4412, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.2184066772460938, "rewards/margins": 25.44553565979004, "rewards/real": 22.227128982543945, "step": 4670 }, { "epoch": 0.5627705627705628, "grad_norm": 612.5756888336651, "learning_rate": 2.4291822554783537e-07, "logits/generated": -2.2330594062805176, "logits/real": -2.2993900775909424, "logps/generated": -575.5404052734375, "logps/real": -278.91351318359375, "loss": 0.5093, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.784644365310669, "rewards/margins": 23.952312469482422, "rewards/real": 21.167667388916016, "step": 4680 }, { "epoch": 0.563973063973064, "grad_norm": 501.67042350771, "learning_rate": 2.422501336183859e-07, "logits/generated": -2.172921657562256, "logits/real": -2.312257766723633, "logps/generated": -566.7554321289062, "logps/real": -301.3826599121094, "loss": 0.5214, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.1573255062103271, "rewards/margins": 25.16106605529785, "rewards/real": 24.003740310668945, "step": 4690 }, { "epoch": 0.5651755651755652, "grad_norm": 6.8216741272885395, "learning_rate": 2.415820416889364e-07, "logits/generated": -2.1608948707580566, "logits/real": -2.173957109451294, "logps/generated": -521.7059326171875, "logps/real": -191.4599151611328, "loss": 0.5399, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.457831382751465, "rewards/margins": 20.805423736572266, "rewards/real": 15.34759521484375, "step": 4700 }, { "epoch": 0.5663780663780664, "grad_norm": 78.41588467368189, "learning_rate": 2.409139497594869e-07, "logits/generated": -2.216021776199341, "logits/real": -2.2811038494110107, "logps/generated": -452.0791015625, "logps/real": -255.2354278564453, "loss": 0.5296, "rewards/accuracies": 0.949999988079071, "rewards/generated": 2.0734400749206543, "rewards/margins": 20.416227340698242, "rewards/real": 22.489665985107422, "step": 4710 }, { "epoch": 0.5675805675805676, "grad_norm": 346.7871605597502, "learning_rate": 2.402458578300374e-07, "logits/generated": -2.256971597671509, "logits/real": -2.269321918487549, "logps/generated": -584.9602661132812, "logps/real": -244.5535430908203, "loss": 0.6844, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.950667381286621, "rewards/margins": 25.587900161743164, "rewards/real": 20.637231826782227, "step": 4720 }, { "epoch": 0.5687830687830688, "grad_norm": 10.61780206218379, "learning_rate": 2.395777659005879e-07, "logits/generated": -2.1691219806671143, "logits/real": -2.277150869369507, "logps/generated": -576.3107299804688, "logps/real": -228.7597198486328, "loss": 0.3825, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.529725074768066, "rewards/margins": 27.679636001586914, "rewards/real": 15.149909973144531, "step": 4730 }, { "epoch": 0.56998556998557, "grad_norm": 48.30832593367765, "learning_rate": 2.389096739711384e-07, "logits/generated": -2.2049403190612793, "logits/real": -2.2890119552612305, "logps/generated": -489.464599609375, "logps/real": -249.37661743164062, "loss": 0.4363, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.34356689453125, "rewards/margins": 23.882205963134766, "rewards/real": 20.538637161254883, "step": 4740 }, { "epoch": 0.5711880711880711, "grad_norm": 481.38532300937277, "learning_rate": 2.3824158204168894e-07, "logits/generated": -2.117734432220459, "logits/real": -2.254347562789917, "logps/generated": -598.91015625, "logps/real": -262.19464111328125, "loss": 0.594, "rewards/accuracies": 0.925000011920929, "rewards/generated": -10.40277099609375, "rewards/margins": 33.866859436035156, "rewards/real": 23.464086532592773, "step": 4750 }, { "epoch": 0.5723905723905723, "grad_norm": 1172.5219283790636, "learning_rate": 2.3757349011223942e-07, "logits/generated": -2.2469725608825684, "logits/real": -2.3247828483581543, "logps/generated": -598.7657470703125, "logps/real": -303.0007019042969, "loss": 0.6705, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.37302571535110474, "rewards/margins": 24.82962417602539, "rewards/real": 25.20265007019043, "step": 4760 }, { "epoch": 0.5735930735930735, "grad_norm": 118.60208214977199, "learning_rate": 2.3690539818278993e-07, "logits/generated": -2.162161111831665, "logits/real": -2.188624620437622, "logps/generated": -547.6588134765625, "logps/real": -236.26162719726562, "loss": 0.3569, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.1087751388549805, "rewards/margins": 25.359819412231445, "rewards/real": 20.251041412353516, "step": 4770 }, { "epoch": 0.5747955747955747, "grad_norm": 140.3193831233246, "learning_rate": 2.3623730625334046e-07, "logits/generated": -2.1264591217041016, "logits/real": -2.267110824584961, "logps/generated": -461.0486755371094, "logps/real": -199.8528594970703, "loss": 0.6175, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.784832000732422, "rewards/margins": 24.5319766998291, "rewards/real": 16.74714469909668, "step": 4780 }, { "epoch": 0.575998075998076, "grad_norm": 104.62016696163226, "learning_rate": 2.3556921432389097e-07, "logits/generated": -2.056239366531372, "logits/real": -2.2162742614746094, "logps/generated": -481.51568603515625, "logps/real": -254.6533966064453, "loss": 0.7163, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.585820198059082, "rewards/margins": 27.165359497070312, "rewards/real": 20.579540252685547, "step": 4790 }, { "epoch": 0.5772005772005772, "grad_norm": 4.807259602897871, "learning_rate": 2.3490112239444147e-07, "logits/generated": -2.1776959896087646, "logits/real": -2.2318732738494873, "logps/generated": -485.2994079589844, "logps/real": -257.42547607421875, "loss": 0.4402, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.6174880862236023, "rewards/margins": 22.350873947143555, "rewards/real": 22.968364715576172, "step": 4800 }, { "epoch": 0.5784030784030784, "grad_norm": 304.66402094080183, "learning_rate": 2.3423303046499195e-07, "logits/generated": -2.0449118614196777, "logits/real": -2.196877956390381, "logps/generated": -529.0111083984375, "logps/real": -236.92764282226562, "loss": 1.1566, "rewards/accuracies": 1.0, "rewards/generated": -5.8719611167907715, "rewards/margins": 22.51728057861328, "rewards/real": 16.64531898498535, "step": 4810 }, { "epoch": 0.5796055796055796, "grad_norm": 45.365321601083735, "learning_rate": 2.3356493853554248e-07, "logits/generated": -2.1713552474975586, "logits/real": -2.264538526535034, "logps/generated": -631.4778442382812, "logps/real": -281.14837646484375, "loss": 0.4348, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.010066032409668, "rewards/margins": 28.2640438079834, "rewards/real": 24.253976821899414, "step": 4820 }, { "epoch": 0.5808080808080808, "grad_norm": 275.7824791264617, "learning_rate": 2.32896846606093e-07, "logits/generated": -2.1254169940948486, "logits/real": -2.2445826530456543, "logps/generated": -427.7737731933594, "logps/real": -225.2066192626953, "loss": 0.6204, "rewards/accuracies": 0.925000011920929, "rewards/generated": -0.2635501027107239, "rewards/margins": 19.014629364013672, "rewards/real": 18.75107765197754, "step": 4830 }, { "epoch": 0.582010582010582, "grad_norm": 11.846372987741605, "learning_rate": 2.322287546766435e-07, "logits/generated": -2.119321346282959, "logits/real": -2.217468023300171, "logps/generated": -502.81781005859375, "logps/real": -218.6884307861328, "loss": 0.4669, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.799521446228027, "rewards/margins": 23.770450592041016, "rewards/real": 16.970928192138672, "step": 4840 }, { "epoch": 0.5832130832130832, "grad_norm": 61.45898220843556, "learning_rate": 2.31560662747194e-07, "logits/generated": -2.158651828765869, "logits/real": -2.2841765880584717, "logps/generated": -594.6893310546875, "logps/real": -245.26876831054688, "loss": 0.6698, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.5212578773498535, "rewards/margins": 28.7156925201416, "rewards/real": 23.194433212280273, "step": 4850 }, { "epoch": 0.5844155844155844, "grad_norm": 239.29847546686884, "learning_rate": 2.308925708177445e-07, "logits/generated": -2.169623851776123, "logits/real": -2.30018949508667, "logps/generated": -599.7987670898438, "logps/real": -324.08929443359375, "loss": 0.5165, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.353647708892822, "rewards/margins": 32.837196350097656, "rewards/real": 27.48354721069336, "step": 4860 }, { "epoch": 0.5856180856180856, "grad_norm": 30.769959529932, "learning_rate": 2.3022447888829502e-07, "logits/generated": -2.1507740020751953, "logits/real": -2.128061532974243, "logps/generated": -489.041748046875, "logps/real": -224.7561492919922, "loss": 0.4079, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.5154643058776855, "rewards/margins": 23.196117401123047, "rewards/real": 17.680652618408203, "step": 4870 }, { "epoch": 0.5868205868205868, "grad_norm": 4.848906062900028, "learning_rate": 2.2955638695884552e-07, "logits/generated": -2.1520180702209473, "logits/real": -2.237091064453125, "logps/generated": -609.17041015625, "logps/real": -242.2404022216797, "loss": 0.2811, "rewards/accuracies": 1.0, "rewards/generated": -11.879868507385254, "rewards/margins": 31.69455337524414, "rewards/real": 19.81468391418457, "step": 4880 }, { "epoch": 0.588023088023088, "grad_norm": 313.5974176423623, "learning_rate": 2.2888829502939603e-07, "logits/generated": -2.1980972290039062, "logits/real": -2.259403705596924, "logps/generated": -566.7474365234375, "logps/real": -256.3862609863281, "loss": 0.3452, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.331935167312622, "rewards/margins": 24.445987701416016, "rewards/real": 21.114049911499023, "step": 4890 }, { "epoch": 0.5892255892255892, "grad_norm": 572.6774350234213, "learning_rate": 2.2822020309994656e-07, "logits/generated": -2.1327273845672607, "logits/real": -2.1624417304992676, "logps/generated": -560.4371948242188, "logps/real": -244.71969604492188, "loss": 0.7624, "rewards/accuracies": 0.875, "rewards/generated": -6.603463649749756, "rewards/margins": 24.912017822265625, "rewards/real": 18.308555603027344, "step": 4900 }, { "epoch": 0.5904280904280904, "grad_norm": 37.134041407638364, "learning_rate": 2.2755211117049704e-07, "logits/generated": -2.1626598834991455, "logits/real": -2.2314906120300293, "logps/generated": -504.32177734375, "logps/real": -207.5485382080078, "loss": 0.5152, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.761170387268066, "rewards/margins": 25.588455200195312, "rewards/real": 17.827287673950195, "step": 4910 }, { "epoch": 0.5916305916305916, "grad_norm": 410.62480019639355, "learning_rate": 2.2688401924104755e-07, "logits/generated": -2.0983004570007324, "logits/real": -2.194437265396118, "logps/generated": -481.0418395996094, "logps/real": -225.505615234375, "loss": 0.808, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.941951274871826, "rewards/margins": 25.70985984802246, "rewards/real": 20.76790428161621, "step": 4920 }, { "epoch": 0.5928330928330928, "grad_norm": 3.2504897147129155, "learning_rate": 2.2621592731159806e-07, "logits/generated": -2.0753908157348633, "logits/real": -2.1659369468688965, "logps/generated": -538.7593994140625, "logps/real": -209.56631469726562, "loss": 0.6038, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.15286922454834, "rewards/margins": 22.93928337097168, "rewards/real": 17.786415100097656, "step": 4930 }, { "epoch": 0.594035594035594, "grad_norm": 317.2347512715378, "learning_rate": 2.255478353821486e-07, "logits/generated": -2.15364408493042, "logits/real": -2.2721619606018066, "logps/generated": -505.0735778808594, "logps/real": -261.0234375, "loss": 0.8925, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.7675155401229858, "rewards/margins": 21.99693489074707, "rewards/real": 20.229415893554688, "step": 4940 }, { "epoch": 0.5952380952380952, "grad_norm": 5.36587952872531, "learning_rate": 2.248797434526991e-07, "logits/generated": -2.0879244804382324, "logits/real": -2.1579527854919434, "logps/generated": -502.4501037597656, "logps/real": -238.88949584960938, "loss": 0.2876, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.519805908203125, "rewards/margins": 25.457929611206055, "rewards/real": 17.938121795654297, "step": 4950 }, { "epoch": 0.5964405964405964, "grad_norm": 389.43453301017496, "learning_rate": 2.2421165152324957e-07, "logits/generated": -2.1876790523529053, "logits/real": -2.260486125946045, "logps/generated": -598.5149536132812, "logps/real": -302.97760009765625, "loss": 0.6785, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.6763100624084473, "rewards/margins": 27.476821899414062, "rewards/real": 24.80051040649414, "step": 4960 }, { "epoch": 0.5976430976430976, "grad_norm": 249.65916655479336, "learning_rate": 2.2354355959380008e-07, "logits/generated": -2.1480464935302734, "logits/real": -2.2145352363586426, "logps/generated": -614.873046875, "logps/real": -289.75042724609375, "loss": 0.4912, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.20695424079895, "rewards/margins": 25.05868148803711, "rewards/real": 22.851726531982422, "step": 4970 }, { "epoch": 0.5988455988455988, "grad_norm": 204.81001460914447, "learning_rate": 2.2287546766435061e-07, "logits/generated": -2.0724501609802246, "logits/real": -2.2189583778381348, "logps/generated": -591.15283203125, "logps/real": -243.0890655517578, "loss": 0.4145, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.009787559509277, "rewards/margins": 32.64847183227539, "rewards/real": 20.638683319091797, "step": 4980 }, { "epoch": 0.6000481000481, "grad_norm": 4.543041941388648, "learning_rate": 2.2220737573490112e-07, "logits/generated": -2.071737766265869, "logits/real": -2.21366286277771, "logps/generated": -529.865966796875, "logps/real": -186.53591918945312, "loss": 0.2482, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.864470481872559, "rewards/margins": 22.951242446899414, "rewards/real": 15.086771011352539, "step": 4990 }, { "epoch": 0.6012506012506013, "grad_norm": 14.838024605051249, "learning_rate": 2.2153928380545163e-07, "logits/generated": -2.2031359672546387, "logits/real": -2.316646099090576, "logps/generated": -499.90338134765625, "logps/real": -267.6744079589844, "loss": 0.418, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.471962928771973, "rewards/margins": 26.8084659576416, "rewards/real": 22.336502075195312, "step": 5000 }, { "epoch": 0.6012506012506013, "eval_logits/generated": -2.207758903503418, "eval_logits/real": -2.256751537322998, "eval_logps/generated": -509.637451171875, "eval_logps/real": -257.37213134765625, "eval_loss": 0.4741517901420593, "eval_rewards/accuracies": 0.961309552192688, "eval_rewards/generated": -1.9855931997299194, "eval_rewards/margins": 23.375478744506836, "eval_rewards/real": 21.389888763427734, "eval_runtime": 159.0112, "eval_samples_per_second": 6.289, "eval_steps_per_second": 0.528, "step": 5000 }, { "epoch": 0.6024531024531025, "grad_norm": 112.36891804205959, "learning_rate": 2.208711918760021e-07, "logits/generated": -2.1393589973449707, "logits/real": -2.206692934036255, "logps/generated": -498.6484375, "logps/real": -234.1495819091797, "loss": 0.6687, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.348142623901367, "rewards/margins": 23.36178207397461, "rewards/real": 20.01363754272461, "step": 5010 }, { "epoch": 0.6036556036556037, "grad_norm": 159.44117873507713, "learning_rate": 2.2020309994655264e-07, "logits/generated": -2.194477081298828, "logits/real": -2.1287307739257812, "logps/generated": -581.6893310546875, "logps/real": -229.9732208251953, "loss": 0.567, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -3.6853485107421875, "rewards/margins": 23.484485626220703, "rewards/real": 19.79913902282715, "step": 5020 }, { "epoch": 0.6048581048581049, "grad_norm": 393.8714196973956, "learning_rate": 2.1953500801710315e-07, "logits/generated": -1.9988247156143188, "logits/real": -2.2962424755096436, "logps/generated": -581.6021118164062, "logps/real": -289.51068115234375, "loss": 0.541, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.950952053070068, "rewards/margins": 30.75851821899414, "rewards/real": 23.807567596435547, "step": 5030 }, { "epoch": 0.6060606060606061, "grad_norm": 452.055430809275, "learning_rate": 2.1886691608765365e-07, "logits/generated": -2.1991615295410156, "logits/real": -2.1468417644500732, "logps/generated": -572.5194091796875, "logps/real": -186.25791931152344, "loss": 0.316, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.603403091430664, "rewards/margins": 25.295948028564453, "rewards/real": 16.69254493713379, "step": 5040 }, { "epoch": 0.6072631072631073, "grad_norm": 4.622813566085611, "learning_rate": 2.1819882415820416e-07, "logits/generated": -2.07625150680542, "logits/real": -2.1684768199920654, "logps/generated": -437.0848083496094, "logps/real": -192.69497680664062, "loss": 0.2383, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.420557022094727, "rewards/margins": 23.884294509887695, "rewards/real": 15.463737487792969, "step": 5050 }, { "epoch": 0.6084656084656085, "grad_norm": 696.8993519314927, "learning_rate": 2.1753073222875467e-07, "logits/generated": -2.1094517707824707, "logits/real": -2.2138452529907227, "logps/generated": -641.497314453125, "logps/real": -246.94497680664062, "loss": 0.7344, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -7.214853763580322, "rewards/margins": 28.655588150024414, "rewards/real": 21.440731048583984, "step": 5060 }, { "epoch": 0.6096681096681097, "grad_norm": 26.17801486404397, "learning_rate": 2.1686264029930517e-07, "logits/generated": -2.01373291015625, "logits/real": -2.134265422821045, "logps/generated": -577.6567993164062, "logps/real": -246.38558959960938, "loss": 0.4124, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.266793251037598, "rewards/margins": 29.215808868408203, "rewards/real": 18.949010848999023, "step": 5070 }, { "epoch": 0.6108706108706109, "grad_norm": 20.446915382607806, "learning_rate": 2.1619454836985568e-07, "logits/generated": -2.1533255577087402, "logits/real": -2.224644184112549, "logps/generated": -678.8082885742188, "logps/real": -314.3406677246094, "loss": 0.4912, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.711610317230225, "rewards/margins": 28.086917877197266, "rewards/real": 22.375307083129883, "step": 5080 }, { "epoch": 0.6120731120731121, "grad_norm": 13.903932402505381, "learning_rate": 2.1552645644040619e-07, "logits/generated": -2.196721315383911, "logits/real": -2.222590923309326, "logps/generated": -572.3262939453125, "logps/real": -248.94873046875, "loss": 0.507, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.296078205108643, "rewards/margins": 28.022445678710938, "rewards/real": 21.726369857788086, "step": 5090 }, { "epoch": 0.6132756132756133, "grad_norm": 10.818548609026534, "learning_rate": 2.1485836451095672e-07, "logits/generated": -2.0338661670684814, "logits/real": -2.21252179145813, "logps/generated": -399.80047607421875, "logps/real": -232.8976287841797, "loss": 0.6458, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.923448085784912, "rewards/margins": 21.71235466003418, "rewards/real": 18.78890609741211, "step": 5100 }, { "epoch": 0.6144781144781145, "grad_norm": 45.33669880957139, "learning_rate": 2.141902725815072e-07, "logits/generated": -1.9484567642211914, "logits/real": -2.158186435699463, "logps/generated": -486.42034912109375, "logps/real": -195.52969360351562, "loss": 0.5066, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.924624443054199, "rewards/margins": 25.316829681396484, "rewards/real": 17.3922061920166, "step": 5110 }, { "epoch": 0.6156806156806157, "grad_norm": 13.4415581964778, "learning_rate": 2.135221806520577e-07, "logits/generated": -2.100008010864258, "logits/real": -2.19741153717041, "logps/generated": -582.730224609375, "logps/real": -227.67434692382812, "loss": 0.3412, "rewards/accuracies": 1.0, "rewards/generated": -9.561922073364258, "rewards/margins": 27.262313842773438, "rewards/real": 17.70039176940918, "step": 5120 }, { "epoch": 0.6168831168831169, "grad_norm": 94.93494353194716, "learning_rate": 2.128540887226082e-07, "logits/generated": -2.14306902885437, "logits/real": -2.1899149417877197, "logps/generated": -427.8895568847656, "logps/real": -209.77548217773438, "loss": 0.3483, "rewards/accuracies": 1.0, "rewards/generated": -2.1006529331207275, "rewards/margins": 20.818208694458008, "rewards/real": 18.71755599975586, "step": 5130 }, { "epoch": 0.6180856180856181, "grad_norm": 206.2417095504339, "learning_rate": 2.1218599679315874e-07, "logits/generated": -2.167590379714966, "logits/real": -2.1929023265838623, "logps/generated": -542.5330810546875, "logps/real": -215.34066772460938, "loss": 0.5407, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.4143515825271606, "rewards/margins": 19.591306686401367, "rewards/real": 18.17695426940918, "step": 5140 }, { "epoch": 0.6192881192881193, "grad_norm": 10.163706195238307, "learning_rate": 2.1151790486370925e-07, "logits/generated": -2.1078526973724365, "logits/real": -2.1608290672302246, "logps/generated": -559.0823364257812, "logps/real": -183.32273864746094, "loss": 0.4086, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.760201454162598, "rewards/margins": 26.649608612060547, "rewards/real": 15.889409065246582, "step": 5150 }, { "epoch": 0.6204906204906205, "grad_norm": 25.25433949615869, "learning_rate": 2.1084981293425973e-07, "logits/generated": -2.070712089538574, "logits/real": -2.1418638229370117, "logps/generated": -536.2274169921875, "logps/real": -262.9801940917969, "loss": 0.4258, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.6793304681777954, "rewards/margins": 22.470706939697266, "rewards/real": 20.79137420654297, "step": 5160 }, { "epoch": 0.6216931216931217, "grad_norm": 4.333580297827752, "learning_rate": 2.1018172100481024e-07, "logits/generated": -2.023705244064331, "logits/real": -2.0775601863861084, "logps/generated": -642.1519775390625, "logps/real": -283.9187316894531, "loss": 0.5697, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -6.47070837020874, "rewards/margins": 29.55684471130371, "rewards/real": 23.086135864257812, "step": 5170 }, { "epoch": 0.622895622895623, "grad_norm": 25.221501546308797, "learning_rate": 2.0951362907536077e-07, "logits/generated": -2.1672415733337402, "logits/real": -2.0736351013183594, "logps/generated": -578.4718017578125, "logps/real": -255.50955200195312, "loss": 0.4278, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.31916409730911255, "rewards/margins": 20.810937881469727, "rewards/real": 20.49177360534668, "step": 5180 }, { "epoch": 0.6240981240981242, "grad_norm": 8.732133139172033, "learning_rate": 2.0884553714591128e-07, "logits/generated": -2.047583818435669, "logits/real": -2.1747946739196777, "logps/generated": -596.5800170898438, "logps/real": -201.64271545410156, "loss": 0.5734, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.516444206237793, "rewards/margins": 28.915258407592773, "rewards/real": 18.398815155029297, "step": 5190 }, { "epoch": 0.6253006253006252, "grad_norm": 370.3254087257309, "learning_rate": 2.0817744521646178e-07, "logits/generated": -2.069950580596924, "logits/real": -2.2123332023620605, "logps/generated": -665.2952880859375, "logps/real": -226.219482421875, "loss": 0.5487, "rewards/accuracies": 0.949999988079071, "rewards/generated": -14.587949752807617, "rewards/margins": 34.09016799926758, "rewards/real": 19.50221824645996, "step": 5200 }, { "epoch": 0.6265031265031265, "grad_norm": 5.007378874706131, "learning_rate": 2.0750935328701226e-07, "logits/generated": -2.0197086334228516, "logits/real": -2.12208890914917, "logps/generated": -512.1553955078125, "logps/real": -155.11798095703125, "loss": 0.3861, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.422812461853027, "rewards/margins": 25.80704689025879, "rewards/real": 13.384236335754395, "step": 5210 }, { "epoch": 0.6277056277056277, "grad_norm": 135.436747139612, "learning_rate": 2.068412613575628e-07, "logits/generated": -2.103518009185791, "logits/real": -2.1637232303619385, "logps/generated": -487.9015197753906, "logps/real": -233.65478515625, "loss": 0.7396, "rewards/accuracies": 0.925000011920929, "rewards/generated": -2.6520657539367676, "rewards/margins": 24.12938117980957, "rewards/real": 21.477313995361328, "step": 5220 }, { "epoch": 0.6289081289081289, "grad_norm": 26.49116652112335, "learning_rate": 2.061731694281133e-07, "logits/generated": -2.0646491050720215, "logits/real": -2.1591761112213135, "logps/generated": -614.2501220703125, "logps/real": -311.14678955078125, "loss": 0.5308, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -5.64684534072876, "rewards/margins": 30.806594848632812, "rewards/real": 25.15974998474121, "step": 5230 }, { "epoch": 0.6301106301106301, "grad_norm": 36.768821984186204, "learning_rate": 2.055050774986638e-07, "logits/generated": -2.1185431480407715, "logits/real": -2.1948776245117188, "logps/generated": -635.8701171875, "logps/real": -278.8142395019531, "loss": 0.3912, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.881353378295898, "rewards/margins": 28.80423927307129, "rewards/real": 19.92288589477539, "step": 5240 }, { "epoch": 0.6313131313131313, "grad_norm": 447.741760926687, "learning_rate": 2.0483698556921431e-07, "logits/generated": -2.1760456562042236, "logits/real": -2.197248935699463, "logps/generated": -524.3167114257812, "logps/real": -197.69581604003906, "loss": 0.7793, "rewards/accuracies": 0.875, "rewards/generated": -1.4320223331451416, "rewards/margins": 19.761924743652344, "rewards/real": 18.32990264892578, "step": 5250 }, { "epoch": 0.6325156325156325, "grad_norm": 150.43983607994468, "learning_rate": 2.0416889363976482e-07, "logits/generated": -2.1038804054260254, "logits/real": -2.1359784603118896, "logps/generated": -433.695556640625, "logps/real": -245.3255615234375, "loss": 0.7071, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.2940962314605713, "rewards/margins": 18.276500701904297, "rewards/real": 19.570598602294922, "step": 5260 }, { "epoch": 0.6337181337181337, "grad_norm": 13.51250696223638, "learning_rate": 2.0350080171031533e-07, "logits/generated": -2.062173843383789, "logits/real": -2.139639377593994, "logps/generated": -596.686279296875, "logps/real": -313.1416931152344, "loss": 0.3623, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.309140682220459, "rewards/margins": 27.867406845092773, "rewards/real": 22.558265686035156, "step": 5270 }, { "epoch": 0.6349206349206349, "grad_norm": 13.454090853723232, "learning_rate": 2.0283270978086583e-07, "logits/generated": -2.088893413543701, "logits/real": -2.1333022117614746, "logps/generated": -550.6292724609375, "logps/real": -226.11404418945312, "loss": 0.5424, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.3938703536987305, "rewards/margins": 26.277690887451172, "rewards/real": 18.883819580078125, "step": 5280 }, { "epoch": 0.6361231361231361, "grad_norm": 162.15397128407542, "learning_rate": 2.0216461785141634e-07, "logits/generated": -2.1805367469787598, "logits/real": -2.2109501361846924, "logps/generated": -442.6311950683594, "logps/real": -217.99612426757812, "loss": 0.3433, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.4469165802001953, "rewards/margins": 21.494651794433594, "rewards/real": 18.047733306884766, "step": 5290 }, { "epoch": 0.6373256373256373, "grad_norm": 43.28127747297377, "learning_rate": 2.0149652592196687e-07, "logits/generated": -2.2064998149871826, "logits/real": -2.196180820465088, "logps/generated": -554.3094482421875, "logps/real": -271.9448547363281, "loss": 0.4625, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.1922428607940674, "rewards/margins": 26.602619171142578, "rewards/real": 23.41037940979004, "step": 5300 }, { "epoch": 0.6385281385281385, "grad_norm": 355.1036412273798, "learning_rate": 2.0082843399251735e-07, "logits/generated": -2.1402595043182373, "logits/real": -2.250767946243286, "logps/generated": -532.1314697265625, "logps/real": -248.40756225585938, "loss": 0.4621, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.158871650695801, "rewards/margins": 24.802583694458008, "rewards/real": 19.64371109008789, "step": 5310 }, { "epoch": 0.6397306397306397, "grad_norm": 43.81793349573058, "learning_rate": 2.0016034206306786e-07, "logits/generated": -2.138998031616211, "logits/real": -2.1637184619903564, "logps/generated": -660.0232543945312, "logps/real": -312.3558349609375, "loss": 0.5127, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.3712105751037598, "rewards/margins": 29.01308822631836, "rewards/real": 25.64188003540039, "step": 5320 }, { "epoch": 0.6409331409331409, "grad_norm": 28.48245777540848, "learning_rate": 1.9949225013361837e-07, "logits/generated": -2.059576988220215, "logits/real": -2.113861560821533, "logps/generated": -755.4419555664062, "logps/real": -264.72528076171875, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/generated": -18.074052810668945, "rewards/margins": 38.974754333496094, "rewards/real": 20.900699615478516, "step": 5330 }, { "epoch": 0.6421356421356421, "grad_norm": 8.245086448679869, "learning_rate": 1.988241582041689e-07, "logits/generated": -2.0729799270629883, "logits/real": -2.182405471801758, "logps/generated": -580.021484375, "logps/real": -287.5953063964844, "loss": 0.4583, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.655449867248535, "rewards/margins": 27.83284568786621, "rewards/real": 22.17739486694336, "step": 5340 }, { "epoch": 0.6433381433381433, "grad_norm": 1149.7886967956752, "learning_rate": 1.981560662747194e-07, "logits/generated": -2.096813201904297, "logits/real": -2.152869939804077, "logps/generated": -550.7825317382812, "logps/real": -268.78656005859375, "loss": 0.5668, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.9959911108016968, "rewards/margins": 22.33914566040039, "rewards/real": 20.343156814575195, "step": 5350 }, { "epoch": 0.6445406445406445, "grad_norm": 195.37705731684127, "learning_rate": 1.9748797434526989e-07, "logits/generated": -2.0178475379943848, "logits/real": -2.0801384449005127, "logps/generated": -467.127197265625, "logps/real": -196.71151733398438, "loss": 0.4073, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -6.072965621948242, "rewards/margins": 22.643278121948242, "rewards/real": 16.570314407348633, "step": 5360 }, { "epoch": 0.6457431457431457, "grad_norm": 653.2545289632761, "learning_rate": 1.968198824158204e-07, "logits/generated": -2.115100383758545, "logits/real": -2.1691863536834717, "logps/generated": -491.2147521972656, "logps/real": -195.5085906982422, "loss": 0.6017, "rewards/accuracies": 1.0, "rewards/generated": -4.717627048492432, "rewards/margins": 22.232973098754883, "rewards/real": 17.515344619750977, "step": 5370 }, { "epoch": 0.6469456469456469, "grad_norm": 4.235068886873702, "learning_rate": 1.9615179048637093e-07, "logits/generated": -2.1100916862487793, "logits/real": -2.1362085342407227, "logps/generated": -632.2305908203125, "logps/real": -290.26898193359375, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/generated": -7.390206336975098, "rewards/margins": 30.07211685180664, "rewards/real": 22.68191146850586, "step": 5380 }, { "epoch": 0.6481481481481481, "grad_norm": 758.1775530985985, "learning_rate": 1.9548369855692143e-07, "logits/generated": -2.063777446746826, "logits/real": -2.129456043243408, "logps/generated": -422.287109375, "logps/real": -223.12283325195312, "loss": 0.5461, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.549731254577637, "rewards/margins": 21.80051612854004, "rewards/real": 17.25078582763672, "step": 5390 }, { "epoch": 0.6493506493506493, "grad_norm": 4.045416311092152, "learning_rate": 1.9481560662747194e-07, "logits/generated": -2.0610225200653076, "logits/real": -2.075570821762085, "logps/generated": -612.0631103515625, "logps/real": -253.8446044921875, "loss": 0.9237, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.8615546226501465, "rewards/margins": 24.515655517578125, "rewards/real": 16.654102325439453, "step": 5400 }, { "epoch": 0.6505531505531505, "grad_norm": 296.0250028944842, "learning_rate": 1.9414751469802242e-07, "logits/generated": -1.991396188735962, "logits/real": -2.116002321243286, "logps/generated": -589.1071166992188, "logps/real": -270.3807067871094, "loss": 0.2863, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.402120590209961, "rewards/margins": 30.735759735107422, "rewards/real": 22.33364486694336, "step": 5410 }, { "epoch": 0.6517556517556518, "grad_norm": 501.9845126673603, "learning_rate": 1.9347942276857295e-07, "logits/generated": -2.101869821548462, "logits/real": -2.1424288749694824, "logps/generated": -531.9818115234375, "logps/real": -191.2215118408203, "loss": 0.5794, "rewards/accuracies": 1.0, "rewards/generated": -5.969061851501465, "rewards/margins": 24.978412628173828, "rewards/real": 19.00935173034668, "step": 5420 }, { "epoch": 0.652958152958153, "grad_norm": 15.897220212742806, "learning_rate": 1.9281133083912346e-07, "logits/generated": -2.0708134174346924, "logits/real": -2.177319049835205, "logps/generated": -529.0983276367188, "logps/real": -257.287109375, "loss": 0.4172, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.855011940002441, "rewards/margins": 27.917526245117188, "rewards/real": 22.062515258789062, "step": 5430 }, { "epoch": 0.6541606541606542, "grad_norm": 167.7270009301335, "learning_rate": 1.9214323890967396e-07, "logits/generated": -2.202213764190674, "logits/real": -2.2001771926879883, "logps/generated": -585.6710815429688, "logps/real": -242.23291015625, "loss": 0.5382, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.8997087478637695, "rewards/margins": 27.777307510375977, "rewards/real": 21.87760353088379, "step": 5440 }, { "epoch": 0.6553631553631554, "grad_norm": 296.6462020101403, "learning_rate": 1.9147514698022447e-07, "logits/generated": -2.138925075531006, "logits/real": -2.197624921798706, "logps/generated": -473.5198669433594, "logps/real": -224.0499267578125, "loss": 0.6335, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.7679665088653564, "rewards/margins": 21.625890731811523, "rewards/real": 17.857921600341797, "step": 5450 }, { "epoch": 0.6565656565656566, "grad_norm": 4.355353127300036, "learning_rate": 1.9080705505077498e-07, "logits/generated": -2.145355701446533, "logits/real": -2.248900890350342, "logps/generated": -412.97637939453125, "logps/real": -194.98733520507812, "loss": 0.3381, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.5382111072540283, "rewards/margins": 20.178089141845703, "rewards/real": 16.639875411987305, "step": 5460 }, { "epoch": 0.6577681577681578, "grad_norm": 4.318768270434199, "learning_rate": 1.9013896312132548e-07, "logits/generated": -2.0456652641296387, "logits/real": -2.139618396759033, "logps/generated": -522.5445556640625, "logps/real": -227.39199829101562, "loss": 0.444, "rewards/accuracies": 1.0, "rewards/generated": -5.697910785675049, "rewards/margins": 26.31026840209961, "rewards/real": 20.61235809326172, "step": 5470 }, { "epoch": 0.658970658970659, "grad_norm": 6.296367735745215, "learning_rate": 1.89470871191876e-07, "logits/generated": -2.0240321159362793, "logits/real": -2.1139636039733887, "logps/generated": -528.0909423828125, "logps/real": -231.0218963623047, "loss": 0.4341, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.761052131652832, "rewards/margins": 27.318090438842773, "rewards/real": 19.557037353515625, "step": 5480 }, { "epoch": 0.6601731601731602, "grad_norm": 373.84841874812463, "learning_rate": 1.888027792624265e-07, "logits/generated": -2.052645444869995, "logits/real": -2.149437427520752, "logps/generated": -547.0812377929688, "logps/real": -232.49789428710938, "loss": 0.3688, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.1844024658203125, "rewards/margins": 25.815088272094727, "rewards/real": 18.630680084228516, "step": 5490 }, { "epoch": 0.6613756613756614, "grad_norm": 550.8932954693026, "learning_rate": 1.8813468733297703e-07, "logits/generated": -2.090024709701538, "logits/real": -2.1519649028778076, "logps/generated": -482.130615234375, "logps/real": -256.81280517578125, "loss": 0.5207, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -0.7443699836730957, "rewards/margins": 20.62835693359375, "rewards/real": 19.883983612060547, "step": 5500 }, { "epoch": 0.6625781625781626, "grad_norm": 10.982656623342988, "learning_rate": 1.874665954035275e-07, "logits/generated": -2.1782007217407227, "logits/real": -2.216689109802246, "logps/generated": -604.4592895507812, "logps/real": -289.37969970703125, "loss": 0.4007, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.4544551372528076, "rewards/margins": 26.7467041015625, "rewards/real": 25.292251586914062, "step": 5510 }, { "epoch": 0.6637806637806638, "grad_norm": 632.2383449502196, "learning_rate": 1.8679850347407802e-07, "logits/generated": -2.1914610862731934, "logits/real": -2.211108684539795, "logps/generated": -595.5673217773438, "logps/real": -277.1487731933594, "loss": 0.6555, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.866485595703125, "rewards/margins": 23.17861557006836, "rewards/real": 24.045101165771484, "step": 5520 }, { "epoch": 0.664983164983165, "grad_norm": 22.145461961198414, "learning_rate": 1.8613041154462852e-07, "logits/generated": -1.9987192153930664, "logits/real": -2.1111345291137695, "logps/generated": -458.30291748046875, "logps/real": -181.0555419921875, "loss": 0.4282, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.222566604614258, "rewards/margins": 23.844867706298828, "rewards/real": 14.622297286987305, "step": 5530 }, { "epoch": 0.6661856661856662, "grad_norm": 31.09422927151055, "learning_rate": 1.8546231961517905e-07, "logits/generated": -2.0543627738952637, "logits/real": -2.140010356903076, "logps/generated": -481.1874084472656, "logps/real": -235.50100708007812, "loss": 0.3559, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.622293472290039, "rewards/margins": 23.292308807373047, "rewards/real": 18.670015335083008, "step": 5540 }, { "epoch": 0.6673881673881674, "grad_norm": 182.68321796237728, "learning_rate": 1.8479422768572956e-07, "logits/generated": -2.1397509574890137, "logits/real": -2.166752815246582, "logps/generated": -607.8590698242188, "logps/real": -323.15216064453125, "loss": 0.8327, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.7382205128669739, "rewards/margins": 29.84052085876465, "rewards/real": 29.1023006439209, "step": 5550 }, { "epoch": 0.6685906685906686, "grad_norm": 624.3960294411138, "learning_rate": 1.8412613575628004e-07, "logits/generated": -1.9738527536392212, "logits/real": -2.0829389095306396, "logps/generated": -466.9283752441406, "logps/real": -216.74368286132812, "loss": 0.413, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.393265724182129, "rewards/margins": 23.90040397644043, "rewards/real": 17.507137298583984, "step": 5560 }, { "epoch": 0.6697931697931698, "grad_norm": 200.27427354551966, "learning_rate": 1.8345804382683055e-07, "logits/generated": -2.095628499984741, "logits/real": -2.0717434883117676, "logps/generated": -503.68194580078125, "logps/real": -270.31744384765625, "loss": 0.494, "rewards/accuracies": 0.800000011920929, "rewards/generated": 2.8924455642700195, "rewards/margins": 18.55295753479004, "rewards/real": 21.445402145385742, "step": 5570 }, { "epoch": 0.670995670995671, "grad_norm": 147.98607420805476, "learning_rate": 1.8278995189738108e-07, "logits/generated": -2.0196242332458496, "logits/real": -2.0777719020843506, "logps/generated": -483.0901794433594, "logps/real": -207.4373321533203, "loss": 0.3298, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.44189739227295, "rewards/margins": 24.731090545654297, "rewards/real": 16.289196014404297, "step": 5580 }, { "epoch": 0.6721981721981722, "grad_norm": 8.402665709870938, "learning_rate": 1.821218599679316e-07, "logits/generated": -2.0823845863342285, "logits/real": -2.113119602203369, "logps/generated": -497.16619873046875, "logps/real": -248.0476837158203, "loss": 0.5447, "rewards/accuracies": 1.0, "rewards/generated": -3.768075466156006, "rewards/margins": 23.414453506469727, "rewards/real": 19.646379470825195, "step": 5590 }, { "epoch": 0.6734006734006734, "grad_norm": 188.20614427776823, "learning_rate": 1.814537680384821e-07, "logits/generated": -2.0918502807617188, "logits/real": -2.226760149002075, "logps/generated": -531.3416137695312, "logps/real": -247.00234985351562, "loss": 0.3956, "rewards/accuracies": 1.0, "rewards/generated": -6.44674825668335, "rewards/margins": 27.317821502685547, "rewards/real": 20.871074676513672, "step": 5600 }, { "epoch": 0.6746031746031746, "grad_norm": 377.66966547315417, "learning_rate": 1.8078567610903257e-07, "logits/generated": -2.034510850906372, "logits/real": -2.0546746253967285, "logps/generated": -477.59014892578125, "logps/real": -204.1554718017578, "loss": 0.4414, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.174881935119629, "rewards/margins": 20.559953689575195, "rewards/real": 16.385068893432617, "step": 5610 }, { "epoch": 0.6758056758056759, "grad_norm": 17.852773970762474, "learning_rate": 1.801175841795831e-07, "logits/generated": -1.9572585821151733, "logits/real": -2.0911784172058105, "logps/generated": -576.0623779296875, "logps/real": -236.39163208007812, "loss": 0.3946, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.064367294311523, "rewards/margins": 31.457971572875977, "rewards/real": 20.393604278564453, "step": 5620 }, { "epoch": 0.6770081770081771, "grad_norm": 12.508713084195344, "learning_rate": 1.794494922501336e-07, "logits/generated": -2.0246353149414062, "logits/real": -2.0877811908721924, "logps/generated": -526.1453857421875, "logps/real": -233.11831665039062, "loss": 0.6675, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.685117244720459, "rewards/margins": 26.600414276123047, "rewards/real": 19.91529655456543, "step": 5630 }, { "epoch": 0.6782106782106783, "grad_norm": 3.189240337896366, "learning_rate": 1.7878140032068412e-07, "logits/generated": -2.022733211517334, "logits/real": -2.0900216102600098, "logps/generated": -552.0997924804688, "logps/real": -228.74978637695312, "loss": 0.323, "rewards/accuracies": 1.0, "rewards/generated": -3.4965484142303467, "rewards/margins": 24.248926162719727, "rewards/real": 20.752376556396484, "step": 5640 }, { "epoch": 0.6794131794131794, "grad_norm": 622.0812126260017, "learning_rate": 1.7811330839123463e-07, "logits/generated": -2.0119502544403076, "logits/real": -2.050553798675537, "logps/generated": -435.39068603515625, "logps/real": -227.7612762451172, "loss": 0.551, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -5.234732151031494, "rewards/margins": 22.789058685302734, "rewards/real": 17.5543270111084, "step": 5650 }, { "epoch": 0.6806156806156806, "grad_norm": 5.240742719763742, "learning_rate": 1.7744521646178516e-07, "logits/generated": -2.0209696292877197, "logits/real": -2.0807480812072754, "logps/generated": -569.5696411132812, "logps/real": -262.31939697265625, "loss": 0.4188, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.3068060874938965, "rewards/margins": 27.74716567993164, "rewards/real": 24.44036102294922, "step": 5660 }, { "epoch": 0.6818181818181818, "grad_norm": 21.301646869618097, "learning_rate": 1.7677712453233564e-07, "logits/generated": -2.172571897506714, "logits/real": -2.1529417037963867, "logps/generated": -620.6819458007812, "logps/real": -294.45037841796875, "loss": 0.3965, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.7491583824157715, "rewards/margins": 26.50079917907715, "rewards/real": 25.75164222717285, "step": 5670 }, { "epoch": 0.683020683020683, "grad_norm": 271.4522301801874, "learning_rate": 1.7610903260288615e-07, "logits/generated": -2.031538724899292, "logits/real": -2.1985344886779785, "logps/generated": -451.4507751464844, "logps/real": -258.37646484375, "loss": 0.5469, "rewards/accuracies": 0.925000011920929, "rewards/generated": -1.760457992553711, "rewards/margins": 22.76521873474121, "rewards/real": 21.004764556884766, "step": 5680 }, { "epoch": 0.6842231842231842, "grad_norm": 99.75632907600144, "learning_rate": 1.7544094067343665e-07, "logits/generated": -2.0461623668670654, "logits/real": -2.0558812618255615, "logps/generated": -610.9781494140625, "logps/real": -274.8094787597656, "loss": 0.7229, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -5.510365962982178, "rewards/margins": 26.819034576416016, "rewards/real": 21.308670043945312, "step": 5690 }, { "epoch": 0.6854256854256854, "grad_norm": 50.96943824689933, "learning_rate": 1.7477284874398718e-07, "logits/generated": -2.001626491546631, "logits/real": -2.144613742828369, "logps/generated": -557.7528686523438, "logps/real": -265.2804260253906, "loss": 0.5109, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.805296897888184, "rewards/margins": 26.44843101501465, "rewards/real": 20.643136978149414, "step": 5700 }, { "epoch": 0.6866281866281866, "grad_norm": 5.990396117820854, "learning_rate": 1.741047568145377e-07, "logits/generated": -2.0240745544433594, "logits/real": -2.1288633346557617, "logps/generated": -508.2197265625, "logps/real": -285.49688720703125, "loss": 0.7271, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 0.6658209562301636, "rewards/margins": 24.800508499145508, "rewards/real": 25.46632957458496, "step": 5710 }, { "epoch": 0.6878306878306878, "grad_norm": 3.140266926337619, "learning_rate": 1.7343666488508817e-07, "logits/generated": -2.002182722091675, "logits/real": -2.0924344062805176, "logps/generated": -647.659912109375, "logps/real": -244.3607940673828, "loss": 0.5404, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.544800758361816, "rewards/margins": 30.63179588317871, "rewards/real": 23.08699607849121, "step": 5720 }, { "epoch": 0.689033189033189, "grad_norm": 567.7624407796745, "learning_rate": 1.7276857295563868e-07, "logits/generated": -1.9848625659942627, "logits/real": -2.1359121799468994, "logps/generated": -606.2243041992188, "logps/real": -305.01019287109375, "loss": 0.6091, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.167637348175049, "rewards/margins": 31.583202362060547, "rewards/real": 25.415569305419922, "step": 5730 }, { "epoch": 0.6902356902356902, "grad_norm": 322.53393331117076, "learning_rate": 1.721004810261892e-07, "logits/generated": -2.008922576904297, "logits/real": -2.085409641265869, "logps/generated": -479.20367431640625, "logps/real": -213.8471221923828, "loss": 0.5368, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.688708782196045, "rewards/margins": 21.00283432006836, "rewards/real": 16.314123153686523, "step": 5740 }, { "epoch": 0.6914381914381914, "grad_norm": 9.76122340799412, "learning_rate": 1.7143238909673972e-07, "logits/generated": -2.0706915855407715, "logits/real": -2.057990312576294, "logps/generated": -579.1304321289062, "logps/real": -279.77874755859375, "loss": 0.6366, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.460413932800293, "rewards/margins": 25.602603912353516, "rewards/real": 19.142189025878906, "step": 5750 }, { "epoch": 0.6926406926406926, "grad_norm": 18.230797953612804, "learning_rate": 1.7076429716729022e-07, "logits/generated": -1.9432156085968018, "logits/real": -2.0407614707946777, "logps/generated": -468.81231689453125, "logps/real": -285.3664245605469, "loss": 0.452, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.701001167297363, "rewards/margins": 25.964786529541016, "rewards/real": 21.263784408569336, "step": 5760 }, { "epoch": 0.6938431938431938, "grad_norm": 2.952253383717668, "learning_rate": 1.700962052378407e-07, "logits/generated": -1.9354089498519897, "logits/real": -2.0565319061279297, "logps/generated": -650.11083984375, "logps/real": -210.1114044189453, "loss": 0.5012, "rewards/accuracies": 0.949999988079071, "rewards/generated": -14.890890121459961, "rewards/margins": 33.58374786376953, "rewards/real": 18.692859649658203, "step": 5770 }, { "epoch": 0.695045695045695, "grad_norm": 108.8064878680952, "learning_rate": 1.6942811330839124e-07, "logits/generated": -1.940023422241211, "logits/real": -2.0163187980651855, "logps/generated": -510.94744873046875, "logps/real": -232.2707977294922, "loss": 0.5536, "rewards/accuracies": 0.875, "rewards/generated": -3.8307113647460938, "rewards/margins": 23.60468101501465, "rewards/real": 19.773967742919922, "step": 5780 }, { "epoch": 0.6962481962481962, "grad_norm": 164.3427417398238, "learning_rate": 1.6876002137894174e-07, "logits/generated": -2.051631212234497, "logits/real": -2.044851779937744, "logps/generated": -502.2870178222656, "logps/real": -201.31678771972656, "loss": 0.3162, "rewards/accuracies": 1.0, "rewards/generated": -7.455059051513672, "rewards/margins": 25.29348373413086, "rewards/real": 17.838424682617188, "step": 5790 }, { "epoch": 0.6974506974506974, "grad_norm": 29.78272376689156, "learning_rate": 1.6809192944949225e-07, "logits/generated": -2.0994935035705566, "logits/real": -2.1238558292388916, "logps/generated": -623.0423583984375, "logps/real": -267.8221435546875, "loss": 0.6061, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.353062152862549, "rewards/margins": 28.156200408935547, "rewards/real": 24.80313491821289, "step": 5800 }, { "epoch": 0.6986531986531986, "grad_norm": 4.69845125660451, "learning_rate": 1.6742383752004276e-07, "logits/generated": -1.9189367294311523, "logits/real": -2.0586776733398438, "logps/generated": -522.8191528320312, "logps/real": -200.33062744140625, "loss": 0.6415, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.614595413208008, "rewards/margins": 27.717693328857422, "rewards/real": 17.103099822998047, "step": 5810 }, { "epoch": 0.6998556998556998, "grad_norm": 90.75671553332367, "learning_rate": 1.6675574559059326e-07, "logits/generated": -2.0881330966949463, "logits/real": -2.0540738105773926, "logps/generated": -507.21990966796875, "logps/real": -233.40072631835938, "loss": 0.3856, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.9376654624938965, "rewards/margins": 24.910873413085938, "rewards/real": 20.973209381103516, "step": 5820 }, { "epoch": 0.701058201058201, "grad_norm": 132.21204600637103, "learning_rate": 1.6608765366114377e-07, "logits/generated": -2.0674076080322266, "logits/real": -2.0564589500427246, "logps/generated": -470.0575256347656, "logps/real": -280.09295654296875, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/generated": -3.8033595085144043, "rewards/margins": 26.579105377197266, "rewards/real": 22.775747299194336, "step": 5830 }, { "epoch": 0.7022607022607023, "grad_norm": 89.54335729475466, "learning_rate": 1.6541956173169427e-07, "logits/generated": -1.9924952983856201, "logits/real": -2.0267462730407715, "logps/generated": -573.144775390625, "logps/real": -223.96987915039062, "loss": 0.5523, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.361065864562988, "rewards/margins": 23.596874237060547, "rewards/real": 18.235807418823242, "step": 5840 }, { "epoch": 0.7034632034632035, "grad_norm": 320.41593254069187, "learning_rate": 1.6475146980224478e-07, "logits/generated": -2.0367844104766846, "logits/real": -2.0753395557403564, "logps/generated": -585.0662841796875, "logps/real": -220.3904571533203, "loss": 0.469, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -12.774155616760254, "rewards/margins": 27.506397247314453, "rewards/real": 14.732243537902832, "step": 5850 }, { "epoch": 0.7046657046657047, "grad_norm": 6.764828870520677, "learning_rate": 1.640833778727953e-07, "logits/generated": -2.030487060546875, "logits/real": -2.0653622150421143, "logps/generated": -505.8202209472656, "logps/real": -328.5468444824219, "loss": 0.5061, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.1627795696258545, "rewards/margins": 24.476404190063477, "rewards/real": 24.313623428344727, "step": 5860 }, { "epoch": 0.7058682058682059, "grad_norm": 16.987320333356795, "learning_rate": 1.634152859433458e-07, "logits/generated": -1.9706943035125732, "logits/real": -2.079078435897827, "logps/generated": -696.0484619140625, "logps/real": -298.5466003417969, "loss": 0.8071, "rewards/accuracies": 0.925000011920929, "rewards/generated": -8.095951080322266, "rewards/margins": 31.24196434020996, "rewards/real": 23.146013259887695, "step": 5870 }, { "epoch": 0.7070707070707071, "grad_norm": 15.059796604406513, "learning_rate": 1.627471940138963e-07, "logits/generated": -1.966286063194275, "logits/real": -2.106269121170044, "logps/generated": -605.654296875, "logps/real": -273.3966979980469, "loss": 0.6209, "rewards/accuracies": 0.925000011920929, "rewards/generated": -8.536871910095215, "rewards/margins": 28.432214736938477, "rewards/real": 19.895343780517578, "step": 5880 }, { "epoch": 0.7082732082732083, "grad_norm": 27.996792266130523, "learning_rate": 1.620791020844468e-07, "logits/generated": -2.0093941688537598, "logits/real": -2.0758864879608154, "logps/generated": -580.8387451171875, "logps/real": -225.3408966064453, "loss": 0.4954, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.541053295135498, "rewards/margins": 24.935813903808594, "rewards/real": 19.39476203918457, "step": 5890 }, { "epoch": 0.7094757094757095, "grad_norm": 593.1716641091133, "learning_rate": 1.6141101015499731e-07, "logits/generated": -2.03402042388916, "logits/real": -2.0171828269958496, "logps/generated": -601.9954833984375, "logps/real": -259.0872497558594, "loss": 0.5209, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.456506252288818, "rewards/margins": 29.47256088256836, "rewards/real": 23.016056060791016, "step": 5900 }, { "epoch": 0.7106782106782107, "grad_norm": 6.097200252474049, "learning_rate": 1.6074291822554785e-07, "logits/generated": -1.9546695947647095, "logits/real": -2.1085476875305176, "logps/generated": -585.7703857421875, "logps/real": -260.1246337890625, "loss": 0.3801, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.080541133880615, "rewards/margins": 30.35806655883789, "rewards/real": 23.277523040771484, "step": 5910 }, { "epoch": 0.7118807118807119, "grad_norm": 5.425167454398412, "learning_rate": 1.6007482629609833e-07, "logits/generated": -2.052748203277588, "logits/real": -2.085552215576172, "logps/generated": -553.0166625976562, "logps/real": -243.44729614257812, "loss": 0.3871, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.7129673957824707, "rewards/margins": 22.420289993286133, "rewards/real": 19.707324981689453, "step": 5920 }, { "epoch": 0.7130832130832131, "grad_norm": 194.37528617332168, "learning_rate": 1.5940673436664883e-07, "logits/generated": -1.8565423488616943, "logits/real": -2.048894166946411, "logps/generated": -611.2785034179688, "logps/real": -222.39208984375, "loss": 0.5031, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.9310884475708, "rewards/margins": 30.078760147094727, "rewards/real": 18.14767074584961, "step": 5930 }, { "epoch": 0.7142857142857143, "grad_norm": 584.9361748337963, "learning_rate": 1.5873864243719934e-07, "logits/generated": -2.0264651775360107, "logits/real": -1.9897197484970093, "logps/generated": -498.8528747558594, "logps/real": -231.07693481445312, "loss": 0.5602, "rewards/accuracies": 0.875, "rewards/generated": -3.90936541557312, "rewards/margins": 22.191932678222656, "rewards/real": 18.28256607055664, "step": 5940 }, { "epoch": 0.7154882154882155, "grad_norm": 73.77401089746371, "learning_rate": 1.5807055050774987e-07, "logits/generated": -1.9922844171524048, "logits/real": -2.053990602493286, "logps/generated": -582.9013671875, "logps/real": -260.50164794921875, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/generated": -3.423548936843872, "rewards/margins": 27.149404525756836, "rewards/real": 23.72585678100586, "step": 5950 }, { "epoch": 0.7166907166907167, "grad_norm": 264.109642798798, "learning_rate": 1.5740245857830038e-07, "logits/generated": -1.9862995147705078, "logits/real": -1.9136476516723633, "logps/generated": -516.9673461914062, "logps/real": -205.4317626953125, "loss": 0.3064, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.891024589538574, "rewards/margins": 24.194900512695312, "rewards/real": 16.303874969482422, "step": 5960 }, { "epoch": 0.7178932178932179, "grad_norm": 82.02801608322811, "learning_rate": 1.5673436664885086e-07, "logits/generated": -2.0520498752593994, "logits/real": -2.0354788303375244, "logps/generated": -523.3507690429688, "logps/real": -227.84402465820312, "loss": 0.565, "rewards/accuracies": 0.949999988079071, "rewards/generated": 0.1284686028957367, "rewards/margins": 19.374645233154297, "rewards/real": 19.503116607666016, "step": 5970 }, { "epoch": 0.7190957190957191, "grad_norm": 202.0180463553884, "learning_rate": 1.5606627471940136e-07, "logits/generated": -2.0054705142974854, "logits/real": -2.126939058303833, "logps/generated": -569.4356689453125, "logps/real": -259.4375, "loss": 0.4142, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.6010572910308838, "rewards/margins": 21.88002586364746, "rewards/real": 22.4810848236084, "step": 5980 }, { "epoch": 0.7202982202982203, "grad_norm": 16.238480948127794, "learning_rate": 1.553981827899519e-07, "logits/generated": -1.9652553796768188, "logits/real": -1.9859685897827148, "logps/generated": -503.29608154296875, "logps/real": -235.278564453125, "loss": 0.3261, "rewards/accuracies": 1.0, "rewards/generated": -4.676265716552734, "rewards/margins": 25.252643585205078, "rewards/real": 20.576377868652344, "step": 5990 }, { "epoch": 0.7215007215007215, "grad_norm": 4.694603966980558, "learning_rate": 1.547300908605024e-07, "logits/generated": -1.9758752584457397, "logits/real": -2.0604944229125977, "logps/generated": -581.5227661132812, "logps/real": -266.998291015625, "loss": 0.4272, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.795146942138672, "rewards/margins": 28.78839111328125, "rewards/real": 21.993244171142578, "step": 6000 }, { "epoch": 0.7215007215007215, "eval_logits/generated": -2.024104595184326, "eval_logits/real": -2.0559768676757812, "eval_logps/generated": -516.4866333007812, "eval_logps/real": -255.58375549316406, "eval_loss": 0.4181888997554779, "eval_rewards/accuracies": 0.9583333134651184, "eval_rewards/generated": -2.6705148220062256, "eval_rewards/margins": 24.239242553710938, "eval_rewards/real": 21.568729400634766, "eval_runtime": 158.085, "eval_samples_per_second": 6.326, "eval_steps_per_second": 0.531, "step": 6000 }, { "epoch": 0.7227032227032227, "grad_norm": 4.4590254558840705, "learning_rate": 1.540619989310529e-07, "logits/generated": -2.055990219116211, "logits/real": -2.114842176437378, "logps/generated": -518.754638671875, "logps/real": -254.91616821289062, "loss": 0.6621, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.7546546459198, "rewards/margins": 24.697486877441406, "rewards/real": 21.94283103942871, "step": 6010 }, { "epoch": 0.7239057239057239, "grad_norm": 200.0553975932058, "learning_rate": 1.533939070016034e-07, "logits/generated": -1.9855819940567017, "logits/real": -2.0946459770202637, "logps/generated": -540.9707641601562, "logps/real": -222.64013671875, "loss": 0.365, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.403491973876953, "rewards/margins": 29.19797706604004, "rewards/real": 19.794485092163086, "step": 6020 }, { "epoch": 0.7251082251082251, "grad_norm": 3.003949807765099, "learning_rate": 1.5272581507215392e-07, "logits/generated": -1.8843629360198975, "logits/real": -2.0339131355285645, "logps/generated": -535.8517456054688, "logps/real": -227.3661346435547, "loss": 0.4721, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -5.808887958526611, "rewards/margins": 24.485149383544922, "rewards/real": 18.676258087158203, "step": 6030 }, { "epoch": 0.7263107263107264, "grad_norm": 232.73246617854906, "learning_rate": 1.5205772314270443e-07, "logits/generated": -2.076704978942871, "logits/real": -2.051879405975342, "logps/generated": -525.3424072265625, "logps/real": -320.03985595703125, "loss": 0.3817, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -0.6194555163383484, "rewards/margins": 25.497285842895508, "rewards/real": 24.87782859802246, "step": 6040 }, { "epoch": 0.7275132275132276, "grad_norm": 955.3834726679851, "learning_rate": 1.5138963121325494e-07, "logits/generated": -2.004138946533203, "logits/real": -2.0520410537719727, "logps/generated": -532.3648071289062, "logps/real": -284.3950500488281, "loss": 0.4399, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.6230058670043945, "rewards/margins": 29.23556137084961, "rewards/real": 23.612552642822266, "step": 6050 }, { "epoch": 0.7287157287157288, "grad_norm": 428.49457126504836, "learning_rate": 1.5072153928380544e-07, "logits/generated": -2.0264859199523926, "logits/real": -1.9557311534881592, "logps/generated": -502.14862060546875, "logps/real": -221.5615997314453, "loss": 0.4106, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.6217753887176514, "rewards/margins": 22.318477630615234, "rewards/real": 18.696704864501953, "step": 6060 }, { "epoch": 0.72991822991823, "grad_norm": 5.379462742272412, "learning_rate": 1.5005344735435595e-07, "logits/generated": -2.019275426864624, "logits/real": -2.112255334854126, "logps/generated": -654.1192016601562, "logps/real": -278.43084716796875, "loss": 0.3359, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.167357921600342, "rewards/margins": 29.26619529724121, "rewards/real": 25.09884262084961, "step": 6070 }, { "epoch": 0.7311207311207312, "grad_norm": 4.860276357003056, "learning_rate": 1.4938535542490646e-07, "logits/generated": -2.0853078365325928, "logits/real": -2.122804641723633, "logps/generated": -506.30419921875, "logps/real": -212.27352905273438, "loss": 0.5013, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.887076377868652, "rewards/margins": 28.033214569091797, "rewards/real": 19.146141052246094, "step": 6080 }, { "epoch": 0.7323232323232324, "grad_norm": 113.56043168777462, "learning_rate": 1.4871726349545696e-07, "logits/generated": -2.0797107219696045, "logits/real": -2.0907530784606934, "logps/generated": -433.6893615722656, "logps/real": -209.5296173095703, "loss": 0.6372, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -2.3338398933410645, "rewards/margins": 21.73061752319336, "rewards/real": 19.396774291992188, "step": 6090 }, { "epoch": 0.7335257335257336, "grad_norm": 178.6721479700718, "learning_rate": 1.4804917156600747e-07, "logits/generated": -1.9706804752349854, "logits/real": -2.0371265411376953, "logps/generated": -474.51690673828125, "logps/real": -227.3787384033203, "loss": 0.6643, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.39561653137207, "rewards/margins": 23.835596084594727, "rewards/real": 19.439977645874023, "step": 6100 }, { "epoch": 0.7347282347282347, "grad_norm": 6.262325763848138, "learning_rate": 1.47381079636558e-07, "logits/generated": -1.9313364028930664, "logits/real": -1.927351951599121, "logps/generated": -519.8493041992188, "logps/real": -231.4251251220703, "loss": 0.5785, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.794744491577148, "rewards/margins": 22.81560707092285, "rewards/real": 18.020862579345703, "step": 6110 }, { "epoch": 0.7359307359307359, "grad_norm": 626.9184452960253, "learning_rate": 1.4671298770710848e-07, "logits/generated": -1.9691321849822998, "logits/real": -2.0907559394836426, "logps/generated": -637.7929077148438, "logps/real": -257.151123046875, "loss": 0.5389, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.425045013427734, "rewards/margins": 28.295806884765625, "rewards/real": 22.87076187133789, "step": 6120 }, { "epoch": 0.7371332371332371, "grad_norm": 188.77303204853564, "learning_rate": 1.46044895777659e-07, "logits/generated": -1.9789931774139404, "logits/real": -2.029228687286377, "logps/generated": -568.4287719726562, "logps/real": -198.3484344482422, "loss": 0.3117, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.4860520362854, "rewards/margins": 23.19948959350586, "rewards/real": 16.713438034057617, "step": 6130 }, { "epoch": 0.7383357383357383, "grad_norm": 7.017986930229006, "learning_rate": 1.453768038482095e-07, "logits/generated": -1.9300180673599243, "logits/real": -2.0610289573669434, "logps/generated": -556.9735107421875, "logps/real": -248.61489868164062, "loss": 0.3547, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.820444583892822, "rewards/margins": 23.19576644897461, "rewards/real": 18.375324249267578, "step": 6140 }, { "epoch": 0.7395382395382395, "grad_norm": 13.35008192716633, "learning_rate": 1.4470871191876003e-07, "logits/generated": -1.946213722229004, "logits/real": -2.075281858444214, "logps/generated": -557.5587768554688, "logps/real": -218.5151824951172, "loss": 0.6689, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.02005386352539, "rewards/margins": 28.047130584716797, "rewards/real": 20.027080535888672, "step": 6150 }, { "epoch": 0.7407407407407407, "grad_norm": 9.684037448105407, "learning_rate": 1.4404061998931053e-07, "logits/generated": -2.0908665657043457, "logits/real": -2.102464199066162, "logps/generated": -444.47808837890625, "logps/real": -203.1186981201172, "loss": 0.6941, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.7162132263183594, "rewards/margins": 19.0086612701416, "rewards/real": 18.29244613647461, "step": 6160 }, { "epoch": 0.7419432419432419, "grad_norm": 10.34774445413632, "learning_rate": 1.4337252805986101e-07, "logits/generated": -1.8997472524642944, "logits/real": -2.0007100105285645, "logps/generated": -667.5238037109375, "logps/real": -193.0680694580078, "loss": 0.3097, "rewards/accuracies": 1.0, "rewards/generated": -13.620000839233398, "rewards/margins": 33.0391731262207, "rewards/real": 19.419170379638672, "step": 6170 }, { "epoch": 0.7431457431457431, "grad_norm": 561.9992672748349, "learning_rate": 1.4270443613041152e-07, "logits/generated": -2.072580337524414, "logits/real": -2.018799304962158, "logps/generated": -526.4044189453125, "logps/real": -262.51361083984375, "loss": 0.6514, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.1388444900512695, "rewards/margins": 26.15079116821289, "rewards/real": 21.011945724487305, "step": 6180 }, { "epoch": 0.7443482443482443, "grad_norm": 4.51774798412955, "learning_rate": 1.4203634420096205e-07, "logits/generated": -1.9605543613433838, "logits/real": -2.016939163208008, "logps/generated": -458.9302673339844, "logps/real": -176.44093322753906, "loss": 0.2359, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.735542297363281, "rewards/margins": 25.70003318786621, "rewards/real": 15.964492797851562, "step": 6190 }, { "epoch": 0.7455507455507455, "grad_norm": 805.110846002169, "learning_rate": 1.4136825227151256e-07, "logits/generated": -2.067617893218994, "logits/real": -2.074852466583252, "logps/generated": -521.3026123046875, "logps/real": -248.1642608642578, "loss": 0.3533, "rewards/accuracies": 0.949999988079071, "rewards/generated": -0.04916229099035263, "rewards/margins": 21.618560791015625, "rewards/real": 21.569400787353516, "step": 6200 }, { "epoch": 0.7467532467532467, "grad_norm": 4.917719176714294, "learning_rate": 1.4070016034206307e-07, "logits/generated": -2.026477813720703, "logits/real": -2.0135531425476074, "logps/generated": -603.4595947265625, "logps/real": -240.67446899414062, "loss": 0.2961, "rewards/accuracies": 1.0, "rewards/generated": -9.173086166381836, "rewards/margins": 29.656009674072266, "rewards/real": 20.482921600341797, "step": 6210 }, { "epoch": 0.7479557479557479, "grad_norm": 30.80535543136375, "learning_rate": 1.4003206841261355e-07, "logits/generated": -1.9997466802597046, "logits/real": -2.16223406791687, "logps/generated": -482.9454040527344, "logps/real": -225.31100463867188, "loss": 0.4796, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.104976177215576, "rewards/margins": 26.778528213500977, "rewards/real": 20.67354965209961, "step": 6220 }, { "epoch": 0.7491582491582491, "grad_norm": 5.194576848457341, "learning_rate": 1.3936397648316408e-07, "logits/generated": -1.9211280345916748, "logits/real": -2.0602774620056152, "logps/generated": -482.47930908203125, "logps/real": -195.07928466796875, "loss": 0.5361, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.157212257385254, "rewards/margins": 25.305757522583008, "rewards/real": 16.14854621887207, "step": 6230 }, { "epoch": 0.7503607503607503, "grad_norm": 12.592927666077152, "learning_rate": 1.3869588455371459e-07, "logits/generated": -1.9879329204559326, "logits/real": -2.105809211730957, "logps/generated": -668.959228515625, "logps/real": -240.8298797607422, "loss": 0.5272, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.408947944641113, "rewards/margins": 31.326852798461914, "rewards/real": 20.917903900146484, "step": 6240 }, { "epoch": 0.7515632515632515, "grad_norm": 6.367074419526923, "learning_rate": 1.380277926242651e-07, "logits/generated": -1.926006555557251, "logits/real": -2.0523173809051514, "logps/generated": -558.9520263671875, "logps/real": -211.4641571044922, "loss": 0.3938, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.068964004516602, "rewards/margins": 25.008941650390625, "rewards/real": 18.939977645874023, "step": 6250 }, { "epoch": 0.7527657527657527, "grad_norm": 9.75024619425914, "learning_rate": 1.373597006948156e-07, "logits/generated": -2.0039355754852295, "logits/real": -2.078279495239258, "logps/generated": -633.4993286132812, "logps/real": -246.19046020507812, "loss": 0.7311, "rewards/accuracies": 1.0, "rewards/generated": -6.896474361419678, "rewards/margins": 29.550867080688477, "rewards/real": 22.654390335083008, "step": 6260 }, { "epoch": 0.753968253968254, "grad_norm": 6.1874289646490634, "learning_rate": 1.366916087653661e-07, "logits/generated": -2.0454373359680176, "logits/real": -2.014486789703369, "logps/generated": -567.9034423828125, "logps/real": -252.9257049560547, "loss": 0.351, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -3.65690279006958, "rewards/margins": 25.702701568603516, "rewards/real": 22.045795440673828, "step": 6270 }, { "epoch": 0.7551707551707552, "grad_norm": 78.83575585100137, "learning_rate": 1.360235168359166e-07, "logits/generated": -1.9779586791992188, "logits/real": -2.050172805786133, "logps/generated": -512.7911987304688, "logps/real": -200.3998565673828, "loss": 0.4225, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.451817512512207, "rewards/margins": 28.826004028320312, "rewards/real": 19.37418556213379, "step": 6280 }, { "epoch": 0.7563732563732564, "grad_norm": 151.34186107944922, "learning_rate": 1.3535542490646712e-07, "logits/generated": -1.965846061706543, "logits/real": -2.0169053077697754, "logps/generated": -538.800048828125, "logps/real": -193.9132537841797, "loss": 0.9546, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.19506549835205, "rewards/margins": 27.961584091186523, "rewards/real": 18.766517639160156, "step": 6290 }, { "epoch": 0.7575757575757576, "grad_norm": 36.86441070717078, "learning_rate": 1.3468733297701762e-07, "logits/generated": -1.937940001487732, "logits/real": -1.9921560287475586, "logps/generated": -608.825439453125, "logps/real": -214.90573120117188, "loss": 0.2837, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.684560775756836, "rewards/margins": 28.25661849975586, "rewards/real": 17.572059631347656, "step": 6300 }, { "epoch": 0.7587782587782588, "grad_norm": 5.9247245043509045, "learning_rate": 1.3401924104756816e-07, "logits/generated": -1.9313808679580688, "logits/real": -1.9859815835952759, "logps/generated": -597.9343872070312, "logps/real": -250.38735961914062, "loss": 0.5041, "rewards/accuracies": 1.0, "rewards/generated": -15.1850004196167, "rewards/margins": 33.82495880126953, "rewards/real": 18.639957427978516, "step": 6310 }, { "epoch": 0.75998075998076, "grad_norm": 57.69450815352928, "learning_rate": 1.3335114911811864e-07, "logits/generated": -2.0614192485809326, "logits/real": -2.0687270164489746, "logps/generated": -633.4412841796875, "logps/real": -255.1488037109375, "loss": 0.8344, "rewards/accuracies": 1.0, "rewards/generated": -7.370565891265869, "rewards/margins": 29.701574325561523, "rewards/real": 22.331008911132812, "step": 6320 }, { "epoch": 0.7611832611832612, "grad_norm": 17.046417261383656, "learning_rate": 1.3268305718866914e-07, "logits/generated": -1.9685719013214111, "logits/real": -2.101987361907959, "logps/generated": -700.80517578125, "logps/real": -268.86553955078125, "loss": 0.5641, "rewards/accuracies": 0.925000011920929, "rewards/generated": -11.582005500793457, "rewards/margins": 31.284893035888672, "rewards/real": 19.702882766723633, "step": 6330 }, { "epoch": 0.7623857623857624, "grad_norm": 57.379866714393906, "learning_rate": 1.3201496525921965e-07, "logits/generated": -1.9597723484039307, "logits/real": -2.05348801612854, "logps/generated": -749.7882080078125, "logps/real": -304.0247802734375, "loss": 0.421, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.726692199707031, "rewards/margins": 34.836952209472656, "rewards/real": 23.11026382446289, "step": 6340 }, { "epoch": 0.7635882635882636, "grad_norm": 9.697832231116402, "learning_rate": 1.3134687332977018e-07, "logits/generated": -2.0030665397644043, "logits/real": -2.068018913269043, "logps/generated": -565.5142822265625, "logps/real": -237.0439453125, "loss": 0.5869, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.754905700683594, "rewards/margins": 25.79233169555664, "rewards/real": 20.037425994873047, "step": 6350 }, { "epoch": 0.7647907647907648, "grad_norm": 15.4845204333541, "learning_rate": 1.306787814003207e-07, "logits/generated": -2.0362257957458496, "logits/real": -2.0825929641723633, "logps/generated": -501.44171142578125, "logps/real": -245.36727905273438, "loss": 0.342, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.3118367195129395, "rewards/margins": 25.564722061157227, "rewards/real": 21.252885818481445, "step": 6360 }, { "epoch": 0.765993265993266, "grad_norm": 11.618511890856688, "learning_rate": 1.3001068947087117e-07, "logits/generated": -1.9474092721939087, "logits/real": -2.0226938724517822, "logps/generated": -556.8031005859375, "logps/real": -235.0730743408203, "loss": 0.4303, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.593226432800293, "rewards/margins": 26.619098663330078, "rewards/real": 20.02587127685547, "step": 6370 }, { "epoch": 0.7671957671957672, "grad_norm": 63.097235994103166, "learning_rate": 1.2934259754142168e-07, "logits/generated": -1.989145278930664, "logits/real": -2.0506412982940674, "logps/generated": -560.9464111328125, "logps/real": -222.3338623046875, "loss": 0.7063, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.6073455810546875, "rewards/margins": 24.4576358795166, "rewards/real": 17.850292205810547, "step": 6380 }, { "epoch": 0.7683982683982684, "grad_norm": 1148.8564040017468, "learning_rate": 1.286745056119722e-07, "logits/generated": -2.025768756866455, "logits/real": -2.137629270553589, "logps/generated": -731.1598510742188, "logps/real": -310.55816650390625, "loss": 0.402, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.420269966125488, "rewards/margins": 33.447731018066406, "rewards/real": 28.027462005615234, "step": 6390 }, { "epoch": 0.7696007696007696, "grad_norm": 933.7753958864845, "learning_rate": 1.2800641368252272e-07, "logits/generated": -2.009885311126709, "logits/real": -2.0939135551452637, "logps/generated": -620.836181640625, "logps/real": -272.7360534667969, "loss": 0.4951, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.9595947265625, "rewards/margins": 29.82866859436035, "rewards/real": 22.86907196044922, "step": 6400 }, { "epoch": 0.7708032708032708, "grad_norm": 7.801546521060971, "learning_rate": 1.2733832175307322e-07, "logits/generated": -2.013359785079956, "logits/real": -2.1400272846221924, "logps/generated": -516.2764892578125, "logps/real": -220.3167724609375, "loss": 0.6196, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.015575885772705, "rewards/margins": 25.577045440673828, "rewards/real": 19.56147003173828, "step": 6410 }, { "epoch": 0.772005772005772, "grad_norm": 230.38571589406325, "learning_rate": 1.266702298236237e-07, "logits/generated": -1.9715086221694946, "logits/real": -2.0108389854431152, "logps/generated": -586.1846923828125, "logps/real": -239.9268798828125, "loss": 0.5405, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.519183158874512, "rewards/margins": 26.368633270263672, "rewards/real": 18.84945297241211, "step": 6420 }, { "epoch": 0.7732082732082732, "grad_norm": 130.59965528022764, "learning_rate": 1.2600213789417423e-07, "logits/generated": -2.043574810028076, "logits/real": -2.1122307777404785, "logps/generated": -652.2904663085938, "logps/real": -296.2471008300781, "loss": 0.6708, "rewards/accuracies": 0.949999988079071, "rewards/generated": -1.0932037830352783, "rewards/margins": 26.344066619873047, "rewards/real": 25.25086212158203, "step": 6430 }, { "epoch": 0.7744107744107744, "grad_norm": 182.6011016842905, "learning_rate": 1.2533404596472474e-07, "logits/generated": -2.0451924800872803, "logits/real": -2.1155965328216553, "logps/generated": -623.5161743164062, "logps/real": -239.1260986328125, "loss": 0.4863, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.482655048370361, "rewards/margins": 26.55521011352539, "rewards/real": 21.072551727294922, "step": 6440 }, { "epoch": 0.7756132756132756, "grad_norm": 4.40352858864226, "learning_rate": 1.2466595403527525e-07, "logits/generated": -2.032482147216797, "logits/real": -2.0970304012298584, "logps/generated": -574.2525024414062, "logps/real": -230.63027954101562, "loss": 0.5059, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.142547607421875, "rewards/margins": 33.791221618652344, "rewards/real": 20.648677825927734, "step": 6450 }, { "epoch": 0.7768157768157768, "grad_norm": 978.9515729424127, "learning_rate": 1.2399786210582575e-07, "logits/generated": -2.05214262008667, "logits/real": -2.0589637756347656, "logps/generated": -510.7826232910156, "logps/real": -182.57005310058594, "loss": 0.787, "rewards/accuracies": 0.875, "rewards/generated": -3.8935463428497314, "rewards/margins": 19.70840835571289, "rewards/real": 15.814860343933105, "step": 6460 }, { "epoch": 0.778018278018278, "grad_norm": 900.5029847031657, "learning_rate": 1.2332977017637626e-07, "logits/generated": -2.0314135551452637, "logits/real": -2.061591148376465, "logps/generated": -541.0048217773438, "logps/real": -246.12844848632812, "loss": 0.5588, "rewards/accuracies": 0.925000011920929, "rewards/generated": -3.4981799125671387, "rewards/margins": 23.609556198120117, "rewards/real": 20.111377716064453, "step": 6470 }, { "epoch": 0.7792207792207793, "grad_norm": 350.450392517772, "learning_rate": 1.2266167824692677e-07, "logits/generated": -1.9987430572509766, "logits/real": -2.0528876781463623, "logps/generated": -507.3895568847656, "logps/real": -253.5883331298828, "loss": 0.3899, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.739121437072754, "rewards/margins": 27.059429168701172, "rewards/real": 22.320308685302734, "step": 6480 }, { "epoch": 0.7804232804232805, "grad_norm": 2.570811632585443, "learning_rate": 1.2199358631747727e-07, "logits/generated": -2.061544179916382, "logits/real": -1.9877605438232422, "logps/generated": -541.1376342773438, "logps/real": -254.91421508789062, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/generated": -3.4496307373046875, "rewards/margins": 26.540197372436523, "rewards/real": 23.090566635131836, "step": 6490 }, { "epoch": 0.7816257816257817, "grad_norm": 400.89249109207947, "learning_rate": 1.2132549438802778e-07, "logits/generated": -2.032627582550049, "logits/real": -2.1158928871154785, "logps/generated": -471.20208740234375, "logps/real": -218.10458374023438, "loss": 0.5599, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.873850345611572, "rewards/margins": 26.938655853271484, "rewards/real": 20.064807891845703, "step": 6500 }, { "epoch": 0.7828282828282829, "grad_norm": 226.3122224179574, "learning_rate": 1.2065740245857829e-07, "logits/generated": -1.9897022247314453, "logits/real": -2.034402370452881, "logps/generated": -495.94775390625, "logps/real": -235.3659210205078, "loss": 0.4043, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.257264137268066, "rewards/margins": 25.510311126708984, "rewards/real": 21.253047943115234, "step": 6510 }, { "epoch": 0.7840307840307841, "grad_norm": 18.683962626117665, "learning_rate": 1.1998931052912882e-07, "logits/generated": -1.992257833480835, "logits/real": -2.17830228805542, "logps/generated": -511.45526123046875, "logps/real": -207.0403289794922, "loss": 0.8142, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.154990196228027, "rewards/margins": 26.8048095703125, "rewards/real": 19.649816513061523, "step": 6520 }, { "epoch": 0.7852332852332853, "grad_norm": 40.479713613093054, "learning_rate": 1.193212185996793e-07, "logits/generated": -1.9591795206069946, "logits/real": -2.120176315307617, "logps/generated": -476.0768127441406, "logps/real": -264.5736389160156, "loss": 0.3787, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.221365451812744, "rewards/margins": 26.010305404663086, "rewards/real": 20.788936614990234, "step": 6530 }, { "epoch": 0.7864357864357865, "grad_norm": 546.0181649497696, "learning_rate": 1.1865312667022982e-07, "logits/generated": -1.9258826971054077, "logits/real": -2.137423276901245, "logps/generated": -565.8904418945312, "logps/real": -247.72500610351562, "loss": 0.5775, "rewards/accuracies": 0.925000011920929, "rewards/generated": -8.614789962768555, "rewards/margins": 28.28069496154785, "rewards/real": 19.665903091430664, "step": 6540 }, { "epoch": 0.7876382876382877, "grad_norm": 201.09913616241806, "learning_rate": 1.1798503474078033e-07, "logits/generated": -1.8074251413345337, "logits/real": -2.046027421951294, "logps/generated": -423.1866760253906, "logps/real": -219.33309936523438, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/generated": -9.052923202514648, "rewards/margins": 29.064559936523438, "rewards/real": 20.01163101196289, "step": 6550 }, { "epoch": 0.7888407888407888, "grad_norm": 770.987711097659, "learning_rate": 1.1731694281133083e-07, "logits/generated": -2.021021604537964, "logits/real": -2.1250627040863037, "logps/generated": -503.29962158203125, "logps/real": -233.9160919189453, "loss": 0.452, "rewards/accuracies": 0.875, "rewards/generated": -11.509394645690918, "rewards/margins": 31.916366577148438, "rewards/real": 20.406970977783203, "step": 6560 }, { "epoch": 0.79004329004329, "grad_norm": 9.47549186282315, "learning_rate": 1.1664885088188134e-07, "logits/generated": -1.9661697149276733, "logits/real": -2.0995280742645264, "logps/generated": -666.216552734375, "logps/real": -290.75347900390625, "loss": 0.5366, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.03934097290039, "rewards/margins": 35.27936553955078, "rewards/real": 26.240026473999023, "step": 6570 }, { "epoch": 0.7912457912457912, "grad_norm": 573.4073702302659, "learning_rate": 1.1598075895243186e-07, "logits/generated": -2.035170078277588, "logits/real": -2.088874340057373, "logps/generated": -565.6022338867188, "logps/real": -191.28726196289062, "loss": 0.5497, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -12.782261848449707, "rewards/margins": 30.251855850219727, "rewards/real": 17.469593048095703, "step": 6580 }, { "epoch": 0.7924482924482924, "grad_norm": 6.662413515690038, "learning_rate": 1.1531266702298235e-07, "logits/generated": -1.9405333995819092, "logits/real": -1.9991796016693115, "logps/generated": -465.4478454589844, "logps/real": -166.42901611328125, "loss": 0.5633, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.728341102600098, "rewards/margins": 22.969757080078125, "rewards/real": 15.241415023803711, "step": 6590 }, { "epoch": 0.7936507936507936, "grad_norm": 7.935999121687919, "learning_rate": 1.1464457509353287e-07, "logits/generated": -2.0232961177825928, "logits/real": -2.1475448608398438, "logps/generated": -708.168701171875, "logps/real": -335.34228515625, "loss": 0.6386, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.621035575866699, "rewards/margins": 35.099361419677734, "rewards/real": 27.47833251953125, "step": 6600 }, { "epoch": 0.7948532948532948, "grad_norm": 168.3026198119923, "learning_rate": 1.1397648316408336e-07, "logits/generated": -1.8957293033599854, "logits/real": -2.0679030418395996, "logps/generated": -671.3610229492188, "logps/real": -263.46990966796875, "loss": 0.6755, "rewards/accuracies": 0.925000011920929, "rewards/generated": -11.244115829467773, "rewards/margins": 35.23192596435547, "rewards/real": 23.987808227539062, "step": 6610 }, { "epoch": 0.796055796055796, "grad_norm": 9.840976990538804, "learning_rate": 1.1330839123463388e-07, "logits/generated": -1.9226858615875244, "logits/real": -2.0272715091705322, "logps/generated": -622.4281616210938, "logps/real": -232.2010955810547, "loss": 0.4167, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.895415306091309, "rewards/margins": 32.337223052978516, "rewards/real": 22.441804885864258, "step": 6620 }, { "epoch": 0.7972582972582972, "grad_norm": 3.6905020279777365, "learning_rate": 1.1264029930518439e-07, "logits/generated": -1.980645775794983, "logits/real": -2.062753438949585, "logps/generated": -537.4622802734375, "logps/real": -228.3039093017578, "loss": 0.4354, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.7731428146362305, "rewards/margins": 26.594839096069336, "rewards/real": 20.821697235107422, "step": 6630 }, { "epoch": 0.7984607984607984, "grad_norm": 222.54171218343046, "learning_rate": 1.119722073757349e-07, "logits/generated": -1.9163379669189453, "logits/real": -1.9841057062149048, "logps/generated": -630.6839599609375, "logps/real": -251.23843383789062, "loss": 0.55, "rewards/accuracies": 1.0, "rewards/generated": -11.036982536315918, "rewards/margins": 32.203826904296875, "rewards/real": 21.16684341430664, "step": 6640 }, { "epoch": 0.7996632996632996, "grad_norm": 758.4916729267836, "learning_rate": 1.113041154462854e-07, "logits/generated": -1.8351774215698242, "logits/real": -1.9700706005096436, "logps/generated": -513.4570922851562, "logps/real": -160.77664184570312, "loss": 0.3732, "rewards/accuracies": 1.0, "rewards/generated": -14.325859069824219, "rewards/margins": 29.440387725830078, "rewards/real": 15.114524841308594, "step": 6650 }, { "epoch": 0.8008658008658008, "grad_norm": 130.25800665654907, "learning_rate": 1.1063602351683592e-07, "logits/generated": -1.9696983098983765, "logits/real": -2.020036458969116, "logps/generated": -462.839111328125, "logps/real": -241.97744750976562, "loss": 0.795, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.808574199676514, "rewards/margins": 26.586395263671875, "rewards/real": 21.77781867980957, "step": 6660 }, { "epoch": 0.802068302068302, "grad_norm": 16.445419819966222, "learning_rate": 1.0996793158738642e-07, "logits/generated": -1.9043128490447998, "logits/real": -2.0672061443328857, "logps/generated": -650.8314208984375, "logps/real": -262.8508605957031, "loss": 0.384, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.162294387817383, "rewards/margins": 29.417739868164062, "rewards/real": 21.25544548034668, "step": 6670 }, { "epoch": 0.8032708032708032, "grad_norm": 7.624439658847343, "learning_rate": 1.0929983965793694e-07, "logits/generated": -1.993038535118103, "logits/real": -2.048717498779297, "logps/generated": -583.8704833984375, "logps/real": -280.97979736328125, "loss": 0.3869, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.1804001331329346, "rewards/margins": 25.349430084228516, "rewards/real": 23.169029235839844, "step": 6680 }, { "epoch": 0.8044733044733045, "grad_norm": 933.4362052887401, "learning_rate": 1.0863174772848743e-07, "logits/generated": -1.9302880764007568, "logits/real": -2.0569405555725098, "logps/generated": -593.2938232421875, "logps/real": -216.85385131835938, "loss": 0.5222, "rewards/accuracies": 0.925000011920929, "rewards/generated": -10.829951286315918, "rewards/margins": 31.20162582397461, "rewards/real": 20.371673583984375, "step": 6690 }, { "epoch": 0.8056758056758057, "grad_norm": 4.893149792148093, "learning_rate": 1.0796365579903795e-07, "logits/generated": -1.9423173666000366, "logits/real": -2.0856854915618896, "logps/generated": -676.6719360351562, "logps/real": -263.2870178222656, "loss": 0.4651, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.902135848999023, "rewards/margins": 33.78864669799805, "rewards/real": 23.88651466369629, "step": 6700 }, { "epoch": 0.8068783068783069, "grad_norm": 11.948304545504657, "learning_rate": 1.0729556386958845e-07, "logits/generated": -1.8900234699249268, "logits/real": -2.0345757007598877, "logps/generated": -510.62908935546875, "logps/real": -217.3412628173828, "loss": 0.4978, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.852546691894531, "rewards/margins": 27.93526840209961, "rewards/real": 19.08272361755371, "step": 6710 }, { "epoch": 0.8080808080808081, "grad_norm": 864.3594472666391, "learning_rate": 1.0662747194013896e-07, "logits/generated": -1.904443383216858, "logits/real": -2.0020639896392822, "logps/generated": -621.1746826171875, "logps/real": -263.2376403808594, "loss": 0.8638, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -3.8390984535217285, "rewards/margins": 24.18063735961914, "rewards/real": 20.34153938293457, "step": 6720 }, { "epoch": 0.8092833092833093, "grad_norm": 158.72252802279118, "learning_rate": 1.0595938001068947e-07, "logits/generated": -1.909276008605957, "logits/real": -2.0260133743286133, "logps/generated": -518.3134765625, "logps/real": -230.59768676757812, "loss": 0.55, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.227375984191895, "rewards/margins": 30.50686264038086, "rewards/real": 20.279489517211914, "step": 6730 }, { "epoch": 0.8104858104858105, "grad_norm": 7.650011866937278, "learning_rate": 1.0529128808123997e-07, "logits/generated": -1.8908302783966064, "logits/real": -2.0459790229797363, "logps/generated": -572.00390625, "logps/real": -218.95510864257812, "loss": 0.3082, "rewards/accuracies": 1.0, "rewards/generated": -9.059887886047363, "rewards/margins": 27.71695899963379, "rewards/real": 18.657072067260742, "step": 6740 }, { "epoch": 0.8116883116883117, "grad_norm": 115.32391375793355, "learning_rate": 1.0462319615179048e-07, "logits/generated": -1.9581515789031982, "logits/real": -2.0542149543762207, "logps/generated": -551.6161499023438, "logps/real": -274.6429443359375, "loss": 0.2775, "rewards/accuracies": 1.0, "rewards/generated": -6.853793144226074, "rewards/margins": 29.69968605041504, "rewards/real": 22.845888137817383, "step": 6750 }, { "epoch": 0.8128908128908129, "grad_norm": 74.47718182874267, "learning_rate": 1.03955104222341e-07, "logits/generated": -2.052121639251709, "logits/real": -2.0548155307769775, "logps/generated": -547.2852783203125, "logps/real": -237.0869140625, "loss": 0.3278, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.021565437316895, "rewards/margins": 28.7548828125, "rewards/real": 20.73331642150879, "step": 6760 }, { "epoch": 0.8140933140933141, "grad_norm": 152.04333408446354, "learning_rate": 1.032870122928915e-07, "logits/generated": -2.066861867904663, "logits/real": -2.1651744842529297, "logps/generated": -582.880615234375, "logps/real": -310.56732177734375, "loss": 0.5398, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.206866502761841, "rewards/margins": 24.49967384338379, "rewards/real": 26.706539154052734, "step": 6770 }, { "epoch": 0.8152958152958153, "grad_norm": 9.084553097851261, "learning_rate": 1.0261892036344201e-07, "logits/generated": -1.9865939617156982, "logits/real": -2.064363479614258, "logps/generated": -642.2066650390625, "logps/real": -245.4627227783203, "loss": 0.4214, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.984448432922363, "rewards/margins": 35.89861297607422, "rewards/real": 22.91416358947754, "step": 6780 }, { "epoch": 0.8164983164983165, "grad_norm": 580.8818222282044, "learning_rate": 1.019508284339925e-07, "logits/generated": -2.004951000213623, "logits/real": -2.1178276538848877, "logps/generated": -463.5888671875, "logps/real": -223.80380249023438, "loss": 0.5307, "rewards/accuracies": 0.875, "rewards/generated": -3.36588978767395, "rewards/margins": 22.291778564453125, "rewards/real": 18.925886154174805, "step": 6790 }, { "epoch": 0.8177008177008177, "grad_norm": 21.60433737742114, "learning_rate": 1.0128273650454303e-07, "logits/generated": -1.9284942150115967, "logits/real": -2.1252195835113525, "logps/generated": -642.3855590820312, "logps/real": -238.59030151367188, "loss": 0.4324, "rewards/accuracies": 0.949999988079071, "rewards/generated": -12.920016288757324, "rewards/margins": 36.168033599853516, "rewards/real": 23.24801254272461, "step": 6800 }, { "epoch": 0.8189033189033189, "grad_norm": 5.858153105500669, "learning_rate": 1.0061464457509353e-07, "logits/generated": -1.8915159702301025, "logits/real": -2.036672592163086, "logps/generated": -510.6033630371094, "logps/real": -263.3202209472656, "loss": 0.6308, "rewards/accuracies": 0.925000011920929, "rewards/generated": -10.245492935180664, "rewards/margins": 29.829025268554688, "rewards/real": 19.58353042602539, "step": 6810 }, { "epoch": 0.8201058201058201, "grad_norm": 225.78300369364672, "learning_rate": 9.994655264564404e-08, "logits/generated": -1.978859543800354, "logits/real": -1.9988905191421509, "logps/generated": -479.1241149902344, "logps/real": -235.0752716064453, "loss": 0.455, "rewards/accuracies": 1.0, "rewards/generated": -4.077695369720459, "rewards/margins": 27.0776424407959, "rewards/real": 22.999948501586914, "step": 6820 }, { "epoch": 0.8213083213083213, "grad_norm": 133.61865275911646, "learning_rate": 9.927846071619455e-08, "logits/generated": -1.9701054096221924, "logits/real": -2.0301175117492676, "logps/generated": -533.6475219726562, "logps/real": -220.45571899414062, "loss": 0.5627, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.806885719299316, "rewards/margins": 25.719934463500977, "rewards/real": 18.913049697875977, "step": 6830 }, { "epoch": 0.8225108225108225, "grad_norm": 170.27248987133945, "learning_rate": 9.861036878674505e-08, "logits/generated": -1.928148865699768, "logits/real": -2.001552104949951, "logps/generated": -599.8140258789062, "logps/real": -286.3150329589844, "loss": 0.3573, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.314204216003418, "rewards/margins": 30.91522216796875, "rewards/real": 23.601016998291016, "step": 6840 }, { "epoch": 0.8237133237133237, "grad_norm": 4.315270534391476, "learning_rate": 9.794227685729556e-08, "logits/generated": -1.9817625284194946, "logits/real": -2.1318881511688232, "logps/generated": -531.9886474609375, "logps/real": -194.29922485351562, "loss": 0.2382, "rewards/accuracies": 1.0, "rewards/generated": -10.49198055267334, "rewards/margins": 29.082393646240234, "rewards/real": 18.590412139892578, "step": 6850 }, { "epoch": 0.8249158249158249, "grad_norm": 86.45065458537489, "learning_rate": 9.727418492784608e-08, "logits/generated": -1.9709510803222656, "logits/real": -2.0084948539733887, "logps/generated": -638.9036254882812, "logps/real": -209.0804901123047, "loss": 0.5148, "rewards/accuracies": 0.949999988079071, "rewards/generated": -13.607638359069824, "rewards/margins": 31.35004234313965, "rewards/real": 17.74240493774414, "step": 6860 }, { "epoch": 0.8261183261183261, "grad_norm": 4.1963448755801105, "learning_rate": 9.660609299839657e-08, "logits/generated": -2.0052900314331055, "logits/real": -2.0138003826141357, "logps/generated": -534.0217895507812, "logps/real": -248.12564086914062, "loss": 0.3906, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -7.690367698669434, "rewards/margins": 29.35007667541504, "rewards/real": 21.659709930419922, "step": 6870 }, { "epoch": 0.8273208273208273, "grad_norm": 62.15658277048181, "learning_rate": 9.593800106894709e-08, "logits/generated": -2.0607285499572754, "logits/real": -2.098620653152466, "logps/generated": -441.8622131347656, "logps/real": -192.32180786132812, "loss": 0.374, "rewards/accuracies": 0.925000011920929, "rewards/generated": 0.18954944610595703, "rewards/margins": 17.867490768432617, "rewards/real": 18.057039260864258, "step": 6880 }, { "epoch": 0.8285233285233286, "grad_norm": 804.5373140504205, "learning_rate": 9.526990913949758e-08, "logits/generated": -1.9349445104599, "logits/real": -2.020285129547119, "logps/generated": -498.94610595703125, "logps/real": -209.37149047851562, "loss": 0.4807, "rewards/accuracies": 1.0, "rewards/generated": -9.69743537902832, "rewards/margins": 29.25558853149414, "rewards/real": 19.558155059814453, "step": 6890 }, { "epoch": 0.8297258297258298, "grad_norm": 75.33035304692565, "learning_rate": 9.46018172100481e-08, "logits/generated": -2.0229721069335938, "logits/real": -2.096458911895752, "logps/generated": -484.26904296875, "logps/real": -217.74746704101562, "loss": 0.3872, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.0623698234558105, "rewards/margins": 24.061389923095703, "rewards/real": 20.999019622802734, "step": 6900 }, { "epoch": 0.830928330928331, "grad_norm": 97.15901976461026, "learning_rate": 9.393372528059861e-08, "logits/generated": -1.9885648488998413, "logits/real": -2.04494571685791, "logps/generated": -567.73974609375, "logps/real": -232.93212890625, "loss": 0.7692, "rewards/accuracies": 0.925000011920929, "rewards/generated": -6.393340110778809, "rewards/margins": 28.214279174804688, "rewards/real": 21.820940017700195, "step": 6910 }, { "epoch": 0.8321308321308322, "grad_norm": 35.0444948605439, "learning_rate": 9.326563335114912e-08, "logits/generated": -2.0096848011016846, "logits/real": -2.067962169647217, "logps/generated": -633.2785034179688, "logps/real": -288.0771484375, "loss": 0.4973, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.354552268981934, "rewards/margins": 31.78200340270996, "rewards/real": 23.42745018005371, "step": 6920 }, { "epoch": 0.8333333333333334, "grad_norm": 123.22927466145892, "learning_rate": 9.259754142169962e-08, "logits/generated": -1.9568450450897217, "logits/real": -2.014069080352783, "logps/generated": -572.5548095703125, "logps/real": -266.781982421875, "loss": 0.6136, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -7.571218967437744, "rewards/margins": 29.4760799407959, "rewards/real": 21.904863357543945, "step": 6930 }, { "epoch": 0.8345358345358346, "grad_norm": 122.64015784986167, "learning_rate": 9.192944949225013e-08, "logits/generated": -1.8873875141143799, "logits/real": -2.0744128227233887, "logps/generated": -655.6689453125, "logps/real": -219.0276336669922, "loss": 0.4071, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -16.227237701416016, "rewards/margins": 32.821048736572266, "rewards/real": 16.59380531311035, "step": 6940 }, { "epoch": 0.8357383357383358, "grad_norm": 494.9675003166132, "learning_rate": 9.126135756280064e-08, "logits/generated": -1.9662196636199951, "logits/real": -2.0359504222869873, "logps/generated": -632.2025146484375, "logps/real": -303.913330078125, "loss": 0.4842, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.139708518981934, "rewards/margins": 30.668176651000977, "rewards/real": 25.52846336364746, "step": 6950 }, { "epoch": 0.836940836940837, "grad_norm": 244.69796644920746, "learning_rate": 9.059326563335116e-08, "logits/generated": -1.9423549175262451, "logits/real": -2.023282527923584, "logps/generated": -599.820068359375, "logps/real": -240.3815460205078, "loss": 0.5156, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.402791023254395, "rewards/margins": 30.808792114257812, "rewards/real": 20.4060001373291, "step": 6960 }, { "epoch": 0.8381433381433382, "grad_norm": 159.58700442465454, "learning_rate": 8.992517370390165e-08, "logits/generated": -1.9688711166381836, "logits/real": -2.034064769744873, "logps/generated": -570.5164794921875, "logps/real": -255.22421264648438, "loss": 0.4113, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.9481892585754395, "rewards/margins": 27.642398834228516, "rewards/real": 22.694210052490234, "step": 6970 }, { "epoch": 0.8393458393458394, "grad_norm": 12.64567192145013, "learning_rate": 8.925708177445217e-08, "logits/generated": -1.9905973672866821, "logits/real": -2.032841205596924, "logps/generated": -592.6995849609375, "logps/real": -225.61178588867188, "loss": 0.302, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -6.645966529846191, "rewards/margins": 25.207401275634766, "rewards/real": 18.56143569946289, "step": 6980 }, { "epoch": 0.8405483405483406, "grad_norm": 271.33612272282033, "learning_rate": 8.858898984500266e-08, "logits/generated": -1.8832422494888306, "logits/real": -2.047926187515259, "logps/generated": -593.6348876953125, "logps/real": -221.31332397460938, "loss": 0.575, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -12.091665267944336, "rewards/margins": 30.641738891601562, "rewards/real": 18.55006980895996, "step": 6990 }, { "epoch": 0.8417508417508418, "grad_norm": 93.43620222046046, "learning_rate": 8.792089791555318e-08, "logits/generated": -1.8331258296966553, "logits/real": -1.9706172943115234, "logps/generated": -571.3441162109375, "logps/real": -230.90087890625, "loss": 0.408, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.930742263793945, "rewards/margins": 31.25909996032715, "rewards/real": 20.328359603881836, "step": 7000 }, { "epoch": 0.8417508417508418, "eval_logits/generated": -1.9645313024520874, "eval_logits/real": -2.0342915058135986, "eval_logps/generated": -586.2899169921875, "eval_logps/real": -257.3894958496094, "eval_loss": 0.38710272312164307, "eval_rewards/accuracies": 0.9732142686843872, "eval_rewards/generated": -9.650843620300293, "eval_rewards/margins": 31.038997650146484, "eval_rewards/real": 21.388153076171875, "eval_runtime": 158.3613, "eval_samples_per_second": 6.315, "eval_steps_per_second": 0.53, "step": 7000 }, { "epoch": 0.8429533429533429, "grad_norm": 517.0074744751448, "learning_rate": 8.725280598610369e-08, "logits/generated": -1.8653123378753662, "logits/real": -1.953324556350708, "logps/generated": -561.4413452148438, "logps/real": -200.07469177246094, "loss": 0.6815, "rewards/accuracies": 0.925000011920929, "rewards/generated": -13.99188232421875, "rewards/margins": 30.130197525024414, "rewards/real": 16.13831901550293, "step": 7010 }, { "epoch": 0.8441558441558441, "grad_norm": 41.85935550831186, "learning_rate": 8.65847140566542e-08, "logits/generated": -1.9038314819335938, "logits/real": -1.9930827617645264, "logps/generated": -547.8190307617188, "logps/real": -272.4012145996094, "loss": 0.5132, "rewards/accuracies": 0.925000011920929, "rewards/generated": -4.457758903503418, "rewards/margins": 28.457626342773438, "rewards/real": 23.999866485595703, "step": 7020 }, { "epoch": 0.8453583453583453, "grad_norm": 44.89850728430392, "learning_rate": 8.59166221272047e-08, "logits/generated": -1.8842484951019287, "logits/real": -2.023623466491699, "logps/generated": -588.6026000976562, "logps/real": -243.3715057373047, "loss": 0.4327, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -14.154241561889648, "rewards/margins": 35.011322021484375, "rewards/real": 20.857078552246094, "step": 7030 }, { "epoch": 0.8465608465608465, "grad_norm": 471.8481253389293, "learning_rate": 8.524853019775521e-08, "logits/generated": -1.8785381317138672, "logits/real": -2.0542819499969482, "logps/generated": -779.9952392578125, "logps/real": -300.3929748535156, "loss": 0.4442, "rewards/accuracies": 1.0, "rewards/generated": -14.369863510131836, "rewards/margins": 42.48217010498047, "rewards/real": 28.112308502197266, "step": 7040 }, { "epoch": 0.8477633477633477, "grad_norm": 166.0510317451011, "learning_rate": 8.458043826830571e-08, "logits/generated": -1.9051055908203125, "logits/real": -1.9138116836547852, "logps/generated": -497.93084716796875, "logps/real": -223.7330780029297, "loss": 0.3103, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.9573392868042, "rewards/margins": 27.659826278686523, "rewards/real": 18.702489852905273, "step": 7050 }, { "epoch": 0.8489658489658489, "grad_norm": 10.610026437075513, "learning_rate": 8.391234633885623e-08, "logits/generated": -1.8697566986083984, "logits/real": -1.965914011001587, "logps/generated": -640.02001953125, "logps/real": -240.3203887939453, "loss": 0.5067, "rewards/accuracies": 0.949999988079071, "rewards/generated": -13.202654838562012, "rewards/margins": 33.456077575683594, "rewards/real": 20.253421783447266, "step": 7060 }, { "epoch": 0.8501683501683501, "grad_norm": 64.04734108668225, "learning_rate": 8.324425440940673e-08, "logits/generated": -1.9457658529281616, "logits/real": -2.020012140274048, "logps/generated": -608.0465087890625, "logps/real": -256.51568603515625, "loss": 0.2771, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.455371856689453, "rewards/margins": 29.595605850219727, "rewards/real": 22.14023208618164, "step": 7070 }, { "epoch": 0.8513708513708513, "grad_norm": 734.7277589875729, "learning_rate": 8.257616247995723e-08, "logits/generated": -1.9419653415679932, "logits/real": -1.950111985206604, "logps/generated": -528.2342529296875, "logps/real": -194.86752319335938, "loss": 0.6012, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.547017574310303, "rewards/margins": 24.086828231811523, "rewards/real": 17.539810180664062, "step": 7080 }, { "epoch": 0.8525733525733525, "grad_norm": 10.21439558684444, "learning_rate": 8.190807055050774e-08, "logits/generated": -1.960397720336914, "logits/real": -2.0412917137145996, "logps/generated": -646.4866943359375, "logps/real": -210.7541961669922, "loss": 0.5506, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.799673080444336, "rewards/margins": 29.200973510742188, "rewards/real": 20.40129852294922, "step": 7090 }, { "epoch": 0.8537758537758537, "grad_norm": 8.265386169096248, "learning_rate": 8.123997862105825e-08, "logits/generated": -1.9347922801971436, "logits/real": -1.9989608526229858, "logps/generated": -509.216796875, "logps/real": -190.09078979492188, "loss": 0.5294, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.542779922485352, "rewards/margins": 26.825023651123047, "rewards/real": 16.28224754333496, "step": 7100 }, { "epoch": 0.854978354978355, "grad_norm": 94.99077031729618, "learning_rate": 8.057188669160877e-08, "logits/generated": -1.9375168085098267, "logits/real": -2.0451226234436035, "logps/generated": -736.7534790039062, "logps/real": -355.28924560546875, "loss": 0.4881, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.175879955291748, "rewards/margins": 34.601112365722656, "rewards/real": 30.425235748291016, "step": 7110 }, { "epoch": 0.8561808561808562, "grad_norm": 326.2487012399304, "learning_rate": 7.990379476215926e-08, "logits/generated": -1.9523429870605469, "logits/real": -2.1103038787841797, "logps/generated": -689.837158203125, "logps/real": -298.9279479980469, "loss": 0.432, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.586548805236816, "rewards/margins": 35.03469467163086, "rewards/real": 24.44814682006836, "step": 7120 }, { "epoch": 0.8573833573833574, "grad_norm": 588.9535954751599, "learning_rate": 7.923570283270978e-08, "logits/generated": -2.0439341068267822, "logits/real": -2.042471408843994, "logps/generated": -539.2032470703125, "logps/real": -226.07455444335938, "loss": 0.6732, "rewards/accuracies": 0.875, "rewards/generated": -7.877067565917969, "rewards/margins": 27.782094955444336, "rewards/real": 19.905027389526367, "step": 7130 }, { "epoch": 0.8585858585858586, "grad_norm": 8.447035038625407, "learning_rate": 7.856761090326027e-08, "logits/generated": -1.9612295627593994, "logits/real": -1.9542171955108643, "logps/generated": -550.6262817382812, "logps/real": -221.4468994140625, "loss": 0.4235, "rewards/accuracies": 0.875, "rewards/generated": -2.9779059886932373, "rewards/margins": 23.170961380004883, "rewards/real": 20.19305419921875, "step": 7140 }, { "epoch": 0.8597883597883598, "grad_norm": 10.612418017872224, "learning_rate": 7.789951897381079e-08, "logits/generated": -1.9437658786773682, "logits/real": -2.0600814819335938, "logps/generated": -625.6292724609375, "logps/real": -257.81756591796875, "loss": 0.3105, "rewards/accuracies": 0.949999988079071, "rewards/generated": -13.496139526367188, "rewards/margins": 36.835514068603516, "rewards/real": 23.339374542236328, "step": 7150 }, { "epoch": 0.860990860990861, "grad_norm": 63.352885973480475, "learning_rate": 7.72314270443613e-08, "logits/generated": -1.8945993185043335, "logits/real": -2.0037083625793457, "logps/generated": -586.4344482421875, "logps/real": -275.34893798828125, "loss": 0.791, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.72334098815918, "rewards/margins": 27.0490665435791, "rewards/real": 22.325727462768555, "step": 7160 }, { "epoch": 0.8621933621933622, "grad_norm": 2.8121678032177697, "learning_rate": 7.65633351149118e-08, "logits/generated": -1.8705205917358398, "logits/real": -2.000699520111084, "logps/generated": -539.0193481445312, "logps/real": -236.5692138671875, "loss": 0.4325, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.5588555335998535, "rewards/margins": 25.52818489074707, "rewards/real": 20.969329833984375, "step": 7170 }, { "epoch": 0.8633958633958634, "grad_norm": 8.795268136151748, "learning_rate": 7.589524318546231e-08, "logits/generated": -1.9034866094589233, "logits/real": -1.9636344909667969, "logps/generated": -595.8299560546875, "logps/real": -272.61981201171875, "loss": 0.5842, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.489290237426758, "rewards/margins": 33.71501541137695, "rewards/real": 23.225725173950195, "step": 7180 }, { "epoch": 0.8645983645983646, "grad_norm": 3.5256842285820325, "learning_rate": 7.522715125601283e-08, "logits/generated": -1.8724931478500366, "logits/real": -1.9148505926132202, "logps/generated": -525.8731689453125, "logps/real": -235.45309448242188, "loss": 0.2558, "rewards/accuracies": 1.0, "rewards/generated": -12.958587646484375, "rewards/margins": 34.816864013671875, "rewards/real": 21.858278274536133, "step": 7190 }, { "epoch": 0.8658008658008658, "grad_norm": 16.04433358783389, "learning_rate": 7.455905932656332e-08, "logits/generated": -1.9577831029891968, "logits/real": -2.068490982055664, "logps/generated": -503.7447814941406, "logps/real": -236.80325317382812, "loss": 0.4185, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.415589332580566, "rewards/margins": 26.97598648071289, "rewards/real": 19.56039810180664, "step": 7200 }, { "epoch": 0.867003367003367, "grad_norm": 18.83334364700047, "learning_rate": 7.389096739711384e-08, "logits/generated": -1.9357426166534424, "logits/real": -2.0584323406219482, "logps/generated": -594.2290649414062, "logps/real": -190.7951202392578, "loss": 0.561, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.759885787963867, "rewards/margins": 29.231525421142578, "rewards/real": 18.471637725830078, "step": 7210 }, { "epoch": 0.8682058682058682, "grad_norm": 5.678415180008009, "learning_rate": 7.322287546766434e-08, "logits/generated": -1.943680763244629, "logits/real": -1.9875980615615845, "logps/generated": -566.5198364257812, "logps/real": -239.02627563476562, "loss": 0.5686, "rewards/accuracies": 0.925000011920929, "rewards/generated": -10.269927978515625, "rewards/margins": 30.888427734375, "rewards/real": 20.61849594116211, "step": 7220 }, { "epoch": 0.8694083694083694, "grad_norm": 12.09045317910382, "learning_rate": 7.255478353821486e-08, "logits/generated": -1.8886387348175049, "logits/real": -1.9483449459075928, "logps/generated": -642.1652221679688, "logps/real": -291.56207275390625, "loss": 0.6902, "rewards/accuracies": 0.925000011920929, "rewards/generated": -9.111310958862305, "rewards/margins": 31.13655662536621, "rewards/real": 22.025249481201172, "step": 7230 }, { "epoch": 0.8706108706108706, "grad_norm": 254.62835920882608, "learning_rate": 7.188669160876536e-08, "logits/generated": -1.8878610134124756, "logits/real": -2.057114362716675, "logps/generated": -629.2080078125, "logps/real": -266.1337585449219, "loss": 0.4407, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.400242805480957, "rewards/margins": 33.62870407104492, "rewards/real": 23.22845458984375, "step": 7240 }, { "epoch": 0.8718133718133718, "grad_norm": 13.901591824064043, "learning_rate": 7.121859967931587e-08, "logits/generated": -1.9008684158325195, "logits/real": -2.0401580333709717, "logps/generated": -579.7138671875, "logps/real": -243.6083984375, "loss": 0.6432, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.450406074523926, "rewards/margins": 33.217533111572266, "rewards/real": 22.76712417602539, "step": 7250 }, { "epoch": 0.873015873015873, "grad_norm": 2613.4291397945603, "learning_rate": 7.055050774986638e-08, "logits/generated": -1.8321186304092407, "logits/real": -1.9275792837142944, "logps/generated": -502.3748474121094, "logps/real": -176.3705596923828, "loss": 0.5526, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.413863182067871, "rewards/margins": 28.312484741210938, "rewards/real": 16.898624420166016, "step": 7260 }, { "epoch": 0.8742183742183742, "grad_norm": 161.14984363106092, "learning_rate": 6.988241582041688e-08, "logits/generated": -1.9329860210418701, "logits/real": -1.9756215810775757, "logps/generated": -531.51953125, "logps/real": -193.31625366210938, "loss": 0.3499, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.719414710998535, "rewards/margins": 25.512435913085938, "rewards/real": 17.793020248413086, "step": 7270 }, { "epoch": 0.8754208754208754, "grad_norm": 7.54855355781832, "learning_rate": 6.921432389096739e-08, "logits/generated": -1.894049048423767, "logits/real": -1.9347572326660156, "logps/generated": -717.5528564453125, "logps/real": -291.5484313964844, "loss": 0.3911, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.796369552612305, "rewards/margins": 31.634693145751953, "rewards/real": 25.83832359313965, "step": 7280 }, { "epoch": 0.8766233766233766, "grad_norm": 4.585649150995327, "learning_rate": 6.854623196151791e-08, "logits/generated": -1.8500807285308838, "logits/real": -1.949877142906189, "logps/generated": -584.8020629882812, "logps/real": -191.82269287109375, "loss": 0.4532, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -14.867227554321289, "rewards/margins": 33.48359298706055, "rewards/real": 18.616365432739258, "step": 7290 }, { "epoch": 0.8778258778258778, "grad_norm": 163.66992629619799, "learning_rate": 6.78781400320684e-08, "logits/generated": -1.8061542510986328, "logits/real": -1.9497833251953125, "logps/generated": -601.3792724609375, "logps/real": -195.39517211914062, "loss": 0.4238, "rewards/accuracies": 0.925000011920929, "rewards/generated": -13.47065258026123, "rewards/margins": 31.295124053955078, "rewards/real": 17.82447052001953, "step": 7300 }, { "epoch": 0.879028379028379, "grad_norm": 874.8283124911345, "learning_rate": 6.721004810261892e-08, "logits/generated": -1.912121057510376, "logits/real": -2.057476282119751, "logps/generated": -625.40869140625, "logps/real": -265.7558288574219, "loss": 0.6074, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.613348960876465, "rewards/margins": 30.51369857788086, "rewards/real": 23.90035057067871, "step": 7310 }, { "epoch": 0.8802308802308803, "grad_norm": 192.46196828072283, "learning_rate": 6.654195617316941e-08, "logits/generated": -1.8323675394058228, "logits/real": -2.0080084800720215, "logps/generated": -778.146484375, "logps/real": -275.4057922363281, "loss": 0.3373, "rewards/accuracies": 1.0, "rewards/generated": -20.505359649658203, "rewards/margins": 46.92961883544922, "rewards/real": 26.42426109313965, "step": 7320 }, { "epoch": 0.8814333814333815, "grad_norm": 16.746936291128744, "learning_rate": 6.587386424371993e-08, "logits/generated": -1.8217490911483765, "logits/real": -1.9621708393096924, "logps/generated": -692.5020751953125, "logps/real": -257.6358642578125, "loss": 0.4704, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -15.981904983520508, "rewards/margins": 39.987735748291016, "rewards/real": 24.005828857421875, "step": 7330 }, { "epoch": 0.8826358826358827, "grad_norm": 4.272381400943576, "learning_rate": 6.520577231427044e-08, "logits/generated": -1.9419548511505127, "logits/real": -2.0278639793395996, "logps/generated": -532.7462158203125, "logps/real": -272.96588134765625, "loss": 0.4588, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.291725158691406, "rewards/margins": 27.473276138305664, "rewards/real": 23.181550979614258, "step": 7340 }, { "epoch": 0.8838383838383839, "grad_norm": 619.0048583791896, "learning_rate": 6.453768038482095e-08, "logits/generated": -1.829918622970581, "logits/real": -1.9880342483520508, "logps/generated": -636.5155029296875, "logps/real": -252.5404815673828, "loss": 0.5431, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.412005424499512, "rewards/margins": 34.718605041503906, "rewards/real": 22.306598663330078, "step": 7350 }, { "epoch": 0.8850408850408851, "grad_norm": 3.541707656089414, "learning_rate": 6.386958845537145e-08, "logits/generated": -1.8724817037582397, "logits/real": -1.9666862487792969, "logps/generated": -564.6494140625, "logps/real": -180.87533569335938, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/generated": -13.268491744995117, "rewards/margins": 29.77277183532715, "rewards/real": 16.5042781829834, "step": 7360 }, { "epoch": 0.8862433862433863, "grad_norm": 45.22433734634177, "learning_rate": 6.320149652592196e-08, "logits/generated": -1.825797438621521, "logits/real": -1.95993971824646, "logps/generated": -462.71942138671875, "logps/real": -206.0211944580078, "loss": 0.3297, "rewards/accuracies": 1.0, "rewards/generated": -6.641558647155762, "rewards/margins": 24.676057815551758, "rewards/real": 18.034500122070312, "step": 7370 }, { "epoch": 0.8874458874458875, "grad_norm": 3.263328103984659, "learning_rate": 6.253340459647247e-08, "logits/generated": -1.9000494480133057, "logits/real": -2.037306547164917, "logps/generated": -552.0850830078125, "logps/real": -226.86886596679688, "loss": 0.6875, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.1590576171875, "rewards/margins": 30.00253677368164, "rewards/real": 21.84347915649414, "step": 7380 }, { "epoch": 0.8886483886483887, "grad_norm": 1317.5802959768355, "learning_rate": 6.186531266702299e-08, "logits/generated": -1.9330966472625732, "logits/real": -1.9535415172576904, "logps/generated": -689.3192138671875, "logps/real": -240.9399871826172, "loss": 0.3784, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -17.82880401611328, "rewards/margins": 36.90077209472656, "rewards/real": 19.071969985961914, "step": 7390 }, { "epoch": 0.8898508898508899, "grad_norm": 291.43035513035505, "learning_rate": 6.119722073757349e-08, "logits/generated": -1.9878549575805664, "logits/real": -2.045534610748291, "logps/generated": -672.8856201171875, "logps/real": -267.2503356933594, "loss": 0.6009, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.401240348815918, "rewards/margins": 34.384620666503906, "rewards/real": 25.983383178710938, "step": 7400 }, { "epoch": 0.8910533910533911, "grad_norm": 2.316927988260796, "learning_rate": 6.0529128808124e-08, "logits/generated": -1.83743155002594, "logits/real": -1.9109361171722412, "logps/generated": -617.5614624023438, "logps/real": -201.71026611328125, "loss": 0.3709, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.857139587402344, "rewards/margins": 32.52408981323242, "rewards/real": 18.66695213317871, "step": 7410 }, { "epoch": 0.8922558922558923, "grad_norm": 28.72886021301842, "learning_rate": 5.98610368786745e-08, "logits/generated": -1.8223955631256104, "logits/real": -1.9261900186538696, "logps/generated": -563.7001953125, "logps/real": -244.0134735107422, "loss": 0.3791, "rewards/accuracies": 0.949999988079071, "rewards/generated": -8.443937301635742, "rewards/margins": 30.967792510986328, "rewards/real": 22.523853302001953, "step": 7420 }, { "epoch": 0.8934583934583935, "grad_norm": 10.349477439848279, "learning_rate": 5.919294494922501e-08, "logits/generated": -1.8162206411361694, "logits/real": -1.994471549987793, "logps/generated": -745.6394653320312, "logps/real": -283.2781066894531, "loss": 0.53, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -14.154869079589844, "rewards/margins": 39.75368881225586, "rewards/real": 25.598817825317383, "step": 7430 }, { "epoch": 0.8946608946608947, "grad_norm": 15.26625109062255, "learning_rate": 5.852485301977552e-08, "logits/generated": -1.8924280405044556, "logits/real": -1.9609463214874268, "logps/generated": -615.1890869140625, "logps/real": -264.7192687988281, "loss": 0.5444, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.55783224105835, "rewards/margins": 32.70935821533203, "rewards/real": 25.15152931213379, "step": 7440 }, { "epoch": 0.8958633958633959, "grad_norm": 3.263604527573491, "learning_rate": 5.785676109032603e-08, "logits/generated": -1.8957017660140991, "logits/real": -1.939389944076538, "logps/generated": -651.6373901367188, "logps/real": -247.1461944580078, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/generated": -14.57429313659668, "rewards/margins": 37.14098358154297, "rewards/real": 22.56669044494629, "step": 7450 }, { "epoch": 0.897065897065897, "grad_norm": 2.635712388647341, "learning_rate": 5.718866916087654e-08, "logits/generated": -1.8295533657073975, "logits/real": -1.9486722946166992, "logps/generated": -727.4974975585938, "logps/real": -242.79702758789062, "loss": 0.5288, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -21.53809356689453, "rewards/margins": 41.07743835449219, "rewards/real": 19.53934097290039, "step": 7460 }, { "epoch": 0.8982683982683982, "grad_norm": 177.86797665233868, "learning_rate": 5.6520577231427044e-08, "logits/generated": -1.800366759300232, "logits/real": -1.9663546085357666, "logps/generated": -892.2824096679688, "logps/real": -278.8819274902344, "loss": 0.4458, "rewards/accuracies": 1.0, "rewards/generated": -21.799272537231445, "rewards/margins": 46.44826889038086, "rewards/real": 24.648998260498047, "step": 7470 }, { "epoch": 0.8994708994708994, "grad_norm": 10.941101545300283, "learning_rate": 5.585248530197755e-08, "logits/generated": -1.7807782888412476, "logits/real": -1.8585500717163086, "logps/generated": -591.650146484375, "logps/real": -203.64344787597656, "loss": 0.4428, "rewards/accuracies": 1.0, "rewards/generated": -16.4709415435791, "rewards/margins": 34.7379150390625, "rewards/real": 18.26697540283203, "step": 7480 }, { "epoch": 0.9006734006734006, "grad_norm": 3.097175905320357, "learning_rate": 5.518439337252806e-08, "logits/generated": -1.7874730825424194, "logits/real": -2.030160903930664, "logps/generated": -834.2674560546875, "logps/real": -270.4729919433594, "loss": 0.4874, "rewards/accuracies": 1.0, "rewards/generated": -25.003660202026367, "rewards/margins": 50.28822326660156, "rewards/real": 25.284561157226562, "step": 7490 }, { "epoch": 0.9018759018759018, "grad_norm": 224.99376236215403, "learning_rate": 5.451630144307857e-08, "logits/generated": -1.7494442462921143, "logits/real": -2.023285150527954, "logps/generated": -550.2077026367188, "logps/real": -246.26657104492188, "loss": 0.4641, "rewards/accuracies": 0.925000011920929, "rewards/generated": -9.512105941772461, "rewards/margins": 34.51152420043945, "rewards/real": 24.999414443969727, "step": 7500 }, { "epoch": 0.903078403078403, "grad_norm": 4.694829544789701, "learning_rate": 5.384820951362908e-08, "logits/generated": -1.9265758991241455, "logits/real": -1.9717557430267334, "logps/generated": -588.18994140625, "logps/real": -232.59988403320312, "loss": 0.3716, "rewards/accuracies": 0.949999988079071, "rewards/generated": -11.967580795288086, "rewards/margins": 32.77628707885742, "rewards/real": 20.808704376220703, "step": 7510 }, { "epoch": 0.9042809042809042, "grad_norm": 3.5710623458466215, "learning_rate": 5.3180117584179583e-08, "logits/generated": -1.8794788122177124, "logits/real": -1.8858184814453125, "logps/generated": -442.54931640625, "logps/real": -209.28738403320312, "loss": 0.5693, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.389479637145996, "rewards/margins": 21.84439468383789, "rewards/real": 16.454912185668945, "step": 7520 }, { "epoch": 0.9054834054834054, "grad_norm": 6.677030844505423, "learning_rate": 5.251202565473009e-08, "logits/generated": -1.841382384300232, "logits/real": -1.9387576580047607, "logps/generated": -678.8121337890625, "logps/real": -250.5919952392578, "loss": 0.4173, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.574440002441406, "rewards/margins": 35.772891998291016, "rewards/real": 22.19845199584961, "step": 7530 }, { "epoch": 0.9066859066859067, "grad_norm": 6.179269175547231, "learning_rate": 5.1843933725280596e-08, "logits/generated": -1.8453378677368164, "logits/real": -1.8640903234481812, "logps/generated": -596.0718994140625, "logps/real": -291.5093688964844, "loss": 0.3728, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.117281913757324, "rewards/margins": 32.634605407714844, "rewards/real": 23.517322540283203, "step": 7540 }, { "epoch": 0.9078884078884079, "grad_norm": 107.4651123020714, "learning_rate": 5.117584179583111e-08, "logits/generated": -1.8386691808700562, "logits/real": -1.8639602661132812, "logps/generated": -549.8564453125, "logps/real": -174.96145629882812, "loss": 0.3114, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.235156059265137, "rewards/margins": 30.93783950805664, "rewards/real": 17.702680587768555, "step": 7550 }, { "epoch": 0.9090909090909091, "grad_norm": 2.703509066798271, "learning_rate": 5.0507749866381616e-08, "logits/generated": -1.8066171407699585, "logits/real": -1.8202998638153076, "logps/generated": -568.2048950195312, "logps/real": -237.21194458007812, "loss": 0.3881, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.227745056152344, "rewards/margins": 29.165557861328125, "rewards/real": 21.93781089782715, "step": 7560 }, { "epoch": 0.9102934102934103, "grad_norm": 403.3692528199429, "learning_rate": 4.983965793693212e-08, "logits/generated": -1.8202968835830688, "logits/real": -1.9496692419052124, "logps/generated": -574.8258056640625, "logps/real": -235.11007690429688, "loss": 0.4766, "rewards/accuracies": 0.875, "rewards/generated": -9.05565071105957, "rewards/margins": 29.5675106048584, "rewards/real": 20.511859893798828, "step": 7570 }, { "epoch": 0.9114959114959115, "grad_norm": 860.5112240891744, "learning_rate": 4.917156600748263e-08, "logits/generated": -1.8284162282943726, "logits/real": -1.9748990535736084, "logps/generated": -445.96661376953125, "logps/real": -191.8043975830078, "loss": 0.3949, "rewards/accuracies": 0.949999988079071, "rewards/generated": -11.082816123962402, "rewards/margins": 26.514028549194336, "rewards/real": 15.4312105178833, "step": 7580 }, { "epoch": 0.9126984126984127, "grad_norm": 8.460906879650542, "learning_rate": 4.8503474078033135e-08, "logits/generated": -1.8548015356063843, "logits/real": -1.9636313915252686, "logps/generated": -608.8326416015625, "logps/real": -298.0274963378906, "loss": 0.4868, "rewards/accuracies": 1.0, "rewards/generated": -5.996362209320068, "rewards/margins": 34.14162063598633, "rewards/real": 28.1452579498291, "step": 7590 }, { "epoch": 0.9139009139009139, "grad_norm": 67.33088596415548, "learning_rate": 4.783538214858365e-08, "logits/generated": -1.8178812265396118, "logits/real": -1.9490362405776978, "logps/generated": -563.5987548828125, "logps/real": -217.73654174804688, "loss": 0.3139, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.452391624450684, "rewards/margins": 32.99445724487305, "rewards/real": 21.542064666748047, "step": 7600 }, { "epoch": 0.9151034151034151, "grad_norm": 19.193494222713156, "learning_rate": 4.7167290219134155e-08, "logits/generated": -1.8279447555541992, "logits/real": -1.9227333068847656, "logps/generated": -546.6016845703125, "logps/real": -231.05239868164062, "loss": 0.3154, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.202812194824219, "rewards/margins": 26.593332290649414, "rewards/real": 19.390522003173828, "step": 7610 }, { "epoch": 0.9163059163059163, "grad_norm": 289.4963148912311, "learning_rate": 4.649919828968466e-08, "logits/generated": -1.9175913333892822, "logits/real": -1.9148343801498413, "logps/generated": -547.1802978515625, "logps/real": -256.5948791503906, "loss": 0.4737, "rewards/accuracies": 0.949999988079071, "rewards/generated": -5.248676776885986, "rewards/margins": 27.513641357421875, "rewards/real": 22.264965057373047, "step": 7620 }, { "epoch": 0.9175084175084175, "grad_norm": 159.8466242647042, "learning_rate": 4.583110636023517e-08, "logits/generated": -1.8012142181396484, "logits/real": -1.998228669166565, "logps/generated": -684.0794677734375, "logps/real": -250.2084197998047, "loss": 0.3076, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.82398509979248, "rewards/margins": 35.56842803955078, "rewards/real": 21.744447708129883, "step": 7630 }, { "epoch": 0.9187109187109187, "grad_norm": 133.1085974341585, "learning_rate": 4.5163014430785674e-08, "logits/generated": -1.816612958908081, "logits/real": -1.9270000457763672, "logps/generated": -477.7117614746094, "logps/real": -221.5418701171875, "loss": 0.3852, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -6.828951835632324, "rewards/margins": 27.315725326538086, "rewards/real": 20.486770629882812, "step": 7640 }, { "epoch": 0.9199134199134199, "grad_norm": 121.06992542134766, "learning_rate": 4.449492250133619e-08, "logits/generated": -1.9036048650741577, "logits/real": -1.869901418685913, "logps/generated": -550.589111328125, "logps/real": -232.84207153320312, "loss": 0.6221, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.372431755065918, "rewards/margins": 27.593318939208984, "rewards/real": 22.220890045166016, "step": 7650 }, { "epoch": 0.9211159211159211, "grad_norm": 116.92142166137279, "learning_rate": 4.3826830571886693e-08, "logits/generated": -1.9088249206542969, "logits/real": -1.9617713689804077, "logps/generated": -629.1837768554688, "logps/real": -232.0994415283203, "loss": 0.6329, "rewards/accuracies": 0.949999988079071, "rewards/generated": -11.240312576293945, "rewards/margins": 29.797321319580078, "rewards/real": 18.557008743286133, "step": 7660 }, { "epoch": 0.9223184223184223, "grad_norm": 591.6891703932573, "learning_rate": 4.31587386424372e-08, "logits/generated": -1.738337516784668, "logits/real": -1.9371941089630127, "logps/generated": -666.4129638671875, "logps/real": -223.22177124023438, "loss": 0.3358, "rewards/accuracies": 0.949999988079071, "rewards/generated": -16.73686981201172, "rewards/margins": 35.94083023071289, "rewards/real": 19.203960418701172, "step": 7670 }, { "epoch": 0.9235209235209235, "grad_norm": 7.646952696003572, "learning_rate": 4.2490646712987706e-08, "logits/generated": -1.9599723815917969, "logits/real": -1.9543098211288452, "logps/generated": -584.2188720703125, "logps/real": -254.26943969726562, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/generated": -4.110929489135742, "rewards/margins": 26.450088500976562, "rewards/real": 22.339157104492188, "step": 7680 }, { "epoch": 0.9247234247234247, "grad_norm": 153.4210062718791, "learning_rate": 4.182255478353822e-08, "logits/generated": -1.7582743167877197, "logits/real": -1.9109766483306885, "logps/generated": -707.858154296875, "logps/real": -196.25381469726562, "loss": 0.4507, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -20.299272537231445, "rewards/margins": 39.022865295410156, "rewards/real": 18.723594665527344, "step": 7690 }, { "epoch": 0.9259259259259259, "grad_norm": 246.81391093438802, "learning_rate": 4.115446285408872e-08, "logits/generated": -1.8733514547348022, "logits/real": -1.9827098846435547, "logps/generated": -631.9744262695312, "logps/real": -274.9596252441406, "loss": 0.5578, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.418924331665039, "rewards/margins": 37.10205841064453, "rewards/real": 26.683135986328125, "step": 7700 }, { "epoch": 0.9271284271284271, "grad_norm": 5.8986953364246135, "learning_rate": 4.0486370924639226e-08, "logits/generated": -1.851654291152954, "logits/real": -1.9533793926239014, "logps/generated": -510.91668701171875, "logps/real": -226.32955932617188, "loss": 0.5021, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.667630195617676, "rewards/margins": 28.51548194885254, "rewards/real": 20.847850799560547, "step": 7710 }, { "epoch": 0.9283309283309283, "grad_norm": 620.9371507413915, "learning_rate": 3.981827899518973e-08, "logits/generated": -1.8319000005722046, "logits/real": -1.921460509300232, "logps/generated": -625.7000122070312, "logps/real": -281.3758544921875, "loss": 0.3173, "rewards/accuracies": 1.0, "rewards/generated": -9.334914207458496, "rewards/margins": 33.2500114440918, "rewards/real": 23.91509246826172, "step": 7720 }, { "epoch": 0.9295334295334295, "grad_norm": 4.111441403423276, "learning_rate": 3.915018706574024e-08, "logits/generated": -1.820752739906311, "logits/real": -1.9629709720611572, "logps/generated": -457.3055725097656, "logps/real": -180.87144470214844, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/generated": -13.024332046508789, "rewards/margins": 28.568084716796875, "rewards/real": 15.543754577636719, "step": 7730 }, { "epoch": 0.9307359307359307, "grad_norm": 24.938895839700333, "learning_rate": 3.848209513629075e-08, "logits/generated": -1.8357629776000977, "logits/real": -1.8417953252792358, "logps/generated": -658.307373046875, "logps/real": -215.5572509765625, "loss": 0.2889, "rewards/accuracies": 0.949999988079071, "rewards/generated": -16.049457550048828, "rewards/margins": 34.50643539428711, "rewards/real": 18.456981658935547, "step": 7740 }, { "epoch": 0.931938431938432, "grad_norm": 482.8381043273678, "learning_rate": 3.781400320684126e-08, "logits/generated": -1.8643558025360107, "logits/real": -1.970956802368164, "logps/generated": -572.5831298828125, "logps/real": -224.4773406982422, "loss": 0.4011, "rewards/accuracies": 0.949999988079071, "rewards/generated": -12.906872749328613, "rewards/margins": 34.622318267822266, "rewards/real": 21.715444564819336, "step": 7750 }, { "epoch": 0.9331409331409332, "grad_norm": 7.937441822014419, "learning_rate": 3.7145911277391765e-08, "logits/generated": -1.7950923442840576, "logits/real": -1.9632022380828857, "logps/generated": -577.6339721679688, "logps/real": -219.2605743408203, "loss": 0.3931, "rewards/accuracies": 0.949999988079071, "rewards/generated": -13.48230266571045, "rewards/margins": 34.0289192199707, "rewards/real": 20.546619415283203, "step": 7760 }, { "epoch": 0.9343434343434344, "grad_norm": 50.0230999113016, "learning_rate": 3.647781934794227e-08, "logits/generated": -1.9379491806030273, "logits/real": -1.9466701745986938, "logps/generated": -499.440673828125, "logps/real": -211.3698272705078, "loss": 0.6335, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.978289604187012, "rewards/margins": 26.517309188842773, "rewards/real": 18.539020538330078, "step": 7770 }, { "epoch": 0.9355459355459356, "grad_norm": 1179.5098864731124, "learning_rate": 3.580972741849278e-08, "logits/generated": -1.8757632970809937, "logits/real": -1.9674293994903564, "logps/generated": -588.0816650390625, "logps/real": -160.77267456054688, "loss": 0.3409, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -13.619443893432617, "rewards/margins": 29.020843505859375, "rewards/real": 15.401400566101074, "step": 7780 }, { "epoch": 0.9367484367484368, "grad_norm": 59.70100347128945, "learning_rate": 3.514163548904329e-08, "logits/generated": -1.8476343154907227, "logits/real": -1.9887018203735352, "logps/generated": -655.5538330078125, "logps/real": -200.73013305664062, "loss": 0.4802, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -15.746861457824707, "rewards/margins": 33.513301849365234, "rewards/real": 17.76644515991211, "step": 7790 }, { "epoch": 0.937950937950938, "grad_norm": 471.2838623781861, "learning_rate": 3.44735435595938e-08, "logits/generated": -1.8844900131225586, "logits/real": -1.8739242553710938, "logps/generated": -547.0294799804688, "logps/real": -203.09417724609375, "loss": 0.5798, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.673928260803223, "rewards/margins": 26.7031307220459, "rewards/real": 19.02920150756836, "step": 7800 }, { "epoch": 0.9391534391534392, "grad_norm": 562.4521467479547, "learning_rate": 3.3805451630144303e-08, "logits/generated": -1.9220783710479736, "logits/real": -2.022700548171997, "logps/generated": -571.9171752929688, "logps/real": -246.066162109375, "loss": 0.7754, "rewards/accuracies": 0.875, "rewards/generated": -4.068185329437256, "rewards/margins": 24.824108123779297, "rewards/real": 20.75592041015625, "step": 7810 }, { "epoch": 0.9403559403559404, "grad_norm": 13.578179882188723, "learning_rate": 3.313735970069481e-08, "logits/generated": -1.7413349151611328, "logits/real": -1.9711958169937134, "logps/generated": -750.6766967773438, "logps/real": -277.9291076660156, "loss": 0.4597, "rewards/accuracies": 0.925000011920929, "rewards/generated": -19.52254295349121, "rewards/margins": 42.818477630615234, "rewards/real": 23.29593276977539, "step": 7820 }, { "epoch": 0.9415584415584416, "grad_norm": 42.735858968267095, "learning_rate": 3.2469267771245316e-08, "logits/generated": -1.8657029867172241, "logits/real": -1.9247665405273438, "logps/generated": -535.1868286132812, "logps/real": -262.0387878417969, "loss": 0.5369, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -6.889112949371338, "rewards/margins": 27.30415916442871, "rewards/real": 20.415042877197266, "step": 7830 }, { "epoch": 0.9427609427609428, "grad_norm": 6.034575029065627, "learning_rate": 3.180117584179583e-08, "logits/generated": -1.8643524646759033, "logits/real": -1.9991130828857422, "logps/generated": -705.0940551757812, "logps/real": -234.36495971679688, "loss": 0.4216, "rewards/accuracies": 0.949999988079071, "rewards/generated": -12.718157768249512, "rewards/margins": 35.234375, "rewards/real": 22.516218185424805, "step": 7840 }, { "epoch": 0.943963443963444, "grad_norm": 6.797818967782297, "learning_rate": 3.1133083912346336e-08, "logits/generated": -1.8352415561676025, "logits/real": -1.9554344415664673, "logps/generated": -634.4024658203125, "logps/real": -219.031005859375, "loss": 0.6585, "rewards/accuracies": 0.949999988079071, "rewards/generated": -16.06719970703125, "rewards/margins": 35.9827766418457, "rewards/real": 19.915573120117188, "step": 7850 }, { "epoch": 0.9451659451659452, "grad_norm": 117.8575004614085, "learning_rate": 3.046499198289685e-08, "logits/generated": -1.776283621788025, "logits/real": -2.061842203140259, "logps/generated": -661.2219848632812, "logps/real": -201.66799926757812, "loss": 0.3905, "rewards/accuracies": 0.949999988079071, "rewards/generated": -21.539260864257812, "rewards/margins": 38.302513122558594, "rewards/real": 16.763261795043945, "step": 7860 }, { "epoch": 0.9463684463684464, "grad_norm": 47.35907395759234, "learning_rate": 2.9796900053447355e-08, "logits/generated": -1.8681166172027588, "logits/real": -2.0195064544677734, "logps/generated": -605.47509765625, "logps/real": -249.17617797851562, "loss": 0.4233, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.160433769226074, "rewards/margins": 33.22639846801758, "rewards/real": 24.065961837768555, "step": 7870 }, { "epoch": 0.9475709475709476, "grad_norm": 484.60036455117483, "learning_rate": 2.9128808123997862e-08, "logits/generated": -1.8744513988494873, "logits/real": -1.9709593057632446, "logps/generated": -558.8253173828125, "logps/real": -219.21353149414062, "loss": 0.3166, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -9.859278678894043, "rewards/margins": 30.427806854248047, "rewards/real": 20.568531036376953, "step": 7880 }, { "epoch": 0.9487734487734488, "grad_norm": 1459.3056570761464, "learning_rate": 2.8460716194548368e-08, "logits/generated": -1.9523365497589111, "logits/real": -1.997300386428833, "logps/generated": -650.0694580078125, "logps/real": -231.16244506835938, "loss": 0.3053, "rewards/accuracies": 1.0, "rewards/generated": -16.40847396850586, "rewards/margins": 35.732765197753906, "rewards/real": 19.324291229248047, "step": 7890 }, { "epoch": 0.94997594997595, "grad_norm": 17.23400580291654, "learning_rate": 2.7792624265098878e-08, "logits/generated": -1.8673444986343384, "logits/real": -2.0318591594696045, "logps/generated": -631.63916015625, "logps/real": -232.9110870361328, "loss": 0.3388, "rewards/accuracies": 1.0, "rewards/generated": -13.709556579589844, "rewards/margins": 36.20250701904297, "rewards/real": 22.492950439453125, "step": 7900 }, { "epoch": 0.9511784511784511, "grad_norm": 237.08902331170333, "learning_rate": 2.7124532335649385e-08, "logits/generated": -1.8248322010040283, "logits/real": -1.9297415018081665, "logps/generated": -616.5086059570312, "logps/real": -268.40985107421875, "loss": 0.6605, "rewards/accuracies": 1.0, "rewards/generated": -13.933581352233887, "rewards/margins": 35.22268295288086, "rewards/real": 21.289104461669922, "step": 7910 }, { "epoch": 0.9523809523809523, "grad_norm": 131.26288463473537, "learning_rate": 2.6456440406199894e-08, "logits/generated": -1.9274238348007202, "logits/real": -2.008357524871826, "logps/generated": -648.6353759765625, "logps/real": -253.4458770751953, "loss": 0.5342, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.945533752441406, "rewards/margins": 32.83995819091797, "rewards/real": 22.894426345825195, "step": 7920 }, { "epoch": 0.9535834535834535, "grad_norm": 96.54038153175982, "learning_rate": 2.57883484767504e-08, "logits/generated": -1.8905365467071533, "logits/real": -1.9958155155181885, "logps/generated": -624.479736328125, "logps/real": -270.9004821777344, "loss": 0.4429, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -10.002233505249023, "rewards/margins": 35.480838775634766, "rewards/real": 25.47860336303711, "step": 7930 }, { "epoch": 0.9547859547859547, "grad_norm": 44.17128416741591, "learning_rate": 2.512025654730091e-08, "logits/generated": -1.9015429019927979, "logits/real": -2.0088698863983154, "logps/generated": -656.9166259765625, "logps/real": -232.25119018554688, "loss": 0.3935, "rewards/accuracies": 1.0, "rewards/generated": -9.713232040405273, "rewards/margins": 33.862403869628906, "rewards/real": 24.149173736572266, "step": 7940 }, { "epoch": 0.9559884559884559, "grad_norm": 3.537761969196036, "learning_rate": 2.4452164617851417e-08, "logits/generated": -1.862546682357788, "logits/real": -1.9948028326034546, "logps/generated": -537.6753540039062, "logps/real": -192.64080810546875, "loss": 0.3979, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -11.830598831176758, "rewards/margins": 29.180246353149414, "rewards/real": 17.349647521972656, "step": 7950 }, { "epoch": 0.9571909571909571, "grad_norm": 1562.5980622278403, "learning_rate": 2.3784072688401923e-08, "logits/generated": -1.9070017337799072, "logits/real": -1.9483369588851929, "logps/generated": -572.1051025390625, "logps/real": -230.68423461914062, "loss": 0.6042, "rewards/accuracies": 0.925000011920929, "rewards/generated": -9.48015022277832, "rewards/margins": 28.934789657592773, "rewards/real": 19.454639434814453, "step": 7960 }, { "epoch": 0.9583934583934584, "grad_norm": 204.34010807600555, "learning_rate": 2.3115980758952433e-08, "logits/generated": -1.9353723526000977, "logits/real": -2.026029109954834, "logps/generated": -581.7493286132812, "logps/real": -226.6553955078125, "loss": 0.5636, "rewards/accuracies": 0.949999988079071, "rewards/generated": -9.081565856933594, "rewards/margins": 29.828304290771484, "rewards/real": 20.746736526489258, "step": 7970 }, { "epoch": 0.9595959595959596, "grad_norm": 37.015019863286604, "learning_rate": 2.244788882950294e-08, "logits/generated": -1.8648122549057007, "logits/real": -1.9446719884872437, "logps/generated": -450.04541015625, "logps/real": -186.21939086914062, "loss": 0.473, "rewards/accuracies": 0.949999988079071, "rewards/generated": -7.291723728179932, "rewards/margins": 24.546016693115234, "rewards/real": 17.25429344177246, "step": 7980 }, { "epoch": 0.9607984607984608, "grad_norm": 9.915964783029532, "learning_rate": 2.177979690005345e-08, "logits/generated": -1.8747150897979736, "logits/real": -1.9883369207382202, "logps/generated": -540.0999755859375, "logps/real": -238.61148071289062, "loss": 0.4058, "rewards/accuracies": 1.0, "rewards/generated": -6.2435173988342285, "rewards/margins": 29.89438247680664, "rewards/real": 23.650863647460938, "step": 7990 }, { "epoch": 0.962000962000962, "grad_norm": 284.84969341031046, "learning_rate": 2.1111704970603956e-08, "logits/generated": -1.9531828165054321, "logits/real": -1.9930452108383179, "logps/generated": -644.8855590820312, "logps/real": -250.3666229248047, "loss": 0.5954, "rewards/accuracies": 0.925000011920929, "rewards/generated": -11.983048439025879, "rewards/margins": 34.054222106933594, "rewards/real": 22.071170806884766, "step": 8000 }, { "epoch": 0.962000962000962, "eval_logits/generated": -1.948014259338379, "eval_logits/real": -2.000520706176758, "eval_logps/generated": -562.68896484375, "eval_logps/real": -250.2119903564453, "eval_loss": 0.39723077416419983, "eval_rewards/accuracies": 0.9642857313156128, "eval_rewards/generated": -7.290745258331299, "eval_rewards/margins": 29.396650314331055, "eval_rewards/real": 22.105905532836914, "eval_runtime": 158.3174, "eval_samples_per_second": 6.316, "eval_steps_per_second": 0.531, "step": 8000 }, { "epoch": 0.9632034632034632, "grad_norm": 6.080219689125808, "learning_rate": 2.044361304115446e-08, "logits/generated": -1.8047806024551392, "logits/real": -1.9242351055145264, "logps/generated": -474.2330017089844, "logps/real": -185.81861877441406, "loss": 0.5476, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -10.688553810119629, "rewards/margins": 27.152202606201172, "rewards/real": 16.46364974975586, "step": 8010 }, { "epoch": 0.9644059644059644, "grad_norm": 4.506819732896348, "learning_rate": 1.977552111170497e-08, "logits/generated": -1.803545355796814, "logits/real": -1.9313043355941772, "logps/generated": -510.97833251953125, "logps/real": -190.4078826904297, "loss": 0.2868, "rewards/accuracies": 1.0, "rewards/generated": -14.387975692749023, "rewards/margins": 33.485694885253906, "rewards/real": 19.097719192504883, "step": 8020 }, { "epoch": 0.9656084656084656, "grad_norm": 175.28657069353605, "learning_rate": 1.9107429182255475e-08, "logits/generated": -1.907098412513733, "logits/real": -2.042231798171997, "logps/generated": -617.7903442382812, "logps/real": -253.0595703125, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/generated": -14.982889175415039, "rewards/margins": 37.402259826660156, "rewards/real": 22.419368743896484, "step": 8030 }, { "epoch": 0.9668109668109668, "grad_norm": 4.54301889959483, "learning_rate": 1.8439337252805985e-08, "logits/generated": -1.8733489513397217, "logits/real": -1.9778435230255127, "logps/generated": -534.8236083984375, "logps/real": -225.927490234375, "loss": 0.3097, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -7.314043998718262, "rewards/margins": 27.533193588256836, "rewards/real": 20.219152450561523, "step": 8040 }, { "epoch": 0.968013468013468, "grad_norm": 3.303525626665763, "learning_rate": 1.777124532335649e-08, "logits/generated": -1.9220225811004639, "logits/real": -2.0492451190948486, "logps/generated": -711.3641357421875, "logps/real": -279.6134948730469, "loss": 0.8231, "rewards/accuracies": 0.949999988079071, "rewards/generated": -12.18855094909668, "rewards/margins": 38.05695343017578, "rewards/real": 25.8684024810791, "step": 8050 }, { "epoch": 0.9692159692159692, "grad_norm": 122.83276724360225, "learning_rate": 1.7103153393906998e-08, "logits/generated": -1.8907839059829712, "logits/real": -1.9493697881698608, "logps/generated": -463.2969665527344, "logps/real": -166.5428924560547, "loss": 0.3734, "rewards/accuracies": 0.925000011920929, "rewards/generated": -8.571676254272461, "rewards/margins": 24.737638473510742, "rewards/real": 16.16596221923828, "step": 8060 }, { "epoch": 0.9704184704184704, "grad_norm": 15.856534381934436, "learning_rate": 1.6435061464457508e-08, "logits/generated": -1.8526527881622314, "logits/real": -1.9400050640106201, "logps/generated": -560.2443237304688, "logps/real": -285.6490173339844, "loss": 0.4117, "rewards/accuracies": 1.0, "rewards/generated": -7.392428398132324, "rewards/margins": 32.72213363647461, "rewards/real": 25.3297061920166, "step": 8070 }, { "epoch": 0.9716209716209716, "grad_norm": 925.17430026782, "learning_rate": 1.5766969535008014e-08, "logits/generated": -1.8764747381210327, "logits/real": -1.849029541015625, "logps/generated": -541.9317626953125, "logps/real": -256.1459655761719, "loss": 0.4275, "rewards/accuracies": 1.0, "rewards/generated": -8.22850513458252, "rewards/margins": 30.15728759765625, "rewards/real": 21.928783416748047, "step": 8080 }, { "epoch": 0.9728234728234728, "grad_norm": 132.89207443382926, "learning_rate": 1.5098877605558524e-08, "logits/generated": -1.89582097530365, "logits/real": -1.959554672241211, "logps/generated": -586.2793579101562, "logps/real": -245.7548370361328, "loss": 0.7542, "rewards/accuracies": 0.949999988079071, "rewards/generated": -10.676665306091309, "rewards/margins": 32.07061004638672, "rewards/real": 21.39394760131836, "step": 8090 }, { "epoch": 0.974025974025974, "grad_norm": 27.941911449310695, "learning_rate": 1.4430785676109032e-08, "logits/generated": -1.8876546621322632, "logits/real": -1.995438814163208, "logps/generated": -578.7598876953125, "logps/real": -333.9859619140625, "loss": 0.5529, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -1.251502275466919, "rewards/margins": 31.09954261779785, "rewards/real": 29.84804344177246, "step": 8100 }, { "epoch": 0.9752284752284752, "grad_norm": 235.98287723233014, "learning_rate": 1.376269374665954e-08, "logits/generated": -1.9337660074234009, "logits/real": -2.0056357383728027, "logps/generated": -513.9544677734375, "logps/real": -266.8468322753906, "loss": 0.4692, "rewards/accuracies": 1.0, "rewards/generated": -6.389153480529785, "rewards/margins": 30.21062660217285, "rewards/real": 23.821468353271484, "step": 8110 }, { "epoch": 0.9764309764309764, "grad_norm": 4.208306008543635, "learning_rate": 1.3094601817210048e-08, "logits/generated": -1.8793519735336304, "logits/real": -1.9217697381973267, "logps/generated": -601.8344116210938, "logps/real": -319.68206787109375, "loss": 0.5761, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -3.4111621379852295, "rewards/margins": 29.1977481842041, "rewards/real": 25.786584854125977, "step": 8120 }, { "epoch": 0.9776334776334776, "grad_norm": 231.02284426195752, "learning_rate": 1.2426509887760556e-08, "logits/generated": -1.8332197666168213, "logits/real": -1.977024793624878, "logps/generated": -442.82916259765625, "logps/real": -210.1693115234375, "loss": 0.5707, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.085574150085449, "rewards/margins": 27.40066146850586, "rewards/real": 20.315088272094727, "step": 8130 }, { "epoch": 0.9788359788359788, "grad_norm": 153.1950357875449, "learning_rate": 1.1758417958311064e-08, "logits/generated": -1.8746535778045654, "logits/real": -1.960744857788086, "logps/generated": -535.58642578125, "logps/real": -210.9016876220703, "loss": 0.5227, "rewards/accuracies": 0.925000011920929, "rewards/generated": -5.577829837799072, "rewards/margins": 25.187856674194336, "rewards/real": 19.610027313232422, "step": 8140 }, { "epoch": 0.98003848003848, "grad_norm": 189.62900666295315, "learning_rate": 1.1090326028861572e-08, "logits/generated": -1.899243950843811, "logits/real": -1.9381084442138672, "logps/generated": -654.8983154296875, "logps/real": -235.5189971923828, "loss": 0.6281, "rewards/accuracies": 1.0, "rewards/generated": -11.314410209655762, "rewards/margins": 32.45915985107422, "rewards/real": 21.144752502441406, "step": 8150 }, { "epoch": 0.9812409812409812, "grad_norm": 19.043560298229426, "learning_rate": 1.0422234099412079e-08, "logits/generated": -1.8975727558135986, "logits/real": -2.0573973655700684, "logps/generated": -684.2650146484375, "logps/real": -272.1838684082031, "loss": 0.4934, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.558279991149902, "rewards/margins": 32.49468994140625, "rewards/real": 25.936412811279297, "step": 8160 }, { "epoch": 0.9824434824434825, "grad_norm": 255.80157344973264, "learning_rate": 9.754142169962585e-09, "logits/generated": -1.868058204650879, "logits/real": -1.9154685735702515, "logps/generated": -542.9749755859375, "logps/real": -217.7202911376953, "loss": 0.7706, "rewards/accuracies": 0.949999988079071, "rewards/generated": -6.893022060394287, "rewards/margins": 25.462129592895508, "rewards/real": 18.569110870361328, "step": 8170 }, { "epoch": 0.9836459836459837, "grad_norm": 729.4401918580012, "learning_rate": 9.086050240513093e-09, "logits/generated": -1.9020189046859741, "logits/real": -1.948012351989746, "logps/generated": -636.56298828125, "logps/real": -270.43939208984375, "loss": 0.4966, "rewards/accuracies": 0.925000011920929, "rewards/generated": -10.260741233825684, "rewards/margins": 33.59025192260742, "rewards/real": 23.329511642456055, "step": 8180 }, { "epoch": 0.9848484848484849, "grad_norm": 494.7696205202936, "learning_rate": 8.417958311063602e-09, "logits/generated": -1.8801816701889038, "logits/real": -1.9777199029922485, "logps/generated": -511.88519287109375, "logps/real": -207.6827850341797, "loss": 0.5856, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -6.680776119232178, "rewards/margins": 26.44734764099121, "rewards/real": 19.766572952270508, "step": 8190 }, { "epoch": 0.9860509860509861, "grad_norm": 524.9149081458041, "learning_rate": 7.74986638161411e-09, "logits/generated": -1.695943832397461, "logits/real": -1.8408949375152588, "logps/generated": -603.311767578125, "logps/real": -169.5996856689453, "loss": 0.375, "rewards/accuracies": 0.949999988079071, "rewards/generated": -17.913097381591797, "rewards/margins": 33.64246368408203, "rewards/real": 15.729368209838867, "step": 8200 }, { "epoch": 0.9872534872534873, "grad_norm": 777.969282884708, "learning_rate": 7.081774452164618e-09, "logits/generated": -1.8220351934432983, "logits/real": -1.9063689708709717, "logps/generated": -680.7005004882812, "logps/real": -282.27911376953125, "loss": 0.4907, "rewards/accuracies": 0.949999988079071, "rewards/generated": -12.420315742492676, "rewards/margins": 33.469173431396484, "rewards/real": 21.04886245727539, "step": 8210 }, { "epoch": 0.9884559884559885, "grad_norm": 150.28547000576296, "learning_rate": 6.413682522715126e-09, "logits/generated": -1.8631532192230225, "logits/real": -2.0302681922912598, "logps/generated": -725.7526245117188, "logps/real": -301.81536865234375, "loss": 0.5798, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -12.97132396697998, "rewards/margins": 36.01592254638672, "rewards/real": 23.04459571838379, "step": 8220 }, { "epoch": 0.9896584896584897, "grad_norm": 7.4342139328404055, "learning_rate": 5.745590593265633e-09, "logits/generated": -1.8681774139404297, "logits/real": -1.9475829601287842, "logps/generated": -545.0968017578125, "logps/real": -263.8797912597656, "loss": 0.4305, "rewards/accuracies": 0.949999988079071, "rewards/generated": -4.504101753234863, "rewards/margins": 27.534839630126953, "rewards/real": 23.030738830566406, "step": 8230 }, { "epoch": 0.9908609908609909, "grad_norm": 5.857174381469028, "learning_rate": 5.07749866381614e-09, "logits/generated": -1.9394757747650146, "logits/real": -2.0266969203948975, "logps/generated": -588.8060302734375, "logps/real": -285.57464599609375, "loss": 0.5886, "rewards/accuracies": 0.949999988079071, "rewards/generated": -3.924767255783081, "rewards/margins": 28.511524200439453, "rewards/real": 24.58675193786621, "step": 8240 }, { "epoch": 0.9920634920634921, "grad_norm": 998.4137066643937, "learning_rate": 4.4094067343666485e-09, "logits/generated": -1.8733768463134766, "logits/real": -1.9439998865127563, "logps/generated": -653.6117553710938, "logps/real": -255.6890106201172, "loss": 0.5714, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -8.273148536682129, "rewards/margins": 30.489307403564453, "rewards/real": 22.216154098510742, "step": 8250 }, { "epoch": 0.9932659932659933, "grad_norm": 92.09120705704146, "learning_rate": 3.741314804917157e-09, "logits/generated": -1.8796441555023193, "logits/real": -1.9806550741195679, "logps/generated": -520.3734130859375, "logps/real": -201.2113037109375, "loss": 0.5388, "rewards/accuracies": 0.925000011920929, "rewards/generated": -7.409041404724121, "rewards/margins": 25.22804832458496, "rewards/real": 17.819007873535156, "step": 8260 }, { "epoch": 0.9944684944684945, "grad_norm": 16.67388472375853, "learning_rate": 3.0732228754676643e-09, "logits/generated": -1.8960243463516235, "logits/real": -1.9444904327392578, "logps/generated": -660.3632202148438, "logps/real": -218.44381713867188, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/generated": -15.783859252929688, "rewards/margins": 37.282047271728516, "rewards/real": 21.498188018798828, "step": 8270 }, { "epoch": 0.9956709956709957, "grad_norm": 26.501938148729327, "learning_rate": 2.405130946018172e-09, "logits/generated": -1.8098771572113037, "logits/real": -1.9966379404067993, "logps/generated": -601.8561401367188, "logps/real": -264.85504150390625, "loss": 0.4386, "rewards/accuracies": 0.925000011920929, "rewards/generated": -9.823989868164062, "rewards/margins": 34.13655471801758, "rewards/real": 24.31256103515625, "step": 8280 }, { "epoch": 0.9968734968734969, "grad_norm": 41.47023906797856, "learning_rate": 1.7370390165686799e-09, "logits/generated": -1.7665832042694092, "logits/real": -1.8990328311920166, "logps/generated": -603.8845825195312, "logps/real": -192.68128967285156, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/generated": -14.44365406036377, "rewards/margins": 31.124195098876953, "rewards/real": 16.680543899536133, "step": 8290 }, { "epoch": 0.9980759980759981, "grad_norm": 16.051955841539332, "learning_rate": 1.0689470871191874e-09, "logits/generated": -1.9049867391586304, "logits/real": -1.9545881748199463, "logps/generated": -469.04022216796875, "logps/real": -204.06179809570312, "loss": 0.5172, "rewards/accuracies": 0.8500000238418579, "rewards/generated": -0.7458006739616394, "rewards/margins": 19.346948623657227, "rewards/real": 18.601146697998047, "step": 8300 }, { "epoch": 0.9992784992784993, "grad_norm": 366.90768126900497, "learning_rate": 4.0085515766969536e-10, "logits/generated": -1.8730134963989258, "logits/real": -1.986374855041504, "logps/generated": -415.9329528808594, "logps/real": -208.17019653320312, "loss": 0.5692, "rewards/accuracies": 0.949999988079071, "rewards/generated": -2.6481776237487793, "rewards/margins": 19.210948944091797, "rewards/real": 16.562774658203125, "step": 8310 }, { "epoch": 1.0, "step": 8316, "total_flos": 0.0, "train_loss": 0.5495538560314325, "train_runtime": 35404.0549, "train_samples_per_second": 2.819, "train_steps_per_second": 0.235 } ], "logging_steps": 10, "max_steps": 8316, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }