{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6379585326953748, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4e-07, "logits/chosen": -2.9071130752563477, "logits/rejected": -2.8750061988830566, "loss": 0.6931, "policy_logps/chosen": -127.82667541503906, "policy_logps/rejected": -130.1011505126953, "referece_logps/chosen": -127.82667541503906, "referece_logps/rejected": -130.1011505126953, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 8e-07, "logits/chosen": -2.8690595626831055, "logits/rejected": -2.921374797821045, "loss": 0.6931, "policy_logps/chosen": -127.44483947753906, "policy_logps/rejected": -118.97954559326172, "referece_logps/chosen": -127.44483947753906, "referece_logps/rejected": -118.97954559326172, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.02, "learning_rate": 1.2e-06, "logits/chosen": -2.876248359680176, "logits/rejected": -2.9442975521087646, "loss": 0.6931, "policy_logps/chosen": -151.38050842285156, "policy_logps/rejected": -104.58517456054688, "referece_logps/chosen": -151.3734893798828, "referece_logps/rejected": -104.58135986328125, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0007032513385638595, "rewards/margins": -0.00032021405058912933, "rewards/rejected": -0.0003830373170785606, "step": 3 }, { "epoch": 0.03, "learning_rate": 1.6e-06, "logits/chosen": -2.923973560333252, "logits/rejected": -2.9425337314605713, "loss": 0.6924, "policy_logps/chosen": -121.4244613647461, "policy_logps/rejected": -117.06787109375, "referece_logps/chosen": -121.42953491210938, "referece_logps/rejected": -117.06060028076172, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005075454828329384, "rewards/margins": 0.0012342334957793355, "rewards/rejected": -0.0007266878965310752, "step": 4 }, { "epoch": 0.03, "learning_rate": 2e-06, "logits/chosen": -2.896630048751831, "logits/rejected": -2.874107837677002, "loss": 0.6925, "policy_logps/chosen": -112.3115234375, "policy_logps/rejected": -117.28302764892578, "referece_logps/chosen": -112.33646392822266, "referece_logps/rejected": -117.27299499511719, "rewards/accuracies": 0.625, "rewards/chosen": 0.0024944781325757504, "rewards/margins": 0.003496956778690219, "rewards/rejected": -0.0010024786461144686, "step": 5 }, { "epoch": 0.04, "learning_rate": 1.999783578606323e-06, "logits/chosen": -2.8770463466644287, "logits/rejected": -2.893326759338379, "loss": 0.6908, "policy_logps/chosen": -111.89071655273438, "policy_logps/rejected": -110.08009338378906, "referece_logps/chosen": -111.92901611328125, "referece_logps/rejected": -110.06503295898438, "rewards/accuracies": 0.75, "rewards/chosen": 0.00383090996183455, "rewards/margins": 0.005336493253707886, "rewards/rejected": -0.0015055835247039795, "step": 6 }, { "epoch": 0.04, "learning_rate": 1.999134408101731e-06, "logits/chosen": -2.8742122650146484, "logits/rejected": -2.9248428344726562, "loss": 0.6872, "policy_logps/chosen": -160.53836059570312, "policy_logps/rejected": -137.42295837402344, "referece_logps/chosen": -160.57460021972656, "referece_logps/rejected": -137.33425903320312, "rewards/accuracies": 0.75, "rewards/chosen": 0.003623390104621649, "rewards/margins": 0.01249313447624445, "rewards/rejected": -0.008869742974638939, "step": 7 }, { "epoch": 0.05, "learning_rate": 1.998052769474995e-06, "logits/chosen": -2.9328360557556152, "logits/rejected": -2.975874185562134, "loss": 0.6898, "policy_logps/chosen": -75.984619140625, "policy_logps/rejected": -74.07447814941406, "referece_logps/chosen": -76.07889556884766, "referece_logps/rejected": -74.01802825927734, "rewards/accuracies": 0.75, "rewards/chosen": 0.009427506476640701, "rewards/margins": 0.015072083100676537, "rewards/rejected": -0.0056445784866809845, "step": 8 }, { "epoch": 0.06, "learning_rate": 1.9965391309055927e-06, "logits/chosen": -2.882821798324585, "logits/rejected": -2.901142120361328, "loss": 0.6773, "policy_logps/chosen": -107.12052917480469, "policy_logps/rejected": -93.86226654052734, "referece_logps/chosen": -107.38616943359375, "referece_logps/rejected": -93.67011260986328, "rewards/accuracies": 0.9375, "rewards/chosen": 0.026563584804534912, "rewards/margins": 0.04577912390232086, "rewards/rejected": -0.019215542823076248, "step": 9 }, { "epoch": 0.06, "learning_rate": 1.9945941475610623e-06, "logits/chosen": -2.8708529472351074, "logits/rejected": -2.9397594928741455, "loss": 0.6735, "policy_logps/chosen": -123.71336364746094, "policy_logps/rejected": -100.81864166259766, "referece_logps/chosen": -124.01136779785156, "referece_logps/rejected": -100.63360595703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.029799818992614746, "rewards/margins": 0.048302434384822845, "rewards/rejected": -0.0185026116669178, "step": 10 }, { "epoch": 0.07, "learning_rate": 1.992218661313415e-06, "logits/chosen": -2.8874049186706543, "logits/rejected": -2.9123711585998535, "loss": 0.6729, "policy_logps/chosen": -104.46468353271484, "policy_logps/rejected": -98.05982971191406, "referece_logps/chosen": -104.58216857910156, "referece_logps/rejected": -97.82801055908203, "rewards/accuracies": 0.8125, "rewards/chosen": 0.011748719029128551, "rewards/margins": 0.03493000194430351, "rewards/rejected": -0.023181283846497536, "step": 11 }, { "epoch": 0.08, "learning_rate": 1.98941370037474e-06, "logits/chosen": -2.9089269638061523, "logits/rejected": -2.887115478515625, "loss": 0.67, "policy_logps/chosen": -125.26738739013672, "policy_logps/rejected": -118.79678344726562, "referece_logps/chosen": -125.5167007446289, "referece_logps/rejected": -118.42857360839844, "rewards/accuracies": 0.875, "rewards/chosen": 0.024931641295552254, "rewards/margins": 0.06175263226032257, "rewards/rejected": -0.036820992827415466, "step": 12 }, { "epoch": 0.08, "learning_rate": 1.986180478852149e-06, "logits/chosen": -2.8961265087127686, "logits/rejected": -2.85491943359375, "loss": 0.6635, "policy_logps/chosen": -152.51437377929688, "policy_logps/rejected": -152.42971801757812, "referece_logps/chosen": -152.56588745117188, "referece_logps/rejected": -152.2041015625, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005150413140654564, "rewards/margins": 0.027712417766451836, "rewards/rejected": -0.02256200462579727, "step": 13 }, { "epoch": 0.09, "learning_rate": 1.982520396222257e-06, "logits/chosen": -2.8753738403320312, "logits/rejected": -2.912999153137207, "loss": 0.6472, "policy_logps/chosen": -101.14571380615234, "policy_logps/rejected": -99.60147094726562, "referece_logps/chosen": -101.51478576660156, "referece_logps/rejected": -98.9403076171875, "rewards/accuracies": 0.875, "rewards/chosen": 0.03690744936466217, "rewards/margins": 0.10302485525608063, "rewards/rejected": -0.06611741334199905, "step": 14 }, { "epoch": 0.1, "learning_rate": 1.978435036725432e-06, "logits/chosen": -2.9012084007263184, "logits/rejected": -2.8953423500061035, "loss": 0.6372, "policy_logps/chosen": -126.27330017089844, "policy_logps/rejected": -128.28578186035156, "referece_logps/chosen": -126.68486785888672, "referece_logps/rejected": -127.51297760009766, "rewards/accuracies": 1.0, "rewards/chosen": 0.04115738719701767, "rewards/margins": 0.11843809485435486, "rewards/rejected": -0.07728070020675659, "step": 15 }, { "epoch": 0.1, "learning_rate": 1.9739261686800657e-06, "logits/chosen": -2.9024605751037598, "logits/rejected": -2.8853814601898193, "loss": 0.6414, "policy_logps/chosen": -114.59141540527344, "policy_logps/rejected": -128.96331787109375, "referece_logps/chosen": -114.98219299316406, "referece_logps/rejected": -128.28781127929688, "rewards/accuracies": 0.875, "rewards/chosen": 0.03907782956957817, "rewards/margins": 0.1066286489367485, "rewards/rejected": -0.06755081564188004, "step": 16 }, { "epoch": 0.11, "learning_rate": 1.968995743717171e-06, "logits/chosen": -2.912461757659912, "logits/rejected": -2.9493064880371094, "loss": 0.6429, "policy_logps/chosen": -111.6058349609375, "policy_logps/rejected": -109.18556213378906, "referece_logps/chosen": -111.89552307128906, "referece_logps/rejected": -108.55181884765625, "rewards/accuracies": 0.75, "rewards/chosen": 0.028969965875148773, "rewards/margins": 0.09234414994716644, "rewards/rejected": -0.06337418407201767, "step": 17 }, { "epoch": 0.11, "learning_rate": 1.9636458959356316e-06, "logits/chosen": -2.8974556922912598, "logits/rejected": -2.89353084564209, "loss": 0.6528, "policy_logps/chosen": -138.28262329101562, "policy_logps/rejected": -128.17782592773438, "referece_logps/chosen": -138.3295440673828, "referece_logps/rejected": -127.26476287841797, "rewards/accuracies": 0.875, "rewards/chosen": 0.004691671580076218, "rewards/margins": 0.09599801898002625, "rewards/rejected": -0.09130635112524033, "step": 18 }, { "epoch": 0.12, "learning_rate": 1.9578789409784727e-06, "logits/chosen": -2.8959522247314453, "logits/rejected": -2.9369704723358154, "loss": 0.6446, "policy_logps/chosen": -111.41603088378906, "policy_logps/rejected": -104.68195343017578, "referece_logps/chosen": -111.60549926757812, "referece_logps/rejected": -104.04905700683594, "rewards/accuracies": 0.8125, "rewards/chosen": 0.018947793170809746, "rewards/margins": 0.08223824948072433, "rewards/rejected": -0.06329045444726944, "step": 19 }, { "epoch": 0.13, "learning_rate": 1.951697375030553e-06, "logits/chosen": -2.853641986846924, "logits/rejected": -2.8619699478149414, "loss": 0.6466, "policy_logps/chosen": -145.3450927734375, "policy_logps/rejected": -134.5753631591797, "referece_logps/chosen": -145.5601348876953, "referece_logps/rejected": -133.66200256347656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.021503955125808716, "rewards/margins": 0.11283906549215317, "rewards/rejected": -0.09133510291576385, "step": 20 }, { "epoch": 0.13, "learning_rate": 1.9451038737381077e-06, "logits/chosen": -2.93542218208313, "logits/rejected": -2.9347047805786133, "loss": 0.6346, "policy_logps/chosen": -97.80740356445312, "policy_logps/rejected": -92.36180877685547, "referece_logps/chosen": -97.90306091308594, "referece_logps/rejected": -91.67369079589844, "rewards/accuracies": 0.8125, "rewards/chosen": 0.009566396474838257, "rewards/margins": 0.07837802916765213, "rewards/rejected": -0.06881163269281387, "step": 21 }, { "epoch": 0.14, "learning_rate": 1.9381012910506143e-06, "logits/chosen": -2.8574209213256836, "logits/rejected": -2.860379219055176, "loss": 0.6102, "policy_logps/chosen": -152.8387908935547, "policy_logps/rejected": -156.60391235351562, "referece_logps/chosen": -152.914306640625, "referece_logps/rejected": -155.2355499267578, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007550956681370735, "rewards/margins": 0.14438626170158386, "rewards/rejected": -0.13683530688285828, "step": 22 }, { "epoch": 0.15, "learning_rate": 1.9306926579854817e-06, "logits/chosen": -2.8623623847961426, "logits/rejected": -2.8780465126037598, "loss": 0.6306, "policy_logps/chosen": -143.302490234375, "policy_logps/rejected": -127.38278198242188, "referece_logps/chosen": -143.29205322265625, "referece_logps/rejected": -126.24747467041016, "rewards/accuracies": 0.875, "rewards/chosen": -0.0010449867695569992, "rewards/margins": 0.11248550564050674, "rewards/rejected": -0.11353050917387009, "step": 23 }, { "epoch": 0.15, "learning_rate": 1.922881181316097e-06, "logits/chosen": -2.949070930480957, "logits/rejected": -2.954594135284424, "loss": 0.6002, "policy_logps/chosen": -74.232177734375, "policy_logps/rejected": -73.76258850097656, "referece_logps/chosen": -75.20740509033203, "referece_logps/rejected": -72.94302368164062, "rewards/accuracies": 0.875, "rewards/chosen": 0.09752248972654343, "rewards/margins": 0.17947959899902344, "rewards/rejected": -0.08195710927248001, "step": 24 }, { "epoch": 0.16, "learning_rate": 1.9146702421837946e-06, "logits/chosen": -2.8681116104125977, "logits/rejected": -2.8957810401916504, "loss": 0.612, "policy_logps/chosen": -127.76103210449219, "policy_logps/rejected": -125.47801208496094, "referece_logps/chosen": -128.02197265625, "referece_logps/rejected": -124.08172607421875, "rewards/accuracies": 0.8125, "rewards/chosen": 0.026093529537320137, "rewards/margins": 0.1657221019268036, "rewards/rejected": -0.1396285593509674, "step": 25 }, { "epoch": 0.17, "learning_rate": 1.906063394634356e-06, "logits/chosen": -2.862689256668091, "logits/rejected": -2.877995252609253, "loss": 0.6398, "policy_logps/chosen": -120.70744323730469, "policy_logps/rejected": -113.76991271972656, "referece_logps/chosen": -120.96971130371094, "referece_logps/rejected": -112.71368408203125, "rewards/accuracies": 0.875, "rewards/chosen": 0.026226602494716644, "rewards/margins": 0.13184988498687744, "rewards/rejected": -0.1056232899427414, "step": 26 }, { "epoch": 0.17, "learning_rate": 1.897064364079664e-06, "logits/chosen": -2.9357306957244873, "logits/rejected": -2.976917028427124, "loss": 0.5904, "policy_logps/chosen": -105.41529846191406, "policy_logps/rejected": -96.86430358886719, "referece_logps/chosen": -106.42874145507812, "referece_logps/rejected": -95.49075317382812, "rewards/accuracies": 0.875, "rewards/chosen": 0.10134478658437729, "rewards/margins": 0.23869961500167847, "rewards/rejected": -0.13735483586788177, "step": 27 }, { "epoch": 0.18, "learning_rate": 1.8876770456851876e-06, "logits/chosen": -2.8511600494384766, "logits/rejected": -2.888685941696167, "loss": 0.5817, "policy_logps/chosen": -135.38853454589844, "policy_logps/rejected": -136.4590301513672, "referece_logps/chosen": -135.95562744140625, "referece_logps/rejected": -134.50482177734375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05670913681387901, "rewards/margins": 0.2521297335624695, "rewards/rejected": -0.19542059302330017, "step": 28 }, { "epoch": 0.19, "learning_rate": 1.8779055026839868e-06, "logits/chosen": -2.9084866046905518, "logits/rejected": -2.9267258644104004, "loss": 0.6042, "policy_logps/chosen": -134.9838409423828, "policy_logps/rejected": -114.98860931396484, "referece_logps/chosen": -135.53085327148438, "referece_logps/rejected": -113.39382934570312, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05469997972249985, "rewards/margins": 0.21417750418186188, "rewards/rejected": -0.15947751700878143, "step": 29 }, { "epoch": 0.19, "learning_rate": 1.8677539646179705e-06, "logits/chosen": -2.88179349899292, "logits/rejected": -2.929935932159424, "loss": 0.5827, "policy_logps/chosen": -163.58660888671875, "policy_logps/rejected": -131.79258728027344, "referece_logps/chosen": -163.90985107421875, "referece_logps/rejected": -129.81314086914062, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0323248989880085, "rewards/margins": 0.23026807606220245, "rewards/rejected": -0.19794318079948425, "step": 30 }, { "epoch": 0.2, "learning_rate": 1.8572268255071718e-06, "logits/chosen": -2.9416298866271973, "logits/rejected": -2.961357831954956, "loss": 0.5977, "policy_logps/chosen": -98.88802337646484, "policy_logps/rejected": -94.90333557128906, "referece_logps/chosen": -99.61119842529297, "referece_logps/rejected": -93.1702651977539, "rewards/accuracies": 0.875, "rewards/chosen": 0.07231828570365906, "rewards/margins": 0.24562585353851318, "rewards/rejected": -0.17330753803253174, "step": 31 }, { "epoch": 0.2, "learning_rate": 1.8463286419478252e-06, "logits/chosen": -2.9383907318115234, "logits/rejected": -2.880384922027588, "loss": 0.5772, "policy_logps/chosen": -118.6290283203125, "policy_logps/rejected": -126.94461822509766, "referece_logps/chosen": -118.98780822753906, "referece_logps/rejected": -125.21376037597656, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03587843477725983, "rewards/margins": 0.20896492898464203, "rewards/rejected": -0.173086479306221, "step": 32 }, { "epoch": 0.21, "learning_rate": 1.835064131140081e-06, "logits/chosen": -2.908090829849243, "logits/rejected": -2.9141459465026855, "loss": 0.5723, "policy_logps/chosen": -132.4810333251953, "policy_logps/rejected": -130.16603088378906, "referece_logps/chosen": -132.7151336669922, "referece_logps/rejected": -127.14649963378906, "rewards/accuracies": 1.0, "rewards/chosen": 0.02341010421514511, "rewards/margins": 0.32536280155181885, "rewards/rejected": -0.3019527196884155, "step": 33 }, { "epoch": 0.22, "learning_rate": 1.8234381688461941e-06, "logits/chosen": -2.9611878395080566, "logits/rejected": -2.966728448867798, "loss": 0.5734, "policy_logps/chosen": -119.45679473876953, "policy_logps/rejected": -118.76419830322266, "referece_logps/chosen": -118.86337280273438, "referece_logps/rejected": -115.91516876220703, "rewards/accuracies": 0.8125, "rewards/chosen": -0.059341806918382645, "rewards/margins": 0.22556202113628387, "rewards/rejected": -0.28490379452705383, "step": 34 }, { "epoch": 0.22, "learning_rate": 1.8114557872800905e-06, "logits/chosen": -2.967761993408203, "logits/rejected": -2.900844097137451, "loss": 0.5633, "policy_logps/chosen": -130.78848266601562, "policy_logps/rejected": -142.6917724609375, "referece_logps/chosen": -130.37750244140625, "referece_logps/rejected": -139.14178466796875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.041097551584243774, "rewards/margins": 0.31390050053596497, "rewards/rejected": -0.35499805212020874, "step": 35 }, { "epoch": 0.23, "learning_rate": 1.7991221729292058e-06, "logits/chosen": -2.8585638999938965, "logits/rejected": -2.90903377532959, "loss": 0.5649, "policy_logps/chosen": -137.8734893798828, "policy_logps/rejected": -127.91438293457031, "referece_logps/chosen": -137.9367218017578, "referece_logps/rejected": -125.2115478515625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0063229575753211975, "rewards/margins": 0.27660617232322693, "rewards/rejected": -0.27028322219848633, "step": 36 }, { "epoch": 0.24, "learning_rate": 1.7864426643095536e-06, "logits/chosen": -2.9357798099517822, "logits/rejected": -2.900618553161621, "loss": 0.5592, "policy_logps/chosen": -151.2121124267578, "policy_logps/rejected": -134.10804748535156, "referece_logps/chosen": -150.83583068847656, "referece_logps/rejected": -131.0611572265625, "rewards/accuracies": 0.875, "rewards/chosen": -0.03762848675251007, "rewards/margins": 0.26705947518348694, "rewards/rejected": -0.3046879470348358, "step": 37 }, { "epoch": 0.24, "learning_rate": 1.7734227496549878e-06, "logits/chosen": -2.9032468795776367, "logits/rejected": -2.9027702808380127, "loss": 0.5684, "policy_logps/chosen": -104.47734069824219, "policy_logps/rejected": -106.55696105957031, "referece_logps/chosen": -105.3730697631836, "referece_logps/rejected": -104.14241027832031, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0895722359418869, "rewards/margins": 0.3310272991657257, "rewards/rejected": -0.241455078125, "step": 38 }, { "epoch": 0.25, "learning_rate": 1.7600680645416582e-06, "logits/chosen": -2.9712843894958496, "logits/rejected": -2.894430637359619, "loss": 0.5499, "policy_logps/chosen": -126.83047485351562, "policy_logps/rejected": -138.54771423339844, "referece_logps/chosen": -126.56389617919922, "referece_logps/rejected": -135.42103576660156, "rewards/accuracies": 0.875, "rewards/chosen": -0.026657823473215103, "rewards/margins": 0.2860097885131836, "rewards/rejected": -0.312667578458786, "step": 39 }, { "epoch": 0.26, "learning_rate": 1.7463843894486936e-06, "logits/chosen": -2.942542552947998, "logits/rejected": -2.9863080978393555, "loss": 0.5534, "policy_logps/chosen": -93.51473999023438, "policy_logps/rejected": -94.46737670898438, "referece_logps/chosen": -93.94270324707031, "referece_logps/rejected": -91.81558227539062, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04279506206512451, "rewards/margins": 0.3079749345779419, "rewards/rejected": -0.2651798725128174, "step": 40 }, { "epoch": 0.26, "learning_rate": 1.7323776472561625e-06, "logits/chosen": -2.9191946983337402, "logits/rejected": -2.935122013092041, "loss": 0.5682, "policy_logps/chosen": -128.17813110351562, "policy_logps/rejected": -139.06613159179688, "referece_logps/chosen": -127.54570770263672, "referece_logps/rejected": -135.201904296875, "rewards/accuracies": 0.875, "rewards/chosen": -0.06324195861816406, "rewards/margins": 0.32317861914634705, "rewards/rejected": -0.3864205777645111, "step": 41 }, { "epoch": 0.27, "learning_rate": 1.7180539006813969e-06, "logits/chosen": -2.920085906982422, "logits/rejected": -2.906773567199707, "loss": 0.5454, "policy_logps/chosen": -126.83687591552734, "policy_logps/rejected": -114.17430877685547, "referece_logps/chosen": -126.77012634277344, "referece_logps/rejected": -110.9568862915039, "rewards/accuracies": 0.875, "rewards/chosen": -0.006674099713563919, "rewards/margins": 0.31506818532943726, "rewards/rejected": -0.3217422664165497, "step": 42 }, { "epoch": 0.27, "learning_rate": 1.7034193496547902e-06, "logits/chosen": -2.8620564937591553, "logits/rejected": -2.9222469329833984, "loss": 0.534, "policy_logps/chosen": -125.01854705810547, "policy_logps/rejected": -116.95589447021484, "referece_logps/chosen": -125.00530242919922, "referece_logps/rejected": -113.22078704833984, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0013249870389699936, "rewards/margins": 0.3721860647201538, "rewards/rejected": -0.37351107597351074, "step": 43 }, { "epoch": 0.28, "learning_rate": 1.6884803286362e-06, "logits/chosen": -2.8868565559387207, "logits/rejected": -2.91511869430542, "loss": 0.5459, "policy_logps/chosen": -150.9369354248047, "policy_logps/rejected": -144.1173858642578, "referece_logps/chosen": -150.93002319335938, "referece_logps/rejected": -139.8741455078125, "rewards/accuracies": 0.875, "rewards/chosen": -0.0006910450756549835, "rewards/margins": 0.42363405227661133, "rewards/rejected": -0.4243250787258148, "step": 44 }, { "epoch": 0.29, "learning_rate": 1.673243303873124e-06, "logits/chosen": -2.9313180446624756, "logits/rejected": -2.9371540546417236, "loss": 0.5569, "policy_logps/chosen": -122.50914764404297, "policy_logps/rejected": -110.94712829589844, "referece_logps/chosen": -122.10940551757812, "referece_logps/rejected": -108.29950714111328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03997454792261124, "rewards/margins": 0.22478806972503662, "rewards/rejected": -0.26476261019706726, "step": 45 }, { "epoch": 0.29, "learning_rate": 1.6577148706018328e-06, "logits/chosen": -2.894786834716797, "logits/rejected": -2.977565288543701, "loss": 0.5775, "policy_logps/chosen": -114.12425231933594, "policy_logps/rejected": -113.44467163085938, "referece_logps/chosen": -112.93356323242188, "referece_logps/rejected": -110.42334747314453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11906924843788147, "rewards/margins": 0.18306350708007812, "rewards/rejected": -0.3021327555179596, "step": 46 }, { "epoch": 0.3, "learning_rate": 1.6419017501926656e-06, "logits/chosen": -2.90000581741333, "logits/rejected": -2.9254188537597656, "loss": 0.5176, "policy_logps/chosen": -128.19607543945312, "policy_logps/rejected": -122.17625427246094, "referece_logps/chosen": -128.63787841796875, "referece_logps/rejected": -118.01435852050781, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04418013244867325, "rewards/margins": 0.4603692889213562, "rewards/rejected": -0.41618919372558594, "step": 47 }, { "epoch": 0.31, "learning_rate": 1.6258107872407374e-06, "logits/chosen": -2.9072225093841553, "logits/rejected": -2.9065234661102295, "loss": 0.5243, "policy_logps/chosen": -125.48599243164062, "policy_logps/rejected": -129.7559814453125, "referece_logps/chosen": -126.17718505859375, "referece_logps/rejected": -127.1695785522461, "rewards/accuracies": 0.875, "rewards/chosen": 0.06911970674991608, "rewards/margins": 0.32776087522506714, "rewards/rejected": -0.25864115357398987, "step": 48 }, { "epoch": 0.31, "learning_rate": 1.6094489466033042e-06, "logits/chosen": -2.917146682739258, "logits/rejected": -2.9667577743530273, "loss": 0.5559, "policy_logps/chosen": -108.18141174316406, "policy_logps/rejected": -99.39623260498047, "referece_logps/chosen": -108.76286315917969, "referece_logps/rejected": -96.53907775878906, "rewards/accuracies": 0.875, "rewards/chosen": 0.0581454373896122, "rewards/margins": 0.34386110305786133, "rewards/rejected": -0.28571566939353943, "step": 49 }, { "epoch": 0.32, "learning_rate": 1.5928233103850727e-06, "logits/chosen": -2.906874179840088, "logits/rejected": -2.919196844100952, "loss": 0.5364, "policy_logps/chosen": -161.1468048095703, "policy_logps/rejected": -143.97592163085938, "referece_logps/chosen": -160.62081909179688, "referece_logps/rejected": -140.28585815429688, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052598677575588226, "rewards/margins": 0.31640806794166565, "rewards/rejected": -0.3690067529678345, "step": 50 }, { "epoch": 0.33, "learning_rate": 1.575941074872766e-06, "logits/chosen": -2.957002878189087, "logits/rejected": -2.9672722816467285, "loss": 0.5172, "policy_logps/chosen": -119.42903900146484, "policy_logps/rejected": -116.86624145507812, "referece_logps/chosen": -120.13378143310547, "referece_logps/rejected": -113.2657241821289, "rewards/accuracies": 1.0, "rewards/chosen": 0.07047442346811295, "rewards/margins": 0.4305253326892853, "rewards/rejected": -0.36005088686943054, "step": 51 }, { "epoch": 0.33, "learning_rate": 1.5588095474202594e-06, "logits/chosen": -2.8591084480285645, "logits/rejected": -2.8590025901794434, "loss": 0.4933, "policy_logps/chosen": -157.062744140625, "policy_logps/rejected": -151.7144775390625, "referece_logps/chosen": -156.53114318847656, "referece_logps/rejected": -147.15911865234375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05316000431776047, "rewards/margins": 0.4023759067058563, "rewards/rejected": -0.4555359482765198, "step": 52 }, { "epoch": 0.34, "learning_rate": 1.5414361432856474e-06, "logits/chosen": -2.9438529014587402, "logits/rejected": -2.9987499713897705, "loss": 0.4979, "policy_logps/chosen": -117.94309997558594, "policy_logps/rejected": -108.414306640625, "referece_logps/chosen": -118.55531311035156, "referece_logps/rejected": -104.54682922363281, "rewards/accuracies": 0.9375, "rewards/chosen": 0.061221349984407425, "rewards/margins": 0.4479690194129944, "rewards/rejected": -0.38674765825271606, "step": 53 }, { "epoch": 0.34, "learning_rate": 1.5238283824216013e-06, "logits/chosen": -2.9244260787963867, "logits/rejected": -2.9373199939727783, "loss": 0.4961, "policy_logps/chosen": -119.873779296875, "policy_logps/rejected": -108.0604248046875, "referece_logps/chosen": -120.80511474609375, "referece_logps/rejected": -104.79627990722656, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09313352406024933, "rewards/margins": 0.4195476770401001, "rewards/rejected": -0.32641416788101196, "step": 54 }, { "epoch": 0.35, "learning_rate": 1.5059938862204125e-06, "logits/chosen": -2.920050621032715, "logits/rejected": -2.9494502544403076, "loss": 0.5339, "policy_logps/chosen": -128.6884307861328, "policy_logps/rejected": -124.91862487792969, "referece_logps/chosen": -127.86734008789062, "referece_logps/rejected": -120.62930297851562, "rewards/accuracies": 0.875, "rewards/chosen": -0.08210951089859009, "rewards/margins": 0.34682315587997437, "rewards/rejected": -0.42893266677856445, "step": 55 }, { "epoch": 0.36, "learning_rate": 1.4879403742151283e-06, "logits/chosen": -2.907794713973999, "logits/rejected": -2.9134182929992676, "loss": 0.5389, "policy_logps/chosen": -129.15188598632812, "policy_logps/rejected": -135.58035278320312, "referece_logps/chosen": -127.82069396972656, "referece_logps/rejected": -131.39085388183594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13311973214149475, "rewards/margins": 0.28583019971847534, "rewards/rejected": -0.4189499020576477, "step": 56 }, { "epoch": 0.36, "learning_rate": 1.4696756607382058e-06, "logits/chosen": -2.9352166652679443, "logits/rejected": -2.9330966472625732, "loss": 0.543, "policy_logps/chosen": -125.51739501953125, "policy_logps/rejected": -128.10951232910156, "referece_logps/chosen": -124.24830627441406, "referece_logps/rejected": -123.51809692382812, "rewards/accuracies": 0.75, "rewards/chosen": -0.12690886855125427, "rewards/margins": 0.33223241567611694, "rewards/rejected": -0.4591412842273712, "step": 57 }, { "epoch": 0.37, "learning_rate": 1.4512076515391374e-06, "logits/chosen": -2.941624164581299, "logits/rejected": -2.9078333377838135, "loss": 0.4758, "policy_logps/chosen": -109.23553466796875, "policy_logps/rejected": -104.69176483154297, "referece_logps/chosen": -110.22058868408203, "referece_logps/rejected": -100.0502700805664, "rewards/accuracies": 1.0, "rewards/chosen": 0.09850560873746872, "rewards/margins": 0.5626559257507324, "rewards/rejected": -0.4641503393650055, "step": 58 }, { "epoch": 0.38, "learning_rate": 1.432544340362501e-06, "logits/chosen": -2.972963809967041, "logits/rejected": -2.9390883445739746, "loss": 0.4904, "policy_logps/chosen": -95.29005432128906, "policy_logps/rejected": -121.40338897705078, "referece_logps/chosen": -95.24687194824219, "referece_logps/rejected": -116.24635314941406, "rewards/accuracies": 0.9375, "rewards/chosen": -0.004318548366427422, "rewards/margins": 0.5113850235939026, "rewards/rejected": -0.515703558921814, "step": 59 }, { "epoch": 0.38, "learning_rate": 1.4136938054879282e-06, "logits/chosen": -2.9395623207092285, "logits/rejected": -2.953756809234619, "loss": 0.5286, "policy_logps/chosen": -122.0515365600586, "policy_logps/rejected": -128.63021850585938, "referece_logps/chosen": -120.7651596069336, "referece_logps/rejected": -123.8788070678711, "rewards/accuracies": 0.875, "rewards/chosen": -0.1286384016275406, "rewards/margins": 0.34650319814682007, "rewards/rejected": -0.47514158487319946, "step": 60 }, { "epoch": 0.39, "learning_rate": 1.3946642062334763e-06, "logits/chosen": -2.9107141494750977, "logits/rejected": -2.9332330226898193, "loss": 0.4832, "policy_logps/chosen": -121.9334716796875, "policy_logps/rejected": -119.00639343261719, "referece_logps/chosen": -122.2487564086914, "referece_logps/rejected": -114.04248809814453, "rewards/accuracies": 1.0, "rewards/chosen": 0.03152900189161301, "rewards/margins": 0.5279202461242676, "rewards/rejected": -0.49639129638671875, "step": 61 }, { "epoch": 0.4, "learning_rate": 1.37546377942393e-06, "logits/chosen": -2.950266122817993, "logits/rejected": -2.9494924545288086, "loss": 0.5178, "policy_logps/chosen": -104.2540283203125, "policy_logps/rejected": -125.53955841064453, "referece_logps/chosen": -104.66864776611328, "referece_logps/rejected": -121.41145324707031, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04146187752485275, "rewards/margins": 0.45427215099334717, "rewards/rejected": -0.4128102958202362, "step": 62 }, { "epoch": 0.4, "learning_rate": 1.3561008358255469e-06, "logits/chosen": -2.917292594909668, "logits/rejected": -2.952850818634033, "loss": 0.5263, "policy_logps/chosen": -118.34769439697266, "policy_logps/rejected": -112.80188751220703, "referece_logps/chosen": -118.5436782836914, "referece_logps/rejected": -109.08131408691406, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01959807053208351, "rewards/margins": 0.39165574312210083, "rewards/rejected": -0.3720576763153076, "step": 63 }, { "epoch": 0.41, "learning_rate": 1.3365837565488062e-06, "logits/chosen": -2.92209529876709, "logits/rejected": -2.9668972492218018, "loss": 0.4813, "policy_logps/chosen": -161.57546997070312, "policy_logps/rejected": -137.75039672851562, "referece_logps/chosen": -161.21104431152344, "referece_logps/rejected": -130.89552307128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.03644174337387085, "rewards/margins": 0.6490457057952881, "rewards/rejected": -0.6854873895645142, "step": 64 }, { "epoch": 0.41, "learning_rate": 1.3169209894207027e-06, "logits/chosen": -2.9616472721099854, "logits/rejected": -2.961239814758301, "loss": 0.4486, "policy_logps/chosen": -154.44467163085938, "policy_logps/rejected": -158.21372985839844, "referece_logps/chosen": -154.07650756835938, "referece_logps/rejected": -151.34902954101562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03681756183505058, "rewards/margins": 0.6496531367301941, "rewards/rejected": -0.6864707469940186, "step": 65 }, { "epoch": 0.42, "learning_rate": 1.2971210453281673e-06, "logits/chosen": -2.892361640930176, "logits/rejected": -2.8887600898742676, "loss": 0.4939, "policy_logps/chosen": -121.81805419921875, "policy_logps/rejected": -114.29660034179688, "referece_logps/chosen": -122.0481948852539, "referece_logps/rejected": -110.4261474609375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.023014426231384277, "rewards/margins": 0.41005975008010864, "rewards/rejected": -0.38704538345336914, "step": 66 }, { "epoch": 0.43, "learning_rate": 1.2771924945341906e-06, "logits/chosen": -2.91727352142334, "logits/rejected": -2.922391891479492, "loss": 0.4868, "policy_logps/chosen": -111.62061309814453, "policy_logps/rejected": -106.5035400390625, "referece_logps/chosen": -112.6988525390625, "referece_logps/rejected": -101.50572967529297, "rewards/accuracies": 1.0, "rewards/chosen": 0.10782448947429657, "rewards/margins": 0.6076046228408813, "rewards/rejected": -0.49978014826774597, "step": 67 }, { "epoch": 0.43, "learning_rate": 1.257143962968246e-06, "logits/chosen": -2.9374663829803467, "logits/rejected": -2.9512386322021484, "loss": 0.5144, "policy_logps/chosen": -149.2052001953125, "policy_logps/rejected": -130.6556396484375, "referece_logps/chosen": -147.80715942382812, "referece_logps/rejected": -125.28959655761719, "rewards/accuracies": 0.75, "rewards/chosen": -0.13980263471603394, "rewards/margins": 0.39680200815200806, "rewards/rejected": -0.536604642868042, "step": 68 }, { "epoch": 0.44, "learning_rate": 1.236984128492619e-06, "logits/chosen": -2.9574966430664062, "logits/rejected": -2.9541845321655273, "loss": 0.4811, "policy_logps/chosen": -105.58607482910156, "policy_logps/rejected": -107.0927734375, "referece_logps/chosen": -106.00244903564453, "referece_logps/rejected": -103.1064453125, "rewards/accuracies": 0.9375, "rewards/chosen": 0.041637033224105835, "rewards/margins": 0.44027072191238403, "rewards/rejected": -0.3986337184906006, "step": 69 }, { "epoch": 0.45, "learning_rate": 1.2167217171462566e-06, "logits/chosen": -2.96567964553833, "logits/rejected": -2.9865453243255615, "loss": 0.4782, "policy_logps/chosen": -137.23434448242188, "policy_logps/rejected": -116.43749237060547, "referece_logps/chosen": -137.0900421142578, "referece_logps/rejected": -110.79519653320312, "rewards/accuracies": 0.875, "rewards/chosen": -0.014430008828639984, "rewards/margins": 0.5497984886169434, "rewards/rejected": -0.5642285346984863, "step": 70 }, { "epoch": 0.45, "learning_rate": 1.1963654993677643e-06, "logits/chosen": -2.9047865867614746, "logits/rejected": -2.9375927448272705, "loss": 0.4831, "policy_logps/chosen": -142.0609588623047, "policy_logps/rejected": -138.23785400390625, "referece_logps/chosen": -140.9311981201172, "referece_logps/rejected": -132.38949584960938, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1129767894744873, "rewards/margins": 0.47185903787612915, "rewards/rejected": -0.5848358869552612, "step": 71 }, { "epoch": 0.46, "learning_rate": 1.1759242861991854e-06, "logits/chosen": -2.9874184131622314, "logits/rejected": -2.9638171195983887, "loss": 0.4436, "policy_logps/chosen": -111.93683624267578, "policy_logps/rejected": -113.12979888916016, "referece_logps/chosen": -111.51397705078125, "referece_logps/rejected": -107.90960693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.042286355048418045, "rewards/margins": 0.4797336161136627, "rewards/rejected": -0.5220199227333069, "step": 72 }, { "epoch": 0.47, "learning_rate": 1.155406925472205e-06, "logits/chosen": -2.927311420440674, "logits/rejected": -2.921337366104126, "loss": 0.4284, "policy_logps/chosen": -147.83377075195312, "policy_logps/rejected": -148.07089233398438, "referece_logps/chosen": -147.18499755859375, "referece_logps/rejected": -140.68211364746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.0648760199546814, "rewards/margins": 0.6740000247955322, "rewards/rejected": -0.7388760447502136, "step": 73 }, { "epoch": 0.47, "learning_rate": 1.1348222979784287e-06, "logits/chosen": -2.9232394695281982, "logits/rejected": -2.9430999755859375, "loss": 0.5002, "policy_logps/chosen": -143.42274475097656, "policy_logps/rejected": -143.17144775390625, "referece_logps/chosen": -141.336669921875, "referece_logps/rejected": -136.30712890625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20860746502876282, "rewards/margins": 0.4778253436088562, "rewards/rejected": -0.6864327788352966, "step": 74 }, { "epoch": 0.48, "learning_rate": 1.1141793136253986e-06, "logits/chosen": -2.8802056312561035, "logits/rejected": -2.9181809425354004, "loss": 0.5141, "policy_logps/chosen": -155.84146118164062, "policy_logps/rejected": -145.16802978515625, "referece_logps/chosen": -154.25274658203125, "referece_logps/rejected": -140.40045166015625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15887098014354706, "rewards/margins": 0.31788796186447144, "rewards/rejected": -0.4767589569091797, "step": 75 }, { "epoch": 0.48, "learning_rate": 1.09348690758e-06, "logits/chosen": -2.921013355255127, "logits/rejected": -2.949887275695801, "loss": 0.4978, "policy_logps/chosen": -140.55694580078125, "policy_logps/rejected": -137.7884521484375, "referece_logps/chosen": -139.3011474609375, "referece_logps/rejected": -130.94146728515625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12557877600193024, "rewards/margins": 0.5591215491294861, "rewards/rejected": -0.6847003102302551, "step": 76 }, { "epoch": 0.49, "learning_rate": 1.072754036400944e-06, "logits/chosen": -2.9696455001831055, "logits/rejected": -2.9857373237609863, "loss": 0.417, "policy_logps/chosen": -113.93096923828125, "policy_logps/rejected": -107.09919738769531, "referece_logps/chosen": -115.27886962890625, "referece_logps/rejected": -102.10234069824219, "rewards/accuracies": 1.0, "rewards/chosen": 0.1347896158695221, "rewards/margins": 0.6344748139381409, "rewards/rejected": -0.49968522787094116, "step": 77 }, { "epoch": 0.5, "learning_rate": 1.0519896741619803e-06, "logits/chosen": -2.919851303100586, "logits/rejected": -2.9218404293060303, "loss": 0.4688, "policy_logps/chosen": -155.71444702148438, "policy_logps/rejected": -144.62152099609375, "referece_logps/chosen": -155.02752685546875, "referece_logps/rejected": -138.43026733398438, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0686924085021019, "rewards/margins": 0.5504311323165894, "rewards/rejected": -0.6191235184669495, "step": 78 }, { "epoch": 0.5, "learning_rate": 1.031202808567539e-06, "logits/chosen": -2.8995673656463623, "logits/rejected": -2.930441379547119, "loss": 0.4539, "policy_logps/chosen": -155.89486694335938, "policy_logps/rejected": -133.66708374023438, "referece_logps/chosen": -154.44532775878906, "referece_logps/rejected": -126.08793640136719, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14495408535003662, "rewards/margins": 0.6129606366157532, "rewards/rejected": -0.7579147815704346, "step": 79 }, { "epoch": 0.51, "learning_rate": 1.0104024370624642e-06, "logits/chosen": -2.9370193481445312, "logits/rejected": -2.9806201457977295, "loss": 0.4106, "policy_logps/chosen": -134.17471313476562, "policy_logps/rejected": -112.07872772216797, "referece_logps/chosen": -134.83750915527344, "referece_logps/rejected": -105.02826690673828, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06628021597862244, "rewards/margins": 0.7713260650634766, "rewards/rejected": -0.7050458192825317, "step": 80 }, { "epoch": 0.52, "learning_rate": 9.895975629375357e-07, "logits/chosen": -2.9489431381225586, "logits/rejected": -2.931128978729248, "loss": 0.4902, "policy_logps/chosen": -132.9930877685547, "policy_logps/rejected": -148.03472900390625, "referece_logps/chosen": -131.95632934570312, "referece_logps/rejected": -141.67227172851562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1036761999130249, "rewards/margins": 0.5325698852539062, "rewards/rejected": -0.6362460851669312, "step": 81 }, { "epoch": 0.52, "learning_rate": 9.687971914324607e-07, "logits/chosen": -2.911447525024414, "logits/rejected": -2.9363858699798584, "loss": 0.4865, "policy_logps/chosen": -136.85560607910156, "policy_logps/rejected": -110.8121337890625, "referece_logps/chosen": -135.94354248046875, "referece_logps/rejected": -105.58758544921875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09120647609233856, "rewards/margins": 0.4312480390071869, "rewards/rejected": -0.5224545001983643, "step": 82 }, { "epoch": 0.53, "learning_rate": 9.480103258380197e-07, "logits/chosen": -2.9371180534362793, "logits/rejected": -2.9503769874572754, "loss": 0.4557, "policy_logps/chosen": -150.19725036621094, "policy_logps/rejected": -149.049072265625, "referece_logps/chosen": -147.80014038085938, "referece_logps/rejected": -141.38333129882812, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23970948159694672, "rewards/margins": 0.5268632173538208, "rewards/rejected": -0.7665727138519287, "step": 83 }, { "epoch": 0.54, "learning_rate": 9.272459635990562e-07, "logits/chosen": -2.914623737335205, "logits/rejected": -2.909700393676758, "loss": 0.4753, "policy_logps/chosen": -149.44277954101562, "policy_logps/rejected": -158.23593139648438, "referece_logps/chosen": -147.42015075683594, "referece_logps/rejected": -149.85214233398438, "rewards/accuracies": 0.875, "rewards/chosen": -0.2022620439529419, "rewards/margins": 0.6361156702041626, "rewards/rejected": -0.838377833366394, "step": 84 }, { "epoch": 0.54, "learning_rate": 9.065130924199998e-07, "logits/chosen": -2.964315414428711, "logits/rejected": -2.9234838485717773, "loss": 0.5092, "policy_logps/chosen": -133.36880493164062, "policy_logps/rejected": -146.08169555664062, "referece_logps/chosen": -132.2157440185547, "referece_logps/rejected": -140.08538818359375, "rewards/accuracies": 0.875, "rewards/chosen": -0.11530620604753494, "rewards/margins": 0.4843238890171051, "rewards/rejected": -0.5996301174163818, "step": 85 }, { "epoch": 0.55, "learning_rate": 8.858206863746017e-07, "logits/chosen": -2.909698486328125, "logits/rejected": -2.918955087661743, "loss": 0.4525, "policy_logps/chosen": -132.2825469970703, "policy_logps/rejected": -136.70962524414062, "referece_logps/chosen": -130.87799072265625, "referece_logps/rejected": -130.4683837890625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1404554843902588, "rewards/margins": 0.48366838693618774, "rewards/rejected": -0.6241238713264465, "step": 86 }, { "epoch": 0.56, "learning_rate": 8.651777020215712e-07, "logits/chosen": -2.8775806427001953, "logits/rejected": -2.876236915588379, "loss": 0.4829, "policy_logps/chosen": -138.22640991210938, "policy_logps/rejected": -135.67745971679688, "referece_logps/chosen": -137.68087768554688, "referece_logps/rejected": -129.38088989257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.05455498397350311, "rewards/margins": 0.5751017928123474, "rewards/rejected": -0.6296567916870117, "step": 87 }, { "epoch": 0.56, "learning_rate": 8.445930745277951e-07, "logits/chosen": -2.9498825073242188, "logits/rejected": -2.9456593990325928, "loss": 0.4666, "policy_logps/chosen": -99.36846160888672, "policy_logps/rejected": -110.91441345214844, "referece_logps/chosen": -97.86239624023438, "referece_logps/rejected": -104.65141296386719, "rewards/accuracies": 0.875, "rewards/chosen": -0.15060700476169586, "rewards/margins": 0.47569307684898376, "rewards/rejected": -0.6263000965118408, "step": 88 }, { "epoch": 0.57, "learning_rate": 8.240757138008148e-07, "logits/chosen": -2.9100852012634277, "logits/rejected": -2.9250974655151367, "loss": 0.4921, "policy_logps/chosen": -144.23516845703125, "policy_logps/rejected": -140.12838745117188, "referece_logps/chosen": -142.4854736328125, "referece_logps/rejected": -133.91094970703125, "rewards/accuracies": 0.875, "rewards/chosen": -0.17496907711029053, "rewards/margins": 0.4467761516571045, "rewards/rejected": -0.621745228767395, "step": 89 }, { "epoch": 0.57, "learning_rate": 8.036345006322358e-07, "logits/chosen": -2.9227583408355713, "logits/rejected": -2.936587333679199, "loss": 0.4516, "policy_logps/chosen": -130.54054260253906, "policy_logps/rejected": -137.36415100097656, "referece_logps/chosen": -129.6831512451172, "referece_logps/rejected": -129.63467407226562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08573976159095764, "rewards/margins": 0.6872075796127319, "rewards/rejected": -0.772947371006012, "step": 90 }, { "epoch": 0.58, "learning_rate": 7.832782828537435e-07, "logits/chosen": -2.946342706680298, "logits/rejected": -2.9744181632995605, "loss": 0.4716, "policy_logps/chosen": -155.42410278320312, "policy_logps/rejected": -155.65284729003906, "referece_logps/chosen": -153.79293823242188, "referece_logps/rejected": -145.5734100341797, "rewards/accuracies": 0.9375, "rewards/chosen": -0.16311725974082947, "rewards/margins": 0.8448256850242615, "rewards/rejected": -1.0079429149627686, "step": 91 }, { "epoch": 0.59, "learning_rate": 7.630158715073812e-07, "logits/chosen": -2.9480092525482178, "logits/rejected": -2.971871852874756, "loss": 0.4588, "policy_logps/chosen": -117.96566772460938, "policy_logps/rejected": -120.06287384033203, "referece_logps/chosen": -117.35060119628906, "referece_logps/rejected": -114.15763092041016, "rewards/accuracies": 0.875, "rewards/chosen": -0.06150689721107483, "rewards/margins": 0.5290161371231079, "rewards/rejected": -0.5905230045318604, "step": 92 }, { "epoch": 0.59, "learning_rate": 7.428560370317541e-07, "logits/chosen": -2.9216105937957764, "logits/rejected": -2.973964214324951, "loss": 0.4124, "policy_logps/chosen": -118.88021850585938, "policy_logps/rejected": -116.6799087524414, "referece_logps/chosen": -118.77764892578125, "referece_logps/rejected": -110.41334533691406, "rewards/accuracies": 0.9375, "rewards/chosen": -0.010256083682179451, "rewards/margins": 0.6163986921310425, "rewards/rejected": -0.6266547441482544, "step": 93 }, { "epoch": 0.6, "learning_rate": 7.228075054658095e-07, "logits/chosen": -2.914088249206543, "logits/rejected": -2.949735164642334, "loss": 0.5151, "policy_logps/chosen": -156.11376953125, "policy_logps/rejected": -136.4941864013672, "referece_logps/chosen": -154.32318115234375, "referece_logps/rejected": -129.20169067382812, "rewards/accuracies": 0.875, "rewards/chosen": -0.17905890941619873, "rewards/margins": 0.5501898527145386, "rewards/rejected": -0.7292487621307373, "step": 94 }, { "epoch": 0.61, "learning_rate": 7.028789546718325e-07, "logits/chosen": -2.9285836219787598, "logits/rejected": -2.9289567470550537, "loss": 0.46, "policy_logps/chosen": -138.6322021484375, "policy_logps/rejected": -140.56346130371094, "referece_logps/chosen": -137.0457763671875, "referece_logps/rejected": -134.56263732910156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15864019095897675, "rewards/margins": 0.44144219160079956, "rewards/rejected": -0.6000823974609375, "step": 95 }, { "epoch": 0.61, "learning_rate": 6.830790105792973e-07, "logits/chosen": -2.9379961490631104, "logits/rejected": -2.9724767208099365, "loss": 0.4291, "policy_logps/chosen": -101.97422790527344, "policy_logps/rejected": -120.30264282226562, "referece_logps/chosen": -102.00912475585938, "referece_logps/rejected": -112.79841613769531, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0034902598708868027, "rewards/margins": 0.7539129257202148, "rewards/rejected": -0.7504226565361023, "step": 96 }, { "epoch": 0.62, "learning_rate": 6.634162434511938e-07, "logits/chosen": -2.9047234058380127, "logits/rejected": -2.9031195640563965, "loss": 0.4965, "policy_logps/chosen": -143.34796142578125, "policy_logps/rejected": -137.45285034179688, "referece_logps/chosen": -140.2106170654297, "referece_logps/rejected": -128.766845703125, "rewards/accuracies": 0.875, "rewards/chosen": -0.3137343227863312, "rewards/margins": 0.5548651814460754, "rewards/rejected": -0.868599534034729, "step": 97 }, { "epoch": 0.63, "learning_rate": 6.43899164174453e-07, "logits/chosen": -2.9683728218078613, "logits/rejected": -2.9473183155059814, "loss": 0.4951, "policy_logps/chosen": -99.94713592529297, "policy_logps/rejected": -99.26773834228516, "referece_logps/chosen": -100.4857406616211, "referece_logps/rejected": -93.8021011352539, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05386023223400116, "rewards/margins": 0.6004235148429871, "rewards/rejected": -0.5465632677078247, "step": 98 }, { "epoch": 0.63, "learning_rate": 6.245362205760703e-07, "logits/chosen": -2.912815570831299, "logits/rejected": -2.941206216812134, "loss": 0.4346, "policy_logps/chosen": -149.0521240234375, "policy_logps/rejected": -140.37326049804688, "referece_logps/chosen": -146.72096252441406, "referece_logps/rejected": -131.8609161376953, "rewards/accuracies": 0.875, "rewards/chosen": -0.23311704397201538, "rewards/margins": 0.6181185245513916, "rewards/rejected": -0.8512355089187622, "step": 99 }, { "epoch": 0.64, "learning_rate": 6.053357937665236e-07, "logits/chosen": -2.9297289848327637, "logits/rejected": -2.9020700454711914, "loss": 0.4962, "policy_logps/chosen": -130.0217742919922, "policy_logps/rejected": -139.19395446777344, "referece_logps/chosen": -128.82655334472656, "referece_logps/rejected": -132.361572265625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11952205747365952, "rewards/margins": 0.563715934753418, "rewards/rejected": -0.6832380294799805, "step": 100 } ], "max_steps": 156, "num_train_epochs": 1, "total_flos": 0.0, "trial_name": null, "trial_params": null }