{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -0.505158543586731, "logits/rejected": 1.1344256401062012, "logps/chosen": -534.2272338867188, "logps/rejected": -995.0223388671875, "loss": 0.21, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.9607843137254904e-07, "logits/chosen": -1.4771511554718018, "logits/rejected": -0.7203052043914795, "logps/chosen": -653.9701538085938, "logps/rejected": -1290.11083984375, "loss": 0.2983, "rewards/accuracies": 0.3055555522441864, "rewards/chosen": -0.00023890436568763107, "rewards/margins": -0.0006189702544361353, "rewards/rejected": 0.00038006596150808036, "step": 10 }, { "epoch": 0.01, "learning_rate": 3.921568627450981e-07, "logits/chosen": -1.5881028175354004, "logits/rejected": -0.847257137298584, "logps/chosen": -677.5276489257812, "logps/rejected": -1343.302978515625, "loss": 0.34, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005764259840361774, "rewards/margins": 0.0008251671679317951, "rewards/rejected": -0.0002487411838956177, "step": 20 }, { "epoch": 0.01, "learning_rate": 5.882352941176471e-07, "logits/chosen": -1.5565259456634521, "logits/rejected": -0.9040892720222473, "logps/chosen": -587.6061401367188, "logps/rejected": -1259.46630859375, "loss": 0.3992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0015199712943285704, "rewards/margins": 0.002795459469780326, "rewards/rejected": -0.004315430298447609, "step": 30 }, { "epoch": 0.02, "learning_rate": 7.843137254901962e-07, "logits/chosen": -1.3543564081192017, "logits/rejected": -0.5594847798347473, "logps/chosen": -660.8809814453125, "logps/rejected": -1349.8839111328125, "loss": 0.3377, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.007950540632009506, "rewards/margins": 0.009673960506916046, "rewards/rejected": -0.017624501138925552, "step": 40 }, { "epoch": 0.02, "learning_rate": 9.80392156862745e-07, "logits/chosen": -1.4439340829849243, "logits/rejected": -0.9004542231559753, "logps/chosen": -625.8778076171875, "logps/rejected": -1303.6329345703125, "loss": 0.3665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01639598235487938, "rewards/margins": 0.029322799295186996, "rewards/rejected": -0.04571877792477608, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.1764705882352942e-06, "logits/chosen": -1.5793389081954956, "logits/rejected": -0.6903096437454224, "logps/chosen": -691.1597290039062, "logps/rejected": -1354.8695068359375, "loss": 0.3259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04962822049856186, "rewards/margins": 0.04500482603907585, "rewards/rejected": -0.0946330577135086, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.3725490196078434e-06, "logits/chosen": -1.2960580587387085, "logits/rejected": -0.5226901173591614, "logps/chosen": -677.5730590820312, "logps/rejected": -1611.273681640625, "loss": 0.2328, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09385339170694351, "rewards/margins": 0.11672432720661163, "rewards/rejected": -0.21057769656181335, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.5686274509803923e-06, "logits/chosen": -1.0945719480514526, "logits/rejected": -0.5267337560653687, "logps/chosen": -776.620849609375, "logps/rejected": -1658.595703125, "loss": 0.1974, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19967763125896454, "rewards/margins": 0.2444140613079071, "rewards/rejected": -0.44409170746803284, "step": 80 }, { "epoch": 0.04, "learning_rate": 1.7647058823529414e-06, "logits/chosen": -1.4510087966918945, "logits/rejected": -0.023749172687530518, "logps/chosen": -911.9953002929688, "logps/rejected": -1725.72265625, "loss": 0.2263, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2247885912656784, "rewards/margins": 0.1656641662120819, "rewards/rejected": -0.3904527723789215, "step": 90 }, { "epoch": 0.04, "learning_rate": 1.96078431372549e-06, "logits/chosen": -1.3619906902313232, "logits/rejected": -0.15897789597511292, "logps/chosen": -822.1832275390625, "logps/rejected": -1571.0025634765625, "loss": 0.2765, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16276445984840393, "rewards/margins": 0.13095514476299286, "rewards/rejected": -0.2937195897102356, "step": 100 }, { "epoch": 0.04, "learning_rate": 2.1568627450980393e-06, "logits/chosen": -1.1530998945236206, "logits/rejected": -0.40491175651550293, "logps/chosen": -854.7205200195312, "logps/rejected": -1822.2965087890625, "loss": 0.2099, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15811415016651154, "rewards/margins": 0.23178577423095703, "rewards/rejected": -0.3898999094963074, "step": 110 }, { "epoch": 0.05, "learning_rate": 2.3529411764705885e-06, "logits/chosen": -1.5039284229278564, "logits/rejected": -0.5590807199478149, "logps/chosen": -726.2496337890625, "logps/rejected": -1728.7135009765625, "loss": 0.2438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13852688670158386, "rewards/margins": 0.27724406123161316, "rewards/rejected": -0.415770947933197, "step": 120 }, { "epoch": 0.05, "learning_rate": 2.549019607843137e-06, "logits/chosen": -1.47978937625885, "logits/rejected": -0.7583194971084595, "logps/chosen": -777.576416015625, "logps/rejected": -1722.5013427734375, "loss": 0.169, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19047263264656067, "rewards/margins": 0.221228688955307, "rewards/rejected": -0.41170138120651245, "step": 130 }, { "epoch": 0.05, "learning_rate": 2.7450980392156867e-06, "logits/chosen": -1.3619725704193115, "logits/rejected": -0.45514482259750366, "logps/chosen": -859.7752685546875, "logps/rejected": -1761.045654296875, "loss": 0.2, "rewards/accuracies": 0.75, "rewards/chosen": -0.2766234278678894, "rewards/margins": 0.1959143877029419, "rewards/rejected": -0.4725378155708313, "step": 140 }, { "epoch": 0.06, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -1.2937225103378296, "logits/rejected": -0.18269118666648865, "logps/chosen": -1000.1593627929688, "logps/rejected": -1983.649169921875, "loss": 0.2365, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3481788635253906, "rewards/margins": 0.32042697072029114, "rewards/rejected": -0.6686058640480042, "step": 150 }, { "epoch": 0.06, "learning_rate": 3.1372549019607846e-06, "logits/chosen": -1.3573691844940186, "logits/rejected": -0.8902850151062012, "logps/chosen": -908.5567626953125, "logps/rejected": -1647.058349609375, "loss": 0.2157, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2951143980026245, "rewards/margins": 0.15847407281398773, "rewards/rejected": -0.45358848571777344, "step": 160 }, { "epoch": 0.07, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.657151222229004, "logits/rejected": -0.9709945917129517, "logps/chosen": -817.062744140625, "logps/rejected": -1781.638671875, "loss": 0.1573, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21228685975074768, "rewards/margins": 0.23603840172290802, "rewards/rejected": -0.4483252465724945, "step": 170 }, { "epoch": 0.07, "learning_rate": 3.529411764705883e-06, "logits/chosen": -1.4512741565704346, "logits/rejected": -0.1740313172340393, "logps/chosen": -889.5105590820312, "logps/rejected": -2051.538818359375, "loss": 0.2172, "rewards/accuracies": 0.875, "rewards/chosen": -0.25935059785842896, "rewards/margins": 0.3617965579032898, "rewards/rejected": -0.6211471557617188, "step": 180 }, { "epoch": 0.07, "learning_rate": 3.7254901960784316e-06, "logits/chosen": -1.1388862133026123, "logits/rejected": -0.18607623875141144, "logps/chosen": -821.3484497070312, "logps/rejected": -1919.426513671875, "loss": 0.1605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21933992207050323, "rewards/margins": 0.3276643455028534, "rewards/rejected": -0.547004222869873, "step": 190 }, { "epoch": 0.08, "learning_rate": 3.92156862745098e-06, "logits/chosen": -1.348503828048706, "logits/rejected": -0.5367448925971985, "logps/chosen": -625.4483032226562, "logps/rejected": -1496.484130859375, "loss": 0.2089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10934285074472427, "rewards/margins": 0.23809942603111267, "rewards/rejected": -0.34744226932525635, "step": 200 }, { "epoch": 0.08, "learning_rate": 4.11764705882353e-06, "logits/chosen": -1.3590004444122314, "logits/rejected": -0.8172636032104492, "logps/chosen": -815.4562377929688, "logps/rejected": -1780.2926025390625, "loss": 0.2143, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1335686445236206, "rewards/margins": 0.2463621348142624, "rewards/rejected": -0.3799307644367218, "step": 210 }, { "epoch": 0.09, "learning_rate": 4.313725490196079e-06, "logits/chosen": -1.5793583393096924, "logits/rejected": -0.32534486055374146, "logps/chosen": -928.5148315429688, "logps/rejected": -1738.1536865234375, "loss": 0.2501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20052528381347656, "rewards/margins": 0.21043157577514648, "rewards/rejected": -0.41095685958862305, "step": 220 }, { "epoch": 0.09, "learning_rate": 4.509803921568628e-06, "logits/chosen": -0.9972221255302429, "logits/rejected": -0.37468206882476807, "logps/chosen": -708.7088623046875, "logps/rejected": -1586.076416015625, "loss": 0.205, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14892444014549255, "rewards/margins": 0.24091584980487823, "rewards/rejected": -0.3898402750492096, "step": 230 }, { "epoch": 0.09, "learning_rate": 4.705882352941177e-06, "logits/chosen": -1.3743550777435303, "logits/rejected": -0.13277244567871094, "logps/chosen": -719.425537109375, "logps/rejected": -1675.90234375, "loss": 0.2009, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1083696037530899, "rewards/margins": 0.3259289562702179, "rewards/rejected": -0.4342985153198242, "step": 240 }, { "epoch": 0.1, "learning_rate": 4.901960784313726e-06, "logits/chosen": -1.1646835803985596, "logits/rejected": -0.5943381786346436, "logps/chosen": -621.46630859375, "logps/rejected": -1612.871826171875, "loss": 0.1643, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17562244832515717, "rewards/margins": 0.2645077705383301, "rewards/rejected": -0.44013017416000366, "step": 250 }, { "epoch": 0.1, "learning_rate": 4.999941442477777e-06, "logits/chosen": -1.2978475093841553, "logits/rejected": -0.576497495174408, "logps/chosen": -937.4520263671875, "logps/rejected": -1737.780029296875, "loss": 0.2432, "rewards/accuracies": 0.75, "rewards/chosen": -0.25836849212646484, "rewards/margins": 0.241961270570755, "rewards/rejected": -0.5003297924995422, "step": 260 }, { "epoch": 0.11, "learning_rate": 4.999472998758979e-06, "logits/chosen": -1.4330791234970093, "logits/rejected": -0.8838942646980286, "logps/chosen": -877.1728515625, "logps/rejected": -1793.1947021484375, "loss": 0.1393, "rewards/accuracies": 0.75, "rewards/chosen": -0.25408753752708435, "rewards/margins": 0.2761463522911072, "rewards/rejected": -0.5302339792251587, "step": 270 }, { "epoch": 0.11, "learning_rate": 4.998536199099246e-06, "logits/chosen": -1.3899977207183838, "logits/rejected": 0.03836112096905708, "logps/chosen": -923.8590087890625, "logps/rejected": -1724.1558837890625, "loss": 0.1851, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1945658177137375, "rewards/margins": 0.2358773946762085, "rewards/rejected": -0.4304431974887848, "step": 280 }, { "epoch": 0.11, "learning_rate": 4.997131219037856e-06, "logits/chosen": -1.186488389968872, "logits/rejected": -0.389091819524765, "logps/chosen": -757.4147338867188, "logps/rejected": -1886.984130859375, "loss": 0.1841, "rewards/accuracies": 0.875, "rewards/chosen": -0.21733467280864716, "rewards/margins": 0.3353338837623596, "rewards/rejected": -0.5526684522628784, "step": 290 }, { "epoch": 0.12, "learning_rate": 4.995258321842611e-06, "logits/chosen": -1.1964404582977295, "logits/rejected": -0.06750938296318054, "logps/chosen": -907.2109375, "logps/rejected": -1809.0191650390625, "loss": 0.1834, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2906198799610138, "rewards/margins": 0.279925137758255, "rewards/rejected": -0.5705450177192688, "step": 300 }, { "epoch": 0.12, "learning_rate": 4.9929178584605e-06, "logits/chosen": -1.649431586265564, "logits/rejected": -0.20804986357688904, "logps/chosen": -891.9801635742188, "logps/rejected": -1733.181884765625, "loss": 0.1278, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.22755351662635803, "rewards/margins": 0.2664097547531128, "rewards/rejected": -0.49396324157714844, "step": 310 }, { "epoch": 0.13, "learning_rate": 4.9901102674519446e-06, "logits/chosen": -1.4958832263946533, "logits/rejected": -0.3006078004837036, "logps/chosen": -951.6578369140625, "logps/rejected": -1706.25390625, "loss": 0.2295, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2715206742286682, "rewards/margins": 0.25203150510787964, "rewards/rejected": -0.5235521793365479, "step": 320 }, { "epoch": 0.13, "learning_rate": 4.986836074908616e-06, "logits/chosen": -1.3995481729507446, "logits/rejected": 0.009560632519423962, "logps/chosen": -718.5650634765625, "logps/rejected": -1350.846923828125, "loss": 0.2471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20702452957630157, "rewards/margins": 0.14036989212036133, "rewards/rejected": -0.3473944067955017, "step": 330 }, { "epoch": 0.13, "learning_rate": 4.983095894354858e-06, "logits/chosen": -1.5904731750488281, "logits/rejected": -0.14893893897533417, "logps/chosen": -855.9501953125, "logps/rejected": -1916.079345703125, "loss": 0.2131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24469470977783203, "rewards/margins": 0.22492530941963196, "rewards/rejected": -0.4696199893951416, "step": 340 }, { "epoch": 0.14, "learning_rate": 4.9788904266327206e-06, "logits/chosen": -1.6823375225067139, "logits/rejected": -0.4657576084136963, "logps/chosen": -784.65234375, "logps/rejected": -1751.244873046875, "loss": 0.1888, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1359667181968689, "rewards/margins": 0.29217660427093506, "rewards/rejected": -0.42814335227012634, "step": 350 }, { "epoch": 0.14, "learning_rate": 4.9742204597706386e-06, "logits/chosen": -1.5003750324249268, "logits/rejected": -0.001354557229205966, "logps/chosen": -755.9137573242188, "logps/rejected": -1653.0166015625, "loss": 0.1933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12234246730804443, "rewards/margins": 0.2765265107154846, "rewards/rejected": -0.39886897802352905, "step": 360 }, { "epoch": 0.15, "learning_rate": 4.9690868688357655e-06, "logits/chosen": -1.3799958229064941, "logits/rejected": -0.4311766028404236, "logps/chosen": -724.7586059570312, "logps/rejected": -1667.642822265625, "loss": 0.1828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1624334752559662, "rewards/margins": 0.2481921911239624, "rewards/rejected": -0.4106256365776062, "step": 370 }, { "epoch": 0.15, "learning_rate": 4.963490615770003e-06, "logits/chosen": -1.295836329460144, "logits/rejected": -0.5849100947380066, "logps/chosen": -835.3861083984375, "logps/rejected": -1846.414794921875, "loss": 0.2211, "rewards/accuracies": 0.875, "rewards/chosen": -0.23319277167320251, "rewards/margins": 0.3248142898082733, "rewards/rejected": -0.5580071210861206, "step": 380 }, { "epoch": 0.15, "learning_rate": 4.957432749209755e-06, "logits/chosen": -1.4312934875488281, "logits/rejected": 0.31627362966537476, "logps/chosen": -939.7803955078125, "logps/rejected": -1674.4808349609375, "loss": 0.2533, "rewards/accuracies": 0.75, "rewards/chosen": -0.2971717119216919, "rewards/margins": 0.19919905066490173, "rewards/rejected": -0.496370792388916, "step": 390 }, { "epoch": 0.16, "learning_rate": 4.950914404289423e-06, "logits/chosen": -1.3529198169708252, "logits/rejected": -0.19551090896129608, "logps/chosen": -940.6759643554688, "logps/rejected": -1822.1956787109375, "loss": 0.2262, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3031768500804901, "rewards/margins": 0.22445103526115417, "rewards/rejected": -0.5276279449462891, "step": 400 }, { "epoch": 0.16, "learning_rate": 4.943936802428712e-06, "logits/chosen": -1.1721961498260498, "logits/rejected": 0.37075644731521606, "logps/chosen": -702.531005859375, "logps/rejected": -1698.3720703125, "loss": 0.1711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18371441960334778, "rewards/margins": 0.269645094871521, "rewards/rejected": -0.4533595144748688, "step": 410 }, { "epoch": 0.16, "learning_rate": 4.936501251103751e-06, "logits/chosen": -1.1501245498657227, "logits/rejected": -0.04669635370373726, "logps/chosen": -934.7687377929688, "logps/rejected": -1762.8375244140625, "loss": 0.2049, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.27317720651626587, "rewards/margins": 0.24144259095191956, "rewards/rejected": -0.5146198272705078, "step": 420 }, { "epoch": 0.17, "learning_rate": 4.928609143602102e-06, "logits/chosen": -1.3455841541290283, "logits/rejected": -0.689312219619751, "logps/chosen": -953.3030395507812, "logps/rejected": -2143.519775390625, "loss": 0.1132, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3812543451786041, "rewards/margins": 0.47345447540283203, "rewards/rejected": -0.8547086715698242, "step": 430 }, { "epoch": 0.17, "learning_rate": 4.920261958761677e-06, "logits/chosen": -1.1954295635223389, "logits/rejected": 0.1524878442287445, "logps/chosen": -988.5673828125, "logps/rejected": -1907.625, "loss": 0.2181, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3235063850879669, "rewards/margins": 0.30497947335243225, "rewards/rejected": -0.6284858584403992, "step": 440 }, { "epoch": 0.18, "learning_rate": 4.911461260693639e-06, "logits/chosen": -1.384975552558899, "logits/rejected": -0.3957231938838959, "logps/chosen": -864.88623046875, "logps/rejected": -1796.1107177734375, "loss": 0.1692, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24604813754558563, "rewards/margins": 0.25146228075027466, "rewards/rejected": -0.4975104331970215, "step": 450 }, { "epoch": 0.18, "learning_rate": 4.902208698489302e-06, "logits/chosen": -1.0432078838348389, "logits/rejected": -0.16131794452667236, "logps/chosen": -885.232421875, "logps/rejected": -1651.9114990234375, "loss": 0.2494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22546634078025818, "rewards/margins": 0.1930330991744995, "rewards/rejected": -0.4184994697570801, "step": 460 }, { "epoch": 0.18, "learning_rate": 4.89250600591114e-06, "logits/chosen": -1.3176567554473877, "logits/rejected": -0.0033722042571753263, "logps/chosen": -723.5933837890625, "logps/rejected": -1598.0091552734375, "loss": 0.2398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15554097294807434, "rewards/margins": 0.26405271887779236, "rewards/rejected": -0.4195936620235443, "step": 470 }, { "epoch": 0.19, "learning_rate": 4.882355001067892e-06, "logits/chosen": -1.188307523727417, "logits/rejected": 0.14929169416427612, "logps/chosen": -815.7213134765625, "logps/rejected": -1634.1407470703125, "loss": 0.2558, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16603827476501465, "rewards/margins": 0.21612891554832458, "rewards/rejected": -0.38216716051101685, "step": 480 }, { "epoch": 0.19, "learning_rate": 4.871757586073897e-06, "logits/chosen": -1.3035974502563477, "logits/rejected": 0.26524829864501953, "logps/chosen": -763.2244262695312, "logps/rejected": -1522.682861328125, "loss": 0.2258, "rewards/accuracies": 0.75, "rewards/chosen": -0.1318461149930954, "rewards/margins": 0.23059546947479248, "rewards/rejected": -0.3624415993690491, "step": 490 }, { "epoch": 0.2, "learning_rate": 4.860715746692661e-06, "logits/chosen": -1.1487717628479004, "logits/rejected": 0.05942107364535332, "logps/chosen": -886.2254638671875, "logps/rejected": -1841.0814208984375, "loss": 0.1885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18671779334545135, "rewards/margins": 0.27246180176734924, "rewards/rejected": -0.4591795802116394, "step": 500 }, { "epoch": 0.2, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.2474385499954224, "logits/rejected": -0.14498676359653473, "logps/chosen": -778.3880615234375, "logps/rejected": -1752.142333984375, "loss": 0.1754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18546968698501587, "rewards/margins": 0.3222576379776001, "rewards/rejected": -0.5077272653579712, "step": 510 }, { "epoch": 0.2, "learning_rate": 4.837307153820184e-06, "logits/chosen": -1.1251775026321411, "logits/rejected": 0.15637345612049103, "logps/chosen": -924.3635864257812, "logps/rejected": -2070.327392578125, "loss": 0.1343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3112488090991974, "rewards/margins": 0.38895484805107117, "rewards/rejected": -0.7002035975456238, "step": 520 }, { "epoch": 0.21, "learning_rate": 4.824944786675003e-06, "logits/chosen": -1.3947086334228516, "logits/rejected": 0.045419882982969284, "logps/chosen": -856.5111083984375, "logps/rejected": -1587.355712890625, "loss": 0.1704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2732272148132324, "rewards/margins": 0.24021320044994354, "rewards/rejected": -0.5134404301643372, "step": 530 }, { "epoch": 0.21, "learning_rate": 4.81214676701278e-06, "logits/chosen": -1.2445639371871948, "logits/rejected": 0.1435929536819458, "logps/chosen": -935.2590942382812, "logps/rejected": -1872.558349609375, "loss": 0.1603, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2728428244590759, "rewards/margins": 0.303517609834671, "rewards/rejected": -0.5763604044914246, "step": 540 }, { "epoch": 0.22, "learning_rate": 4.798915492950456e-06, "logits/chosen": -1.3926843404769897, "logits/rejected": -0.8224552273750305, "logps/chosen": -930.3948364257812, "logps/rejected": -1831.987060546875, "loss": 0.2094, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.22408756613731384, "rewards/margins": 0.306917279958725, "rewards/rejected": -0.5310048460960388, "step": 550 }, { "epoch": 0.22, "learning_rate": 4.785253443788997e-06, "logits/chosen": -1.452789306640625, "logits/rejected": -0.08553876727819443, "logps/chosen": -834.9271240234375, "logps/rejected": -1715.3486328125, "loss": 0.2066, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1931687295436859, "rewards/margins": 0.24109697341918945, "rewards/rejected": -0.43426570296287537, "step": 560 }, { "epoch": 0.22, "learning_rate": 4.771163179548809e-06, "logits/chosen": -1.2075916528701782, "logits/rejected": -0.4084923267364502, "logps/chosen": -895.1989135742188, "logps/rejected": -1892.2545166015625, "loss": 0.1562, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.27637767791748047, "rewards/margins": 0.3559107184410095, "rewards/rejected": -0.6322883367538452, "step": 570 }, { "epoch": 0.23, "learning_rate": 4.75664734049005e-06, "logits/chosen": -1.4612247943878174, "logits/rejected": -0.4000505805015564, "logps/chosen": -838.0030517578125, "logps/rejected": -1836.1865234375, "loss": 0.1866, "rewards/accuracies": 0.75, "rewards/chosen": -0.27399009466171265, "rewards/margins": 0.3403601050376892, "rewards/rejected": -0.6143501996994019, "step": 580 }, { "epoch": 0.23, "learning_rate": 4.741708646617879e-06, "logits/chosen": -1.4533047676086426, "logits/rejected": -0.44210928678512573, "logps/chosen": -826.44921875, "logps/rejected": -1627.9482421875, "loss": 0.1568, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18495787680149078, "rewards/margins": 0.24494799971580505, "rewards/rejected": -0.42990580201148987, "step": 590 }, { "epoch": 0.24, "learning_rate": 4.726349897172791e-06, "logits/chosen": -1.2161755561828613, "logits/rejected": -0.4458787441253662, "logps/chosen": -677.1725463867188, "logps/rejected": -1372.3172607421875, "loss": 0.2348, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11152330785989761, "rewards/margins": 0.17953188717365265, "rewards/rejected": -0.29105520248413086, "step": 600 }, { "epoch": 0.24, "learning_rate": 4.710573970106076e-06, "logits/chosen": -1.2787022590637207, "logits/rejected": -0.5003519654273987, "logps/chosen": -937.7862548828125, "logps/rejected": -1879.7720947265625, "loss": 0.2216, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.27286097407341003, "rewards/margins": 0.2761802077293396, "rewards/rejected": -0.5490411520004272, "step": 610 }, { "epoch": 0.24, "learning_rate": 4.694383821540554e-06, "logits/chosen": -1.4234240055084229, "logits/rejected": -0.529420793056488, "logps/chosen": -879.75830078125, "logps/rejected": -1886.7099609375, "loss": 0.1449, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.23544082045555115, "rewards/margins": 0.3297392725944519, "rewards/rejected": -0.5651801824569702, "step": 620 }, { "epoch": 0.25, "learning_rate": 4.677782485216644e-06, "logits/chosen": -1.5074328184127808, "logits/rejected": 0.13324348628520966, "logps/chosen": -894.3519287109375, "logps/rejected": -1656.986328125, "loss": 0.2306, "rewards/accuracies": 0.75, "rewards/chosen": -0.2585051953792572, "rewards/margins": 0.17639592289924622, "rewards/rejected": -0.4349011480808258, "step": 630 }, { "epoch": 0.25, "learning_rate": 4.660773071923901e-06, "logits/chosen": -1.254246473312378, "logits/rejected": -0.4503572881221771, "logps/chosen": -743.8980712890625, "logps/rejected": -1586.318603515625, "loss": 0.2306, "rewards/accuracies": 0.625, "rewards/chosen": -0.20321564376354218, "rewards/margins": 0.28816673159599304, "rewards/rejected": -0.49138230085372925, "step": 640 }, { "epoch": 0.25, "learning_rate": 4.643358768918106e-06, "logits/chosen": -1.2100741863250732, "logits/rejected": -0.6602537631988525, "logps/chosen": -866.4385986328125, "logps/rejected": -1698.565185546875, "loss": 0.2341, "rewards/accuracies": 0.75, "rewards/chosen": -0.24518051743507385, "rewards/margins": 0.21809275448322296, "rewards/rejected": -0.46327322721481323, "step": 650 }, { "epoch": 0.26, "learning_rate": 4.625542839324036e-06, "logits/chosen": -1.2801318168640137, "logits/rejected": -0.20570655167102814, "logps/chosen": -696.7703857421875, "logps/rejected": -1810.203857421875, "loss": 0.1443, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1753673553466797, "rewards/margins": 0.33784395456314087, "rewards/rejected": -0.5132113099098206, "step": 660 }, { "epoch": 0.26, "learning_rate": 4.6073286215240105e-06, "logits/chosen": -1.573704719543457, "logits/rejected": -0.5480459928512573, "logps/chosen": -698.1392211914062, "logps/rejected": -1760.8834228515625, "loss": 3.1142, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11380796134471893, "rewards/margins": 0.48678064346313477, "rewards/rejected": -0.6005885601043701, "step": 670 }, { "epoch": 0.27, "learning_rate": 4.588719528532342e-06, "logits/chosen": -1.5839512348175049, "logits/rejected": -0.7513319253921509, "logps/chosen": -620.78955078125, "logps/rejected": -1401.8199462890625, "loss": 0.2422, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02382536605000496, "rewards/margins": 0.09586119651794434, "rewards/rejected": -0.119686558842659, "step": 680 }, { "epoch": 0.27, "learning_rate": 4.569719047355795e-06, "logits/chosen": -1.5924733877182007, "logits/rejected": -0.816574215888977, "logps/chosen": -557.929931640625, "logps/rejected": -1159.7681884765625, "loss": 0.292, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.010283837094902992, "rewards/margins": 0.04927302524447441, "rewards/rejected": -0.059556860476732254, "step": 690 }, { "epoch": 0.27, "learning_rate": 4.550330738340189e-06, "logits/chosen": -1.4926470518112183, "logits/rejected": -0.8066496849060059, "logps/chosen": -669.9822387695312, "logps/rejected": -1387.30419921875, "loss": 0.2635, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06443636119365692, "rewards/margins": 0.10144983232021332, "rewards/rejected": -0.16588619351387024, "step": 700 }, { "epoch": 0.28, "learning_rate": 4.530558234503252e-06, "logits/chosen": -1.504148244857788, "logits/rejected": -0.710750937461853, "logps/chosen": -563.5753173828125, "logps/rejected": -1385.9373779296875, "loss": 0.1933, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02186558023095131, "rewards/margins": 0.15772321820259094, "rewards/rejected": -0.17958880960941315, "step": 710 }, { "epoch": 0.28, "learning_rate": 4.5104052408538545e-06, "logits/chosen": -1.3532848358154297, "logits/rejected": -0.17277280986309052, "logps/chosen": -665.1290893554688, "logps/rejected": -1473.070068359375, "loss": 0.2188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0683923214673996, "rewards/margins": 0.23080816864967346, "rewards/rejected": -0.29920047521591187, "step": 720 }, { "epoch": 0.29, "learning_rate": 4.489875533697767e-06, "logits/chosen": -1.2411041259765625, "logits/rejected": -0.6769916415214539, "logps/chosen": -796.1107177734375, "logps/rejected": -1779.8375244140625, "loss": 0.2014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11370061337947845, "rewards/margins": 0.2886132001876831, "rewards/rejected": -0.40231385827064514, "step": 730 }, { "epoch": 0.29, "learning_rate": 4.468972959930043e-06, "logits/chosen": -1.4062107801437378, "logits/rejected": -0.11251994222402573, "logps/chosen": -810.1907958984375, "logps/rejected": -1755.5439453125, "loss": 0.207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.138728067278862, "rewards/margins": 0.25112494826316833, "rewards/rejected": -0.38985303044319153, "step": 740 }, { "epoch": 0.29, "learning_rate": 4.447701436314176e-06, "logits/chosen": -1.1295002698898315, "logits/rejected": -0.491716206073761, "logps/chosen": -665.5704345703125, "logps/rejected": -1605.226318359375, "loss": 0.2432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09421003609895706, "rewards/margins": 0.22589227557182312, "rewards/rejected": -0.3201023042201996, "step": 750 }, { "epoch": 0.3, "learning_rate": 4.4260649487481835e-06, "logits/chosen": -1.3528281450271606, "logits/rejected": -0.8653984069824219, "logps/chosen": -560.7476806640625, "logps/rejected": -1564.6998291015625, "loss": 0.1747, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06951048225164413, "rewards/margins": 0.29366621375083923, "rewards/rejected": -0.36317676305770874, "step": 760 }, { "epoch": 0.3, "learning_rate": 4.404067551517704e-06, "logits/chosen": -1.496765375137329, "logits/rejected": -0.7339566349983215, "logps/chosen": -559.6861572265625, "logps/rejected": -1561.775634765625, "loss": 0.1495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0916418582201004, "rewards/margins": 0.28826963901519775, "rewards/rejected": -0.37991148233413696, "step": 770 }, { "epoch": 0.31, "learning_rate": 4.381713366536312e-06, "logits/chosen": -1.2229559421539307, "logits/rejected": -0.3822958469390869, "logps/chosen": -795.2717895507812, "logps/rejected": -1646.298583984375, "loss": 0.2385, "rewards/accuracies": 0.75, "rewards/chosen": -0.16480056941509247, "rewards/margins": 0.25636088848114014, "rewards/rejected": -0.4211614727973938, "step": 780 }, { "epoch": 0.31, "learning_rate": 4.359006582573138e-06, "logits/chosen": -1.3127458095550537, "logits/rejected": -0.6002156138420105, "logps/chosen": -731.8434448242188, "logps/rejected": -1655.339599609375, "loss": 0.2386, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1232454776763916, "rewards/margins": 0.260085791349411, "rewards/rejected": -0.3833312392234802, "step": 790 }, { "epoch": 0.31, "learning_rate": 4.335951454467971e-06, "logits/chosen": -1.4491212368011475, "logits/rejected": -0.4968988299369812, "logps/chosen": -708.8034057617188, "logps/rejected": -1689.820068359375, "loss": 0.1514, "rewards/accuracies": 0.75, "rewards/chosen": -0.11892116069793701, "rewards/margins": 0.311443567276001, "rewards/rejected": -0.4303646981716156, "step": 800 }, { "epoch": 0.32, "learning_rate": 4.3125523023339825e-06, "logits/chosen": -1.532845377922058, "logits/rejected": -0.5454439520835876, "logps/chosen": -708.6060791015625, "logps/rejected": -1473.7392578125, "loss": 0.2365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0950343906879425, "rewards/margins": 0.2247859686613083, "rewards/rejected": -0.3198204040527344, "step": 810 }, { "epoch": 0.32, "learning_rate": 4.288813510748207e-06, "logits/chosen": -1.3746122121810913, "logits/rejected": -0.3929213881492615, "logps/chosen": -709.5933837890625, "logps/rejected": -1493.141357421875, "loss": 0.1891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05677127093076706, "rewards/margins": 0.22434012591838837, "rewards/rejected": -0.28111138939857483, "step": 820 }, { "epoch": 0.33, "learning_rate": 4.264739527929959e-06, "logits/chosen": -1.6062724590301514, "logits/rejected": -0.8062151074409485, "logps/chosen": -672.033447265625, "logps/rejected": -1605.8253173828125, "loss": 0.2076, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04862620308995247, "rewards/margins": 0.277982234954834, "rewards/rejected": -0.32660841941833496, "step": 830 }, { "epoch": 0.33, "learning_rate": 4.240334864907317e-06, "logits/chosen": -1.429529070854187, "logits/rejected": -0.1541730761528015, "logps/chosen": -751.5721435546875, "logps/rejected": -1614.796875, "loss": 0.1689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07653092592954636, "rewards/margins": 0.23011043667793274, "rewards/rejected": -0.3066413402557373, "step": 840 }, { "epoch": 0.33, "learning_rate": 4.215604094671835e-06, "logits/chosen": -1.4942229986190796, "logits/rejected": -0.5664646029472351, "logps/chosen": -683.9749755859375, "logps/rejected": -1751.165771484375, "loss": 0.1305, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07548153400421143, "rewards/margins": 0.3088182806968689, "rewards/rejected": -0.3842998147010803, "step": 850 }, { "epoch": 0.34, "learning_rate": 4.190551851321647e-06, "logits/chosen": -1.5068459510803223, "logits/rejected": -0.3654994070529938, "logps/chosen": -753.12060546875, "logps/rejected": -1841.876220703125, "loss": 0.1256, "rewards/accuracies": 0.875, "rewards/chosen": -0.10451909154653549, "rewards/margins": 0.35891246795654297, "rewards/rejected": -0.46343153715133667, "step": 860 }, { "epoch": 0.34, "learning_rate": 4.165182829193126e-06, "logits/chosen": -1.4504587650299072, "logits/rejected": 0.0904449075460434, "logps/chosen": -773.3833618164062, "logps/rejected": -1582.4493408203125, "loss": 0.2156, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06579799205064774, "rewards/margins": 0.241295725107193, "rewards/rejected": -0.3070937395095825, "step": 870 }, { "epoch": 0.35, "learning_rate": 4.139501781981245e-06, "logits/chosen": -1.5094424486160278, "logits/rejected": -0.5480602383613586, "logps/chosen": -672.755126953125, "logps/rejected": -1651.116943359375, "loss": 0.1111, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07722672820091248, "rewards/margins": 0.26155346632003784, "rewards/rejected": -0.33878016471862793, "step": 880 }, { "epoch": 0.35, "learning_rate": 4.113513521848821e-06, "logits/chosen": -1.594499111175537, "logits/rejected": -0.5706368684768677, "logps/chosen": -772.4927978515625, "logps/rejected": -1745.507080078125, "loss": 0.1475, "rewards/accuracies": 0.875, "rewards/chosen": -0.10902180522680283, "rewards/margins": 0.3192656934261322, "rewards/rejected": -0.4282875061035156, "step": 890 }, { "epoch": 0.35, "learning_rate": 4.087222918524807e-06, "logits/chosen": -1.297629952430725, "logits/rejected": -0.6775213479995728, "logps/chosen": -705.9368896484375, "logps/rejected": -1540.4775390625, "loss": 0.2268, "rewards/accuracies": 0.75, "rewards/chosen": -0.1348380744457245, "rewards/margins": 0.2274044305086136, "rewards/rejected": -0.36224251985549927, "step": 900 }, { "epoch": 0.36, "learning_rate": 4.0606348983917924e-06, "logits/chosen": -1.3503175973892212, "logits/rejected": -0.9185010194778442, "logps/chosen": -610.7164306640625, "logps/rejected": -1734.915771484375, "loss": 0.1352, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10167312622070312, "rewards/margins": 0.36634570360183716, "rewards/rejected": -0.4680188298225403, "step": 910 }, { "epoch": 0.36, "learning_rate": 4.03375444356288e-06, "logits/chosen": -1.4071118831634521, "logits/rejected": -0.8690752983093262, "logps/chosen": -836.24169921875, "logps/rejected": -1863.6539306640625, "loss": 0.2307, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1224076971411705, "rewards/margins": 0.3151033818721771, "rewards/rejected": -0.43751105666160583, "step": 920 }, { "epoch": 0.36, "learning_rate": 4.006586590948141e-06, "logits/chosen": -1.3949382305145264, "logits/rejected": -0.680055558681488, "logps/chosen": -666.8121948242188, "logps/rejected": -1796.386474609375, "loss": 0.199, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07509482651948929, "rewards/margins": 0.31507402658462524, "rewards/rejected": -0.39016884565353394, "step": 930 }, { "epoch": 0.37, "learning_rate": 3.979136431310781e-06, "logits/chosen": -1.4007041454315186, "logits/rejected": -0.44923824071884155, "logps/chosen": -629.3880615234375, "logps/rejected": -1281.4881591796875, "loss": 0.27, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06840632110834122, "rewards/margins": 0.14729034900665283, "rewards/rejected": -0.21569669246673584, "step": 940 }, { "epoch": 0.37, "learning_rate": 3.951409108313223e-06, "logits/chosen": -1.3141412734985352, "logits/rejected": -0.3359532654285431, "logps/chosen": -682.4598999023438, "logps/rejected": -1479.57763671875, "loss": 0.2002, "rewards/accuracies": 0.75, "rewards/chosen": -0.06678664684295654, "rewards/margins": 0.18387752771377563, "rewards/rejected": -0.2506641745567322, "step": 950 }, { "epoch": 0.38, "learning_rate": 3.923409817553284e-06, "logits/chosen": -1.26377534866333, "logits/rejected": -0.5578689575195312, "logps/chosen": -753.383056640625, "logps/rejected": -1470.470458984375, "loss": 0.1909, "rewards/accuracies": 0.75, "rewards/chosen": -0.08720171451568604, "rewards/margins": 0.24566006660461426, "rewards/rejected": -0.3328618109226227, "step": 960 }, { "epoch": 0.38, "learning_rate": 3.895143805590609e-06, "logits/chosen": -1.5301742553710938, "logits/rejected": -0.33912280201911926, "logps/chosen": -788.5135498046875, "logps/rejected": -1906.780029296875, "loss": 0.2098, "rewards/accuracies": 0.875, "rewards/chosen": -0.10219583660364151, "rewards/margins": 0.3592928946018219, "rewards/rejected": -0.4614887833595276, "step": 970 }, { "epoch": 0.38, "learning_rate": 3.8666163689635614e-06, "logits/chosen": -1.4293967485427856, "logits/rejected": -0.766064465045929, "logps/chosen": -697.79443359375, "logps/rejected": -1692.085693359375, "loss": 0.2074, "rewards/accuracies": 0.75, "rewards/chosen": -0.10171394050121307, "rewards/margins": 0.3115464448928833, "rewards/rejected": -0.41326045989990234, "step": 980 }, { "epoch": 0.39, "learning_rate": 3.837832853196751e-06, "logits/chosen": -1.4031484127044678, "logits/rejected": -0.46277111768722534, "logps/chosen": -741.0556030273438, "logps/rejected": -1712.839111328125, "loss": 0.1786, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10311299562454224, "rewards/margins": 0.2798925042152405, "rewards/rejected": -0.3830054700374603, "step": 990 }, { "epoch": 0.39, "learning_rate": 3.808798651799377e-06, "logits/chosen": -1.4064973592758179, "logits/rejected": -0.5826825499534607, "logps/chosen": -687.228271484375, "logps/rejected": -1728.9072265625, "loss": 0.1515, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10456766188144684, "rewards/margins": 0.31050539016723633, "rewards/rejected": -0.4150730073451996, "step": 1000 }, { "epoch": 0.4, "learning_rate": 3.7795192052545805e-06, "logits/chosen": -1.3606574535369873, "logits/rejected": -0.26507607102394104, "logps/chosen": -657.6034545898438, "logps/rejected": -1711.599365234375, "loss": 0.1027, "rewards/accuracies": 0.875, "rewards/chosen": -0.11261602491140366, "rewards/margins": 0.3513622283935547, "rewards/rejected": -0.46397823095321655, "step": 1010 }, { "epoch": 0.4, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.3821978569030762, "logits/rejected": -0.846422016620636, "logps/chosen": -700.0028076171875, "logps/rejected": -1700.7777099609375, "loss": 0.188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1201629489660263, "rewards/margins": 0.2851884663105011, "rewards/rejected": -0.4053514003753662, "step": 1020 }, { "epoch": 0.4, "learning_rate": 3.7202465673997123e-06, "logits/chosen": -1.327423334121704, "logits/rejected": -0.4249703884124756, "logps/chosen": -733.533935546875, "logps/rejected": -1811.9857177734375, "loss": 0.2335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15838567912578583, "rewards/margins": 0.3160237669944763, "rewards/rejected": -0.47440940141677856, "step": 1030 }, { "epoch": 0.41, "learning_rate": 3.6902644827077504e-06, "logits/chosen": -1.163883924484253, "logits/rejected": -0.564578652381897, "logps/chosen": -714.31591796875, "logps/rejected": -1658.974609375, "loss": 0.204, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1494341641664505, "rewards/margins": 0.2751534581184387, "rewards/rejected": -0.4245876669883728, "step": 1040 }, { "epoch": 0.41, "learning_rate": 3.660059364023409e-06, "logits/chosen": -1.1056033372879028, "logits/rejected": -0.6749047040939331, "logps/chosen": -836.0635986328125, "logps/rejected": -1795.9320068359375, "loss": 0.1381, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13029779493808746, "rewards/margins": 0.3451148271560669, "rewards/rejected": -0.47541260719299316, "step": 1050 }, { "epoch": 0.42, "learning_rate": 3.6296368712385084e-06, "logits/chosen": -1.2282450199127197, "logits/rejected": 0.033928144723176956, "logps/chosen": -668.1098022460938, "logps/rejected": -1750.6011962890625, "loss": 0.187, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12075567245483398, "rewards/margins": 0.3676701486110687, "rewards/rejected": -0.4884257912635803, "step": 1060 }, { "epoch": 0.42, "learning_rate": 3.599002704976835e-06, "logits/chosen": -1.513203501701355, "logits/rejected": -0.3770269453525543, "logps/chosen": -774.125244140625, "logps/rejected": -1470.924072265625, "loss": 0.2331, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0825313851237297, "rewards/margins": 0.2151786983013153, "rewards/rejected": -0.297710120677948, "step": 1070 }, { "epoch": 0.42, "learning_rate": 3.5681626055259526e-06, "logits/chosen": -1.351539134979248, "logits/rejected": 0.01821332611143589, "logps/chosen": -615.5689086914062, "logps/rejected": -1394.30859375, "loss": 0.1882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04903438687324524, "rewards/margins": 0.17228753864765167, "rewards/rejected": -0.22132191061973572, "step": 1080 }, { "epoch": 0.43, "learning_rate": 3.5371223517615684e-06, "logits/chosen": -1.1955583095550537, "logits/rejected": -0.7964296340942383, "logps/chosen": -650.0599365234375, "logps/rejected": -1640.6591796875, "loss": 0.1666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.061278946697711945, "rewards/margins": 0.23827362060546875, "rewards/rejected": -0.2995525896549225, "step": 1090 }, { "epoch": 0.43, "learning_rate": 3.5058877600646814e-06, "logits/chosen": -1.5846580266952515, "logits/rejected": -0.4390091896057129, "logps/chosen": -774.6456298828125, "logps/rejected": -1672.4420166015625, "loss": 0.1899, "rewards/accuracies": 0.75, "rewards/chosen": -0.09766945987939835, "rewards/margins": 0.26369303464889526, "rewards/rejected": -0.3613625466823578, "step": 1100 }, { "epoch": 0.44, "learning_rate": 3.4744646832316985e-06, "logits/chosen": -1.1662776470184326, "logits/rejected": -0.2102310210466385, "logps/chosen": -793.6665649414062, "logps/rejected": -1921.721923828125, "loss": 0.1516, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.128018319606781, "rewards/margins": 0.35574427247047424, "rewards/rejected": -0.48376256227493286, "step": 1110 }, { "epoch": 0.44, "learning_rate": 3.442859009377724e-06, "logits/chosen": -1.2999095916748047, "logits/rejected": -0.5450000762939453, "logps/chosen": -756.6891479492188, "logps/rejected": -1727.3140869140625, "loss": 0.2095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12410111725330353, "rewards/margins": 0.2954896092414856, "rewards/rejected": -0.41959071159362793, "step": 1120 }, { "epoch": 0.44, "learning_rate": 3.4110766608332347e-06, "logits/chosen": -1.3748492002487183, "logits/rejected": -0.4282529950141907, "logps/chosen": -715.91064453125, "logps/rejected": -1581.970703125, "loss": 0.2029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10205087810754776, "rewards/margins": 0.2136324942111969, "rewards/rejected": -0.31568339467048645, "step": 1130 }, { "epoch": 0.45, "learning_rate": 3.379123593034342e-06, "logits/chosen": -1.4860260486602783, "logits/rejected": -0.33013448119163513, "logps/chosen": -715.021240234375, "logps/rejected": -1671.137939453125, "loss": 0.1657, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0988413542509079, "rewards/margins": 0.2489662617444992, "rewards/rejected": -0.3478075861930847, "step": 1140 }, { "epoch": 0.45, "learning_rate": 3.3470057934068533e-06, "logits/chosen": -1.4496772289276123, "logits/rejected": -0.6596914529800415, "logps/chosen": -673.6126098632812, "logps/rejected": -1665.568603515625, "loss": 0.1832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08717682957649231, "rewards/margins": 0.2865816652774811, "rewards/rejected": -0.3737585246562958, "step": 1150 }, { "epoch": 0.45, "learning_rate": 3.314729280244332e-06, "logits/chosen": -1.5033951997756958, "logits/rejected": -0.4424918591976166, "logps/chosen": -715.0887451171875, "logps/rejected": -1384.922119140625, "loss": 0.2064, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12197653949260712, "rewards/margins": 0.2576510012149811, "rewards/rejected": -0.3796275556087494, "step": 1160 }, { "epoch": 0.46, "learning_rate": 3.2823001015803863e-06, "logits/chosen": -1.3551867008209229, "logits/rejected": -0.6100107431411743, "logps/chosen": -750.599853515625, "logps/rejected": -1853.4273681640625, "loss": 0.1589, "rewards/accuracies": 0.875, "rewards/chosen": -0.09711600840091705, "rewards/margins": 0.3520352840423584, "rewards/rejected": -0.44915127754211426, "step": 1170 }, { "epoch": 0.46, "learning_rate": 3.2497243340553675e-06, "logits/chosen": -1.0115400552749634, "logits/rejected": -0.17798957228660583, "logps/chosen": -745.58984375, "logps/rejected": -1906.7685546875, "loss": 0.2539, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14974217116832733, "rewards/margins": 0.3422713875770569, "rewards/rejected": -0.4920136332511902, "step": 1180 }, { "epoch": 0.47, "learning_rate": 3.217008081777726e-06, "logits/chosen": -1.1727737188339233, "logits/rejected": -0.37460917234420776, "logps/chosen": -709.9483642578125, "logps/rejected": -1686.753173828125, "loss": 0.1683, "rewards/accuracies": 0.875, "rewards/chosen": -0.10941555351018906, "rewards/margins": 0.2822516858577728, "rewards/rejected": -0.3916672468185425, "step": 1190 }, { "epoch": 0.47, "learning_rate": 3.184157475180208e-06, "logits/chosen": -1.3031466007232666, "logits/rejected": -0.5970622301101685, "logps/chosen": -697.8651123046875, "logps/rejected": -1595.6754150390625, "loss": 0.2328, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10812550783157349, "rewards/margins": 0.23647412657737732, "rewards/rejected": -0.3445996046066284, "step": 1200 }, { "epoch": 0.47, "learning_rate": 3.1511786698711226e-06, "logits/chosen": -1.3314238786697388, "logits/rejected": 0.48418712615966797, "logps/chosen": -731.9833984375, "logps/rejected": -1517.853271484375, "loss": 0.2287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12708911299705505, "rewards/margins": 0.23969343304634094, "rewards/rejected": -0.3667825162410736, "step": 1210 }, { "epoch": 0.48, "learning_rate": 3.1180778454808973e-06, "logits/chosen": -1.289541244506836, "logits/rejected": -0.4609376788139343, "logps/chosen": -746.2857666015625, "logps/rejected": -1523.1092529296875, "loss": 0.1886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09962339699268341, "rewards/margins": 0.28612619638442993, "rewards/rejected": -0.38574957847595215, "step": 1220 }, { "epoch": 0.48, "learning_rate": 3.084861204504122e-06, "logits/chosen": -1.0148189067840576, "logits/rejected": -0.48453038930892944, "logps/chosen": -778.4666748046875, "logps/rejected": -1931.9429931640625, "loss": 0.1131, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08722411096096039, "rewards/margins": 0.36971360445022583, "rewards/rejected": -0.4569377303123474, "step": 1230 }, { "epoch": 0.49, "learning_rate": 3.051534971137315e-06, "logits/chosen": -1.2210582494735718, "logits/rejected": -0.43022990226745605, "logps/chosen": -752.8408813476562, "logps/rejected": -1476.504638671875, "loss": 0.2269, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09663649648427963, "rewards/margins": 0.20909292995929718, "rewards/rejected": -0.3057294487953186, "step": 1240 }, { "epoch": 0.49, "learning_rate": 3.0181053901126243e-06, "logits/chosen": -1.1169403791427612, "logits/rejected": 0.2767347991466522, "logps/chosen": -749.15673828125, "logps/rejected": -1505.369140625, "loss": 0.1992, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09570419043302536, "rewards/margins": 0.19496819376945496, "rewards/rejected": -0.2906723916530609, "step": 1250 }, { "epoch": 0.49, "learning_rate": 2.9845787255276753e-06, "logits/chosen": -1.5088775157928467, "logits/rejected": -0.9695127606391907, "logps/chosen": -588.0244750976562, "logps/rejected": -1467.4212646484375, "loss": 0.1487, "rewards/accuracies": 0.75, "rewards/chosen": -0.04604783654212952, "rewards/margins": 0.28641366958618164, "rewards/rejected": -0.33246147632598877, "step": 1260 }, { "epoch": 0.5, "learning_rate": 2.950961259671793e-06, "logits/chosen": -1.50933837890625, "logits/rejected": -0.6869689226150513, "logps/chosen": -710.8389892578125, "logps/rejected": -1601.9041748046875, "loss": 0.2032, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0675523430109024, "rewards/margins": 0.27207452058792114, "rewards/rejected": -0.33962687849998474, "step": 1270 }, { "epoch": 0.5, "learning_rate": 2.917259291848814e-06, "logits/chosen": -1.4775984287261963, "logits/rejected": -0.3601114749908447, "logps/chosen": -680.5808715820312, "logps/rejected": -1640.981689453125, "loss": 0.2072, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06015778332948685, "rewards/margins": 0.2646317481994629, "rewards/rejected": -0.32478955388069153, "step": 1280 }, { "epoch": 0.51, "learning_rate": 2.883479137196714e-06, "logits/chosen": -1.826909065246582, "logits/rejected": -0.6638845801353455, "logps/chosen": -696.27734375, "logps/rejected": -1482.72119140625, "loss": 0.184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05563250929117203, "rewards/margins": 0.2452922761440277, "rewards/rejected": -0.30092480778694153, "step": 1290 }, { "epoch": 0.51, "learning_rate": 2.849627125504262e-06, "logits/chosen": -1.374955415725708, "logits/rejected": -0.20216119289398193, "logps/chosen": -578.1390380859375, "logps/rejected": -1516.6820068359375, "loss": 0.1879, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0654110461473465, "rewards/margins": 0.27813297510147095, "rewards/rejected": -0.34354403614997864, "step": 1300 }, { "epoch": 0.51, "learning_rate": 2.8157096000249334e-06, "logits/chosen": -1.5065643787384033, "logits/rejected": -0.7829849123954773, "logps/chosen": -630.3825073242188, "logps/rejected": -1537.6138916015625, "loss": 0.2042, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.061698682606220245, "rewards/margins": 0.2721042037010193, "rewards/rejected": -0.33380287885665894, "step": 1310 }, { "epoch": 0.52, "learning_rate": 2.7817329162883033e-06, "logits/chosen": -1.471840500831604, "logits/rejected": -0.21121864020824432, "logps/chosen": -743.4503173828125, "logps/rejected": -1590.5904541015625, "loss": 0.1418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.055032916367053986, "rewards/margins": 0.24513819813728333, "rewards/rejected": -0.3001710772514343, "step": 1320 }, { "epoch": 0.52, "learning_rate": 2.747703440909128e-06, "logits/chosen": -1.6148380041122437, "logits/rejected": -0.6764585375785828, "logps/chosen": -709.3273315429688, "logps/rejected": -1805.447021484375, "loss": 0.1334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.030007129535079002, "rewards/margins": 0.361659973859787, "rewards/rejected": -0.39166706800460815, "step": 1330 }, { "epoch": 0.53, "learning_rate": 2.713627550394363e-06, "logits/chosen": -1.3852078914642334, "logits/rejected": -0.6749362945556641, "logps/chosen": -686.713623046875, "logps/rejected": -1515.2998046875, "loss": 0.1806, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05257093161344528, "rewards/margins": 0.26767003536224365, "rewards/rejected": -0.3202410042285919, "step": 1340 }, { "epoch": 0.53, "learning_rate": 2.679511629948319e-06, "logits/chosen": -1.352468729019165, "logits/rejected": -0.6524327993392944, "logps/chosen": -796.8145751953125, "logps/rejected": -1669.538330078125, "loss": 0.2095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10906052589416504, "rewards/margins": 0.23847489058971405, "rewards/rejected": -0.3475354313850403, "step": 1350 }, { "epoch": 0.53, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -1.554595708847046, "logits/rejected": 0.08318161964416504, "logps/chosen": -606.7230224609375, "logps/rejected": -1430.286865234375, "loss": 0.2034, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04838673770427704, "rewards/margins": 0.2375943958759308, "rewards/rejected": -0.28598111867904663, "step": 1360 }, { "epoch": 0.54, "learning_rate": 2.6111852763861763e-06, "logits/chosen": -1.3457515239715576, "logits/rejected": -0.39270055294036865, "logps/chosen": -752.8702392578125, "logps/rejected": -1860.8333740234375, "loss": 0.1234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.04639287292957306, "rewards/margins": 0.38252198696136475, "rewards/rejected": -0.4289148449897766, "step": 1370 }, { "epoch": 0.54, "learning_rate": 2.576987646390426e-06, "logits/chosen": -1.5459932088851929, "logits/rejected": -0.5794991254806519, "logps/chosen": -691.588134765625, "logps/rejected": -1757.112548828125, "loss": 0.1192, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04728538915514946, "rewards/margins": 0.32668638229370117, "rewards/rejected": -0.3739717900753021, "step": 1380 }, { "epoch": 0.55, "learning_rate": 2.542775590305023e-06, "logits/chosen": -1.304917573928833, "logits/rejected": -0.4121823310852051, "logps/chosen": -630.0661010742188, "logps/rejected": -1441.1773681640625, "loss": 0.2289, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03756406903266907, "rewards/margins": 0.20802605152130127, "rewards/rejected": -0.24559013545513153, "step": 1390 }, { "epoch": 0.55, "learning_rate": 2.5085555188492384e-06, "logits/chosen": -1.2159336805343628, "logits/rejected": -0.3775702118873596, "logps/chosen": -709.61376953125, "logps/rejected": -1723.700927734375, "loss": 0.1568, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10398067533969879, "rewards/margins": 0.2780481278896332, "rewards/rejected": -0.38202884793281555, "step": 1400 }, { "epoch": 0.55, "learning_rate": 2.474333844244276e-06, "logits/chosen": -1.2202876806259155, "logits/rejected": -0.35152697563171387, "logps/chosen": -818.2611083984375, "logps/rejected": -1743.5579833984375, "loss": 0.1788, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09500784426927567, "rewards/margins": 0.300513356924057, "rewards/rejected": -0.39552122354507446, "step": 1410 }, { "epoch": 0.56, "learning_rate": 2.440116979011743e-06, "logits/chosen": -1.4342302083969116, "logits/rejected": -0.45796999335289, "logps/chosen": -718.6922607421875, "logps/rejected": -1725.5560302734375, "loss": 0.197, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05935473367571831, "rewards/margins": 0.324557363986969, "rewards/rejected": -0.383912056684494, "step": 1420 }, { "epoch": 0.56, "learning_rate": 2.4059113347720573e-06, "logits/chosen": -1.5391137599945068, "logits/rejected": -0.13381418585777283, "logps/chosen": -690.8306884765625, "logps/rejected": -1534.461181640625, "loss": 0.1946, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10408179461956024, "rewards/margins": 0.27237391471862793, "rewards/rejected": -0.376455694437027, "step": 1430 }, { "epoch": 0.56, "learning_rate": 2.3717233210430258e-06, "logits/chosen": -1.308176875114441, "logits/rejected": -0.5252507925033569, "logps/chosen": -736.7550659179688, "logps/rejected": -1814.572021484375, "loss": 0.1699, "rewards/accuracies": 0.75, "rewards/chosen": -0.1053951233625412, "rewards/margins": 0.3335246741771698, "rewards/rejected": -0.4389197826385498, "step": 1440 }, { "epoch": 0.57, "learning_rate": 2.337559344038817e-06, "logits/chosen": -1.2826203107833862, "logits/rejected": 0.2594057321548462, "logps/chosen": -654.9820556640625, "logps/rejected": -1548.369384765625, "loss": 0.1628, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10001038014888763, "rewards/margins": 0.2573556900024414, "rewards/rejected": -0.35736608505249023, "step": 1450 }, { "epoch": 0.57, "learning_rate": 2.303425805469554e-06, "logits/chosen": -1.2893702983856201, "logits/rejected": -0.615670382976532, "logps/chosen": -686.9696044921875, "logps/rejected": -1765.0921630859375, "loss": 0.1341, "rewards/accuracies": 0.875, "rewards/chosen": -0.06719444692134857, "rewards/margins": 0.3570956885814667, "rewards/rejected": -0.42429018020629883, "step": 1460 }, { "epoch": 0.58, "learning_rate": 2.269329101341745e-06, "logits/chosen": -1.5257200002670288, "logits/rejected": -0.8465067744255066, "logps/chosen": -722.9954833984375, "logps/rejected": -1763.6884765625, "loss": 0.1296, "rewards/accuracies": 0.875, "rewards/chosen": -0.0741962268948555, "rewards/margins": 0.36958009004592896, "rewards/rejected": -0.44377630949020386, "step": 1470 }, { "epoch": 0.58, "learning_rate": 2.235275620759797e-06, "logits/chosen": -1.3611409664154053, "logits/rejected": 0.612551748752594, "logps/chosen": -703.1578979492188, "logps/rejected": -1591.042236328125, "loss": 0.1764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10201771557331085, "rewards/margins": 0.2424260824918747, "rewards/rejected": -0.34444376826286316, "step": 1480 }, { "epoch": 0.58, "learning_rate": 2.2012717447288037e-06, "logits/chosen": -1.3054463863372803, "logits/rejected": -0.7033378481864929, "logps/chosen": -731.6030883789062, "logps/rejected": -1814.713134765625, "loss": 0.1576, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0751316249370575, "rewards/margins": 0.3522658348083496, "rewards/rejected": -0.42739754915237427, "step": 1490 }, { "epoch": 0.59, "learning_rate": 2.167323844958867e-06, "logits/chosen": -1.524957299232483, "logits/rejected": -0.6119885444641113, "logps/chosen": -701.2098388671875, "logps/rejected": -1545.368896484375, "loss": 0.14, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10369672626256943, "rewards/margins": 0.28280869126319885, "rewards/rejected": -0.3865054249763489, "step": 1500 }, { "epoch": 0.59, "learning_rate": 2.133438282671149e-06, "logits/chosen": -1.2132611274719238, "logits/rejected": -0.7082799673080444, "logps/chosen": -762.6727294921875, "logps/rejected": -1658.924072265625, "loss": 0.1803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1439850628376007, "rewards/margins": 0.27077409625053406, "rewards/rejected": -0.41475915908813477, "step": 1510 }, { "epoch": 0.6, "learning_rate": 2.0996214074059033e-06, "logits/chosen": -1.6239715814590454, "logits/rejected": -0.5037415623664856, "logps/chosen": -786.1912841796875, "logps/rejected": -1638.0843505859375, "loss": 0.2179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07825516164302826, "rewards/margins": 0.2860822379589081, "rewards/rejected": -0.36433738470077515, "step": 1520 }, { "epoch": 0.6, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.3029212951660156, "logits/rejected": -0.10125327110290527, "logps/chosen": -724.8988647460938, "logps/rejected": -1521.581787109375, "loss": 0.1988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05552230030298233, "rewards/margins": 0.29605624079704285, "rewards/rejected": -0.3515785336494446, "step": 1530 }, { "epoch": 0.6, "learning_rate": 2.0322190505629297e-06, "logits/chosen": -1.1891577243804932, "logits/rejected": -0.263233482837677, "logps/chosen": -726.5543212890625, "logps/rejected": -1851.503662109375, "loss": 0.1454, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10788372904062271, "rewards/margins": 0.327489972114563, "rewards/rejected": -0.4353737235069275, "step": 1540 }, { "epoch": 0.61, "learning_rate": 1.998646198965312e-06, "logits/chosen": -1.376450777053833, "logits/rejected": -0.22948014736175537, "logps/chosen": -596.0374755859375, "logps/rejected": -1520.2818603515625, "loss": 0.2496, "rewards/accuracies": 0.75, "rewards/chosen": -0.062327928841114044, "rewards/margins": 0.3048885762691498, "rewards/rejected": -0.3672165274620056, "step": 1550 }, { "epoch": 0.61, "learning_rate": 1.965167291983757e-06, "logits/chosen": -1.6274656057357788, "logits/rejected": -0.2617906928062439, "logps/chosen": -786.1827392578125, "logps/rejected": -1801.614990234375, "loss": 0.1203, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10416553169488907, "rewards/margins": 0.327767550945282, "rewards/rejected": -0.43193307518959045, "step": 1560 }, { "epoch": 0.62, "learning_rate": 1.931788602958678e-06, "logits/chosen": -0.9874919652938843, "logits/rejected": 0.055336445569992065, "logps/chosen": -801.8827514648438, "logps/rejected": -1887.7252197265625, "loss": 0.1647, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1263236552476883, "rewards/margins": 0.3318944573402405, "rewards/rejected": -0.4582180976867676, "step": 1570 }, { "epoch": 0.62, "learning_rate": 1.8985163864514644e-06, "logits/chosen": -1.4952738285064697, "logits/rejected": -0.03670965135097504, "logps/chosen": -776.7321166992188, "logps/rejected": -1846.3646240234375, "loss": 0.1433, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11145295947790146, "rewards/margins": 0.3160027265548706, "rewards/rejected": -0.42745572328567505, "step": 1580 }, { "epoch": 0.62, "learning_rate": 1.8653568770724805e-06, "logits/chosen": -1.352738618850708, "logits/rejected": -0.2683241367340088, "logps/chosen": -648.5192260742188, "logps/rejected": -1464.099365234375, "loss": 0.185, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08493933826684952, "rewards/margins": 0.2524186968803406, "rewards/rejected": -0.3373579978942871, "step": 1590 }, { "epoch": 0.63, "learning_rate": 1.8323162883128211e-06, "logits/chosen": -1.419662356376648, "logits/rejected": -0.4111382067203522, "logps/chosen": -699.5247802734375, "logps/rejected": -1743.6064453125, "loss": 0.1541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08453786373138428, "rewards/margins": 0.296464741230011, "rewards/rejected": -0.38100260496139526, "step": 1600 }, { "epoch": 0.63, "learning_rate": 1.7994008113800105e-06, "logits/chosen": -1.5189629793167114, "logits/rejected": -0.9077790975570679, "logps/chosen": -701.3331298828125, "logps/rejected": -1603.5174560546875, "loss": 0.1429, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08178045600652695, "rewards/margins": 0.3028547167778015, "rewards/rejected": -0.38463518023490906, "step": 1610 }, { "epoch": 0.64, "learning_rate": 1.7666166140378853e-06, "logits/chosen": -1.169510841369629, "logits/rejected": 0.19725301861763, "logps/chosen": -734.7293090820312, "logps/rejected": -1564.332763671875, "loss": 0.1528, "rewards/accuracies": 0.75, "rewards/chosen": -0.09660240262746811, "rewards/margins": 0.28595981001853943, "rewards/rejected": -0.38256219029426575, "step": 1620 }, { "epoch": 0.64, "learning_rate": 1.7339698394508632e-06, "logits/chosen": -1.266775369644165, "logits/rejected": -0.6185767650604248, "logps/chosen": -627.6648559570312, "logps/rejected": -1780.268310546875, "loss": 0.1694, "rewards/accuracies": 0.75, "rewards/chosen": -0.0766761377453804, "rewards/margins": 0.3623715043067932, "rewards/rejected": -0.43904757499694824, "step": 1630 }, { "epoch": 0.64, "learning_rate": 1.7014666050328325e-06, "logits/chosen": -1.5317351818084717, "logits/rejected": -0.46623557806015015, "logps/chosen": -639.0328369140625, "logps/rejected": -1635.7354736328125, "loss": 0.126, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07408356666564941, "rewards/margins": 0.34189721941947937, "rewards/rejected": -0.41598081588745117, "step": 1640 }, { "epoch": 0.65, "learning_rate": 1.6691130013008514e-06, "logits/chosen": -1.421917200088501, "logits/rejected": -0.19839780032634735, "logps/chosen": -837.2825317382812, "logps/rejected": -1678.8179931640625, "loss": 0.1956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08556106686592102, "rewards/margins": 0.2512792646884918, "rewards/rejected": -0.33684033155441284, "step": 1650 }, { "epoch": 0.65, "learning_rate": 1.6369150907339007e-06, "logits/chosen": -1.195821762084961, "logits/rejected": -0.20372645556926727, "logps/chosen": -709.2095336914062, "logps/rejected": -1652.1871337890625, "loss": 0.1906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07075698673725128, "rewards/margins": 0.289537250995636, "rewards/rejected": -0.36029425263404846, "step": 1660 }, { "epoch": 0.65, "learning_rate": 1.6048789066368858e-06, "logits/chosen": -1.354961633682251, "logits/rejected": -0.20124280452728271, "logps/chosen": -728.2799072265625, "logps/rejected": -1569.3551025390625, "loss": 0.1916, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08767645061016083, "rewards/margins": 0.2705709636211395, "rewards/rejected": -0.35824739933013916, "step": 1670 }, { "epoch": 0.66, "learning_rate": 1.5730104520100984e-06, "logits/chosen": -1.496524453163147, "logits/rejected": -0.8575867414474487, "logps/chosen": -612.16650390625, "logps/rejected": -1632.1365966796875, "loss": 0.1279, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06737435609102249, "rewards/margins": 0.3229941725730896, "rewards/rejected": -0.3903685212135315, "step": 1680 }, { "epoch": 0.66, "learning_rate": 1.5413156984243715e-06, "logits/chosen": -1.3209052085876465, "logits/rejected": -0.12577922642230988, "logps/chosen": -759.8672485351562, "logps/rejected": -1498.656494140625, "loss": 0.1552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10295001417398453, "rewards/margins": 0.21431489288806915, "rewards/rejected": -0.3172649145126343, "step": 1690 }, { "epoch": 0.67, "learning_rate": 1.509800584902108e-06, "logits/chosen": -1.1863139867782593, "logits/rejected": -0.08450505882501602, "logps/chosen": -838.8494873046875, "logps/rejected": -1536.5277099609375, "loss": 0.167, "rewards/accuracies": 0.75, "rewards/chosen": -0.12472305446863174, "rewards/margins": 0.24718424677848816, "rewards/rejected": -0.3719072937965393, "step": 1700 }, { "epoch": 0.67, "learning_rate": 1.4784710168044215e-06, "logits/chosen": -1.369985818862915, "logits/rejected": -0.5248149037361145, "logps/chosen": -883.6121826171875, "logps/rejected": -1617.128662109375, "loss": 0.1984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1338253617286682, "rewards/margins": 0.2511526942253113, "rewards/rejected": -0.3849780857563019, "step": 1710 }, { "epoch": 0.67, "learning_rate": 1.4473328647245726e-06, "logits/chosen": -1.624087929725647, "logits/rejected": -0.42871198058128357, "logps/chosen": -694.0233764648438, "logps/rejected": -1572.922119140625, "loss": 0.2198, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1324300318956375, "rewards/margins": 0.2687898874282837, "rewards/rejected": -0.4012199342250824, "step": 1720 }, { "epoch": 0.68, "learning_rate": 1.4163919633879325e-06, "logits/chosen": -1.4249976873397827, "logits/rejected": -0.46216440200805664, "logps/chosen": -831.1329956054688, "logps/rejected": -1623.590087890625, "loss": 0.2073, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09118635952472687, "rewards/margins": 0.26622968912124634, "rewards/rejected": -0.357416033744812, "step": 1730 }, { "epoch": 0.68, "learning_rate": 1.3856541105586545e-06, "logits/chosen": -1.5596380233764648, "logits/rejected": -0.4608355462551117, "logps/chosen": -826.0984497070312, "logps/rejected": -1898.5416259765625, "loss": 0.1421, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1307828575372696, "rewards/margins": 0.34290483593940735, "rewards/rejected": -0.47368764877319336, "step": 1740 }, { "epoch": 0.69, "learning_rate": 1.3551250659532853e-06, "logits/chosen": -1.492356300354004, "logits/rejected": -0.7112780809402466, "logps/chosen": -699.1672973632812, "logps/rejected": -1537.228271484375, "loss": 0.1776, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06596329063177109, "rewards/margins": 0.2567977011203766, "rewards/rejected": -0.32276099920272827, "step": 1750 }, { "epoch": 0.69, "learning_rate": 1.3248105501614897e-06, "logits/chosen": -1.2990128993988037, "logits/rejected": -0.7208808660507202, "logps/chosen": -714.08544921875, "logps/rejected": -1732.0875244140625, "loss": 0.2147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05500803142786026, "rewards/margins": 0.26819437742233276, "rewards/rejected": -0.32320234179496765, "step": 1760 }, { "epoch": 0.69, "learning_rate": 1.2947162435741278e-06, "logits/chosen": -1.1586157083511353, "logits/rejected": 0.03688998147845268, "logps/chosen": -734.365966796875, "logps/rejected": -1622.6265869140625, "loss": 0.2471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1007305383682251, "rewards/margins": 0.197604700922966, "rewards/rejected": -0.2983352243900299, "step": 1770 }, { "epoch": 0.7, "learning_rate": 1.2648477853188395e-06, "logits/chosen": -1.412379503250122, "logits/rejected": -0.5264952778816223, "logps/chosen": -698.6842651367188, "logps/rejected": -1511.8642578125, "loss": 0.1865, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.042982958257198334, "rewards/margins": 0.26162266731262207, "rewards/rejected": -0.304605633020401, "step": 1780 }, { "epoch": 0.7, "learning_rate": 1.2352107722033842e-06, "logits/chosen": -1.2586696147918701, "logits/rejected": -0.15170638263225555, "logps/chosen": -653.3026123046875, "logps/rejected": -1529.8104248046875, "loss": 0.1549, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0475340336561203, "rewards/margins": 0.275061696767807, "rewards/rejected": -0.3225957453250885, "step": 1790 }, { "epoch": 0.71, "learning_rate": 1.205810757666894e-06, "logits/chosen": -1.3673145771026611, "logits/rejected": -0.4642263948917389, "logps/chosen": -588.0513916015625, "logps/rejected": -1447.431396484375, "loss": 0.1613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07587876915931702, "rewards/margins": 0.24527780711650848, "rewards/rejected": -0.3211565613746643, "step": 1800 }, { "epoch": 0.71, "learning_rate": 1.176653250739265e-06, "logits/chosen": -1.4524450302124023, "logits/rejected": -0.21896734833717346, "logps/chosen": -831.2824096679688, "logps/rejected": -1819.2064208984375, "loss": 0.1362, "rewards/accuracies": 0.875, "rewards/chosen": -0.09457580000162125, "rewards/margins": 0.29631510376930237, "rewards/rejected": -0.390890896320343, "step": 1810 }, { "epoch": 0.71, "learning_rate": 1.1477437150088599e-06, "logits/chosen": -1.112823247909546, "logits/rejected": -0.731514573097229, "logps/chosen": -659.6626586914062, "logps/rejected": -1812.48828125, "loss": 0.1304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05299247428774834, "rewards/margins": 0.3928179442882538, "rewards/rejected": -0.4458104074001312, "step": 1820 }, { "epoch": 0.72, "learning_rate": 1.1190875675987355e-06, "logits/chosen": -1.3094470500946045, "logits/rejected": -0.5637291073799133, "logps/chosen": -753.520263671875, "logps/rejected": -1700.703857421875, "loss": 0.1502, "rewards/accuracies": 0.75, "rewards/chosen": -0.09490607678890228, "rewards/margins": 0.31419411301612854, "rewards/rejected": -0.409100204706192, "step": 1830 }, { "epoch": 0.72, "learning_rate": 1.0906901781515695e-06, "logits/chosen": -1.550244927406311, "logits/rejected": -0.08849823474884033, "logps/chosen": -724.5099487304688, "logps/rejected": -1681.033447265625, "loss": 0.1606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08625416457653046, "rewards/margins": 0.31124037504196167, "rewards/rejected": -0.39749449491500854, "step": 1840 }, { "epoch": 0.73, "learning_rate": 1.0625568678234839e-06, "logits/chosen": -1.0879476070404053, "logits/rejected": -0.13099336624145508, "logps/chosen": -671.8837280273438, "logps/rejected": -1590.70068359375, "loss": 0.1721, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06376481801271439, "rewards/margins": 0.2916422486305237, "rewards/rejected": -0.35540705919265747, "step": 1850 }, { "epoch": 0.73, "learning_rate": 1.034692908286964e-06, "logits/chosen": -1.3455946445465088, "logits/rejected": -0.2840282917022705, "logps/chosen": -611.4814453125, "logps/rejected": -1663.345703125, "loss": 0.2039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07838527858257294, "rewards/margins": 0.31080400943756104, "rewards/rejected": -0.3891892731189728, "step": 1860 }, { "epoch": 0.73, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -1.2556473016738892, "logits/rejected": -0.011271673254668713, "logps/chosen": -753.8445434570312, "logps/rejected": -1636.611083984375, "loss": 0.2112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11006224155426025, "rewards/margins": 0.24791212379932404, "rewards/rejected": -0.3579743504524231, "step": 1870 }, { "epoch": 0.74, "learning_rate": 9.797938749429088e-07, "logits/chosen": -1.2267249822616577, "logits/rejected": -0.35565489530563354, "logps/chosen": -690.4405517578125, "logps/rejected": -1600.0665283203125, "loss": 0.1862, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11180742084980011, "rewards/margins": 0.24852195382118225, "rewards/rejected": -0.36032935976982117, "step": 1880 }, { "epoch": 0.74, "learning_rate": 9.527690882192636e-07, "logits/chosen": -1.2072794437408447, "logits/rejected": 0.457929790019989, "logps/chosen": -697.0407104492188, "logps/rejected": -1490.8367919921875, "loss": 0.1672, "rewards/accuracies": 0.75, "rewards/chosen": -0.08405301719903946, "rewards/margins": 0.30805063247680664, "rewards/rejected": -0.3921036422252655, "step": 1890 }, { "epoch": 0.75, "learning_rate": 9.260342245273507e-07, "logits/chosen": -1.3990890979766846, "logits/rejected": -0.6794065237045288, "logps/chosen": -618.4937744140625, "logps/rejected": -1800.4622802734375, "loss": 0.1376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07932893931865692, "rewards/margins": 0.37783947587013245, "rewards/rejected": -0.4571684002876282, "step": 1900 }, { "epoch": 0.75, "learning_rate": 8.995942934960964e-07, "logits/chosen": -1.4945213794708252, "logits/rejected": -0.18756787478923798, "logps/chosen": -803.509521484375, "logps/rejected": -1819.3349609375, "loss": 0.1544, "rewards/accuracies": 0.875, "rewards/chosen": -0.09666319191455841, "rewards/margins": 0.36414963006973267, "rewards/rejected": -0.4608128070831299, "step": 1910 }, { "epoch": 0.75, "learning_rate": 8.734542494893955e-07, "logits/chosen": -1.431398868560791, "logits/rejected": -0.4752410352230072, "logps/chosen": -792.5185546875, "logps/rejected": -1632.630126953125, "loss": 0.2053, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09532758593559265, "rewards/margins": 0.30739787220954895, "rewards/rejected": -0.4027254581451416, "step": 1920 }, { "epoch": 0.76, "learning_rate": 8.476189906777457e-07, "logits/chosen": -1.3982821702957153, "logits/rejected": -0.08427709341049194, "logps/chosen": -703.8153076171875, "logps/rejected": -1600.6046142578125, "loss": 0.1632, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08243191242218018, "rewards/margins": 0.2651790678501129, "rewards/rejected": -0.3476109802722931, "step": 1930 }, { "epoch": 0.76, "learning_rate": 8.220933581204257e-07, "logits/chosen": -1.2576748132705688, "logits/rejected": 0.40268439054489136, "logps/chosen": -528.5084228515625, "logps/rejected": -1385.6802978515625, "loss": 0.1183, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.042114533483982086, "rewards/margins": 0.2899821698665619, "rewards/rejected": -0.3320966958999634, "step": 1940 }, { "epoch": 0.76, "learning_rate": 7.968821348583644e-07, "logits/chosen": -1.3039714097976685, "logits/rejected": -0.34471797943115234, "logps/chosen": -695.2639770507812, "logps/rejected": -1490.329345703125, "loss": 0.1969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08580182492733002, "rewards/margins": 0.252492755651474, "rewards/rejected": -0.3382945656776428, "step": 1950 }, { "epoch": 0.77, "learning_rate": 7.719900450178882e-07, "logits/chosen": -1.2936707735061646, "logits/rejected": 0.12274640798568726, "logps/chosen": -856.3453979492188, "logps/rejected": -1843.365966796875, "loss": 0.1424, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11615820974111557, "rewards/margins": 0.33368679881095886, "rewards/rejected": -0.44984501600265503, "step": 1960 }, { "epoch": 0.77, "learning_rate": 7.474217529255018e-07, "logits/chosen": -1.611425757408142, "logits/rejected": -0.11960859596729279, "logps/chosen": -636.3781127929688, "logps/rejected": -1416.641357421875, "loss": 0.1836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03691656142473221, "rewards/margins": 0.26324373483657837, "rewards/rejected": -0.30016031861305237, "step": 1970 }, { "epoch": 0.78, "learning_rate": 7.231818622338824e-07, "logits/chosen": -1.616742730140686, "logits/rejected": -0.024957846850156784, "logps/chosen": -676.9722900390625, "logps/rejected": -1823.395751953125, "loss": 0.1225, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06905169785022736, "rewards/margins": 0.3410964906215668, "rewards/rejected": -0.4101482033729553, "step": 1980 }, { "epoch": 0.78, "learning_rate": 6.992749150592343e-07, "logits/chosen": -1.2690980434417725, "logits/rejected": -0.1918954849243164, "logps/chosen": -866.05029296875, "logps/rejected": -1606.396240234375, "loss": 0.1865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.102320097386837, "rewards/margins": 0.27621665596961975, "rewards/rejected": -0.37853676080703735, "step": 1990 }, { "epoch": 0.78, "learning_rate": 6.75705391130183e-07, "logits/chosen": -1.2711069583892822, "logits/rejected": -0.00027151108952239156, "logps/chosen": -804.7188720703125, "logps/rejected": -1668.5374755859375, "loss": 0.166, "rewards/accuracies": 0.875, "rewards/chosen": -0.07086384296417236, "rewards/margins": 0.2957269251346588, "rewards/rejected": -0.3665907680988312, "step": 2000 }, { "epoch": 0.79, "learning_rate": 6.524777069483526e-07, "logits/chosen": -1.225556492805481, "logits/rejected": 0.41769227385520935, "logps/chosen": -634.6071166992188, "logps/rejected": -1566.803466796875, "loss": 0.1916, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.020447371527552605, "rewards/margins": 0.31025153398513794, "rewards/rejected": -0.3306989073753357, "step": 2010 }, { "epoch": 0.79, "learning_rate": 6.29596214960792e-07, "logits/chosen": -1.3543643951416016, "logits/rejected": -0.1612066775560379, "logps/chosen": -731.138671875, "logps/rejected": -1658.898193359375, "loss": 0.1954, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.078438401222229, "rewards/margins": 0.2861797511577606, "rewards/rejected": -0.36461812257766724, "step": 2020 }, { "epoch": 0.8, "learning_rate": 6.070652027444102e-07, "logits/chosen": -1.5058627128601074, "logits/rejected": -0.940344512462616, "logps/chosen": -629.819580078125, "logps/rejected": -1781.6654052734375, "loss": 0.1992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.061135418713092804, "rewards/margins": 0.3336263597011566, "rewards/rejected": -0.39476174116134644, "step": 2030 }, { "epoch": 0.8, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1.524287462234497, "logits/rejected": -0.8633726239204407, "logps/chosen": -602.5631103515625, "logps/rejected": -1604.6434326171875, "loss": 0.1721, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05612843483686447, "rewards/margins": 0.33471354842185974, "rewards/rejected": -0.3908420205116272, "step": 2040 }, { "epoch": 0.8, "learning_rate": 5.63071438773913e-07, "logits/chosen": -1.4894258975982666, "logits/rejected": -0.14880971610546112, "logps/chosen": -642.1497802734375, "logps/rejected": -1459.4459228515625, "loss": 0.2064, "rewards/accuracies": 0.75, "rewards/chosen": -0.0682179257273674, "rewards/margins": 0.21762903034687042, "rewards/rejected": -0.2858469486236572, "step": 2050 }, { "epoch": 0.81, "learning_rate": 5.416169306538485e-07, "logits/chosen": -1.3140041828155518, "logits/rejected": 0.3596586287021637, "logps/chosen": -820.9474487304688, "logps/rejected": -1682.409912109375, "loss": 0.2355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09827397763729095, "rewards/margins": 0.281690388917923, "rewards/rejected": -0.3799643814563751, "step": 2060 }, { "epoch": 0.81, "learning_rate": 5.205293880283552e-07, "logits/chosen": -1.5573115348815918, "logits/rejected": -0.13623039424419403, "logps/chosen": -671.4677124023438, "logps/rejected": -1707.608642578125, "loss": 0.1752, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05831771343946457, "rewards/margins": 0.3186204433441162, "rewards/rejected": -0.37693825364112854, "step": 2070 }, { "epoch": 0.82, "learning_rate": 4.998127623207404e-07, "logits/chosen": -1.2270171642303467, "logits/rejected": -0.16427640616893768, "logps/chosen": -636.1573486328125, "logps/rejected": -1320.9652099609375, "loss": 0.1501, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04943504184484482, "rewards/margins": 0.23794174194335938, "rewards/rejected": -0.2873767912387848, "step": 2080 }, { "epoch": 0.82, "learning_rate": 4.794709354512073e-07, "logits/chosen": -1.4142221212387085, "logits/rejected": -0.6630762219429016, "logps/chosen": -694.4979858398438, "logps/rejected": -1861.2236328125, "loss": 0.1027, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0681043490767479, "rewards/margins": 0.33745378255844116, "rewards/rejected": -0.40555816888809204, "step": 2090 }, { "epoch": 0.82, "learning_rate": 4.5950771910944603e-07, "logits/chosen": -1.386041522026062, "logits/rejected": -0.4771800637245178, "logps/chosen": -552.6729736328125, "logps/rejected": -1493.256103515625, "loss": 0.1758, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06818665564060211, "rewards/margins": 0.2598266899585724, "rewards/rejected": -0.3280133306980133, "step": 2100 }, { "epoch": 0.83, "learning_rate": 4.399268540403975e-07, "logits/chosen": -1.6429307460784912, "logits/rejected": -0.7215126752853394, "logps/chosen": -692.6094970703125, "logps/rejected": -1617.4793701171875, "loss": 0.1561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.04670856520533562, "rewards/margins": 0.3174007534980774, "rewards/rejected": -0.3641093373298645, "step": 2110 }, { "epoch": 0.83, "learning_rate": 4.2073200934330316e-07, "logits/chosen": -1.318565011024475, "logits/rejected": 0.31595462560653687, "logps/chosen": -688.9269409179688, "logps/rejected": -1576.12939453125, "loss": 0.1494, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05915086343884468, "rewards/margins": 0.2843713164329529, "rewards/rejected": -0.34352222084999084, "step": 2120 }, { "epoch": 0.84, "learning_rate": 4.019267817841835e-07, "logits/chosen": -1.4014190435409546, "logits/rejected": 0.06803856045007706, "logps/chosen": -661.041015625, "logps/rejected": -1782.5843505859375, "loss": 0.1339, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.060147546231746674, "rewards/margins": 0.367543488740921, "rewards/rejected": -0.4276910424232483, "step": 2130 }, { "epoch": 0.84, "learning_rate": 3.8351469512186656e-07, "logits/chosen": -1.293666124343872, "logits/rejected": 0.01516579370945692, "logps/chosen": -703.8981323242188, "logps/rejected": -1585.350830078125, "loss": 0.2612, "rewards/accuracies": 0.875, "rewards/chosen": -0.07257186621427536, "rewards/margins": 0.24214370548725128, "rewards/rejected": -0.31471556425094604, "step": 2140 }, { "epoch": 0.84, "learning_rate": 3.654991994477039e-07, "logits/chosen": -1.4482967853546143, "logits/rejected": -0.5136088132858276, "logps/chosen": -739.101318359375, "logps/rejected": -1636.627685546875, "loss": 0.2446, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08362185955047607, "rewards/margins": 0.2510630488395691, "rewards/rejected": -0.33468490839004517, "step": 2150 }, { "epoch": 0.85, "learning_rate": 3.4788367053908087e-07, "logits/chosen": -1.464727520942688, "logits/rejected": -0.6895856261253357, "logps/chosen": -649.2825927734375, "logps/rejected": -1706.675048828125, "loss": 0.1222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0619744174182415, "rewards/margins": 0.3182070851325989, "rewards/rejected": -0.3801814913749695, "step": 2160 }, { "epoch": 0.85, "learning_rate": 3.3067140922686175e-07, "logits/chosen": -1.2893580198287964, "logits/rejected": -0.02746570110321045, "logps/chosen": -637.2613525390625, "logps/rejected": -1635.1025390625, "loss": 0.1475, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06607834994792938, "rewards/margins": 0.306417852640152, "rewards/rejected": -0.37249621748924255, "step": 2170 }, { "epoch": 0.85, "learning_rate": 3.1386564077687115e-07, "logits/chosen": -1.2429146766662598, "logits/rejected": -0.5083945989608765, "logps/chosen": -689.4899291992188, "logps/rejected": -1385.622802734375, "loss": 0.2019, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08055596053600311, "rewards/margins": 0.19417758285999298, "rewards/rejected": -0.2747335135936737, "step": 2180 }, { "epoch": 0.86, "learning_rate": 2.9746951428553884e-07, "logits/chosen": -1.2200576066970825, "logits/rejected": 0.4126719534397125, "logps/chosen": -697.4050903320312, "logps/rejected": -1761.609375, "loss": 0.1621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.057256706058979034, "rewards/margins": 0.3544352650642395, "rewards/rejected": -0.41169196367263794, "step": 2190 }, { "epoch": 0.86, "learning_rate": 2.814861020898146e-07, "logits/chosen": -1.5707600116729736, "logits/rejected": -0.5523526668548584, "logps/chosen": -807.9168090820312, "logps/rejected": -1893.062255859375, "loss": 0.12, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.036843474954366684, "rewards/margins": 0.38658010959625244, "rewards/rejected": -0.4234235882759094, "step": 2200 }, { "epoch": 0.87, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -1.3549106121063232, "logits/rejected": -0.0543874129652977, "logps/chosen": -659.9330444335938, "logps/rejected": -1517.608642578125, "loss": 0.1865, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08612764626741409, "rewards/margins": 0.2554120123386383, "rewards/rejected": -0.3415396809577942, "step": 2210 }, { "epoch": 0.87, "learning_rate": 2.507693226958871e-07, "logits/chosen": -1.5055897235870361, "logits/rejected": -0.7960633635520935, "logps/chosen": -594.6507568359375, "logps/rejected": -1544.100341796875, "loss": 0.1835, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05378856509923935, "rewards/margins": 0.26992538571357727, "rewards/rejected": -0.3237139582633972, "step": 2220 }, { "epoch": 0.87, "learning_rate": 2.360417112654481e-07, "logits/chosen": -1.3403241634368896, "logits/rejected": -0.036334630101919174, "logps/chosen": -747.6497802734375, "logps/rejected": -1497.166748046875, "loss": 0.2369, "rewards/accuracies": 0.75, "rewards/chosen": -0.09375442564487457, "rewards/margins": 0.2080894410610199, "rewards/rejected": -0.30184388160705566, "step": 2230 }, { "epoch": 0.88, "learning_rate": 2.2173832458762146e-07, "logits/chosen": -1.3305310010910034, "logits/rejected": 0.5647405385971069, "logps/chosen": -708.2887573242188, "logps/rejected": -1672.27734375, "loss": 0.1525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0794595405459404, "rewards/margins": 0.27278995513916016, "rewards/rejected": -0.35224950313568115, "step": 2240 }, { "epoch": 0.88, "learning_rate": 2.07861842857843e-07, "logits/chosen": -1.3758533000946045, "logits/rejected": -0.3346394896507263, "logps/chosen": -641.7481689453125, "logps/rejected": -1659.6363525390625, "loss": 0.1304, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05183352157473564, "rewards/margins": 0.3061942458152771, "rewards/rejected": -0.35802772641181946, "step": 2250 }, { "epoch": 0.89, "learning_rate": 1.9441486627729987e-07, "logits/chosen": -1.2939542531967163, "logits/rejected": -0.2226782590150833, "logps/chosen": -574.517822265625, "logps/rejected": -1345.4554443359375, "loss": 0.2427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.044371530413627625, "rewards/margins": 0.25636622309684753, "rewards/rejected": -0.30073776841163635, "step": 2260 }, { "epoch": 0.89, "learning_rate": 1.8139991456569694e-07, "logits/chosen": -1.5377174615859985, "logits/rejected": -0.4445236623287201, "logps/chosen": -666.4637451171875, "logps/rejected": -1826.706298828125, "loss": 0.1407, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04511731117963791, "rewards/margins": 0.3437032103538513, "rewards/rejected": -0.3888205587863922, "step": 2270 }, { "epoch": 0.89, "learning_rate": 1.6881942648911077e-07, "logits/chosen": -1.1588428020477295, "logits/rejected": -0.33162426948547363, "logps/chosen": -692.0267333984375, "logps/rejected": -1666.332763671875, "loss": 0.1417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08260687440633774, "rewards/margins": 0.27366960048675537, "rewards/rejected": -0.3562764525413513, "step": 2280 }, { "epoch": 0.9, "learning_rate": 1.5667575940300384e-07, "logits/chosen": -1.2564775943756104, "logits/rejected": 0.01690312661230564, "logps/chosen": -673.8323974609375, "logps/rejected": -1666.777099609375, "loss": 0.1716, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0681915134191513, "rewards/margins": 0.31038275361061096, "rewards/rejected": -0.3785742521286011, "step": 2290 }, { "epoch": 0.9, "learning_rate": 1.449711888105046e-07, "logits/chosen": -1.518640160560608, "logits/rejected": -0.6442452669143677, "logps/chosen": -570.983154296875, "logps/rejected": -1282.307373046875, "loss": 0.2478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0646091103553772, "rewards/margins": 0.1985008716583252, "rewards/rejected": -0.2631099820137024, "step": 2300 }, { "epoch": 0.91, "learning_rate": 1.3370790793601373e-07, "logits/chosen": -1.3143935203552246, "logits/rejected": -0.862291157245636, "logps/chosen": -554.1536254882812, "logps/rejected": -1571.299560546875, "loss": 0.1953, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.04557369276881218, "rewards/margins": 0.30963796377182007, "rewards/rejected": -0.35521167516708374, "step": 2310 }, { "epoch": 0.91, "learning_rate": 1.2288802731423882e-07, "logits/chosen": -1.0400464534759521, "logits/rejected": -0.2425573766231537, "logps/chosen": -563.1722412109375, "logps/rejected": -1669.356201171875, "loss": 0.1733, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.032249629497528076, "rewards/margins": 0.35338449478149414, "rewards/rejected": -0.38563409447669983, "step": 2320 }, { "epoch": 0.91, "learning_rate": 1.125135743947145e-07, "logits/chosen": -1.392665982246399, "logits/rejected": -0.2731800079345703, "logps/chosen": -636.9364013671875, "logps/rejected": -1649.939453125, "loss": 0.1732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05704592913389206, "rewards/margins": 0.2943916916847229, "rewards/rejected": -0.35143759846687317, "step": 2330 }, { "epoch": 0.92, "learning_rate": 1.0258649316189722e-07, "logits/chosen": -1.448233723640442, "logits/rejected": -0.09116245806217194, "logps/chosen": -595.1334838867188, "logps/rejected": -1466.4798583984375, "loss": 0.1563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.029435504227876663, "rewards/margins": 0.2879069745540619, "rewards/rejected": -0.31734246015548706, "step": 2340 }, { "epoch": 0.92, "learning_rate": 9.310864377089696e-08, "logits/chosen": -1.298662543296814, "logits/rejected": 0.7022291421890259, "logps/chosen": -692.7830810546875, "logps/rejected": -1592.398193359375, "loss": 0.1976, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06722499430179596, "rewards/margins": 0.26029545068740845, "rewards/rejected": -0.327520489692688, "step": 2350 }, { "epoch": 0.93, "learning_rate": 8.408180219891899e-08, "logits/chosen": -1.0684707164764404, "logits/rejected": -0.727800726890564, "logps/chosen": -588.0787353515625, "logps/rejected": -1701.8707275390625, "loss": 0.1205, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.026762153953313828, "rewards/margins": 0.3723045885562897, "rewards/rejected": -0.3990667462348938, "step": 2360 }, { "epoch": 0.93, "learning_rate": 7.550765991247655e-08, "logits/chosen": -1.3413885831832886, "logits/rejected": -0.5428125262260437, "logps/chosen": -576.0411376953125, "logps/rejected": -1852.245361328125, "loss": 0.1253, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.046153269708156586, "rewards/margins": 0.3955245614051819, "rewards/rejected": -0.44167786836624146, "step": 2370 }, { "epoch": 0.93, "learning_rate": 6.738782355044048e-08, "logits/chosen": -1.3843357563018799, "logits/rejected": -0.3473323881626129, "logps/chosen": -653.3553466796875, "logps/rejected": -1551.5245361328125, "loss": 0.2134, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07392759621143341, "rewards/margins": 0.27120834589004517, "rewards/rejected": -0.3451359272003174, "step": 2380 }, { "epoch": 0.94, "learning_rate": 5.972381462298643e-08, "logits/chosen": -1.446597695350647, "logits/rejected": -0.7707004547119141, "logps/chosen": -587.423095703125, "logps/rejected": -1533.5751953125, "loss": 0.1378, "rewards/accuracies": 0.75, "rewards/chosen": -0.048751670867204666, "rewards/margins": 0.2718280255794525, "rewards/rejected": -0.3205797076225281, "step": 2390 }, { "epoch": 0.94, "learning_rate": 5.2517069226488694e-08, "logits/chosen": -1.3285058736801147, "logits/rejected": 0.6017956733703613, "logps/chosen": -635.4934692382812, "logps/rejected": -1658.809326171875, "loss": 0.1218, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.054670076817274094, "rewards/margins": 0.33853715658187866, "rewards/rejected": -0.39320722222328186, "step": 2400 }, { "epoch": 0.95, "learning_rate": 4.576893777442415e-08, "logits/chosen": -1.455540418624878, "logits/rejected": -0.42580240964889526, "logps/chosen": -567.0203857421875, "logps/rejected": -1439.871826171875, "loss": 0.1791, "rewards/accuracies": 0.75, "rewards/chosen": -0.04314614459872246, "rewards/margins": 0.26481324434280396, "rewards/rejected": -0.3079594075679779, "step": 2410 }, { "epoch": 0.95, "learning_rate": 3.9480684744327145e-08, "logits/chosen": -0.8030007481575012, "logits/rejected": -0.6547081470489502, "logps/chosen": -714.3633422851562, "logps/rejected": -1776.2916259765625, "loss": 0.135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.07487257570028305, "rewards/margins": 0.3637450039386749, "rewards/rejected": -0.4386175274848938, "step": 2420 }, { "epoch": 0.95, "learning_rate": 3.3653488440851255e-08, "logits/chosen": -1.4338642358779907, "logits/rejected": -0.2775370478630066, "logps/chosen": -522.8925170898438, "logps/rejected": -1403.964599609375, "loss": 0.1429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.028084104880690575, "rewards/margins": 0.2945864796638489, "rewards/rejected": -0.322670578956604, "step": 2430 }, { "epoch": 0.96, "learning_rate": 2.82884407749745e-08, "logits/chosen": -1.548905611038208, "logits/rejected": -0.1580895483493805, "logps/chosen": -721.1185913085938, "logps/rejected": -1815.273681640625, "loss": 0.168, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.058462172746658325, "rewards/margins": 0.334224134683609, "rewards/rejected": -0.39268630743026733, "step": 2440 }, { "epoch": 0.96, "learning_rate": 2.3386547059396634e-08, "logits/chosen": -1.3936518430709839, "logits/rejected": -0.42037662863731384, "logps/chosen": -727.2462768554688, "logps/rejected": -1849.8060302734375, "loss": 0.1504, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.053724952042102814, "rewards/margins": 0.34565088152885437, "rewards/rejected": -0.3993757963180542, "step": 2450 }, { "epoch": 0.96, "learning_rate": 1.8948725820160663e-08, "logits/chosen": -1.5373561382293701, "logits/rejected": -0.5124548673629761, "logps/chosen": -707.9256591796875, "logps/rejected": -1602.8634033203125, "loss": 0.152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06985752284526825, "rewards/margins": 0.3106308579444885, "rewards/rejected": -0.3804883658885956, "step": 2460 }, { "epoch": 0.97, "learning_rate": 1.497580862453829e-08, "logits/chosen": -1.3587336540222168, "logits/rejected": 0.12183968722820282, "logps/chosen": -682.3876342773438, "logps/rejected": -1501.242431640625, "loss": 0.179, "rewards/accuracies": 0.75, "rewards/chosen": -0.07504203170537949, "rewards/margins": 0.25899559259414673, "rewards/rejected": -0.3340376317501068, "step": 2470 }, { "epoch": 0.97, "learning_rate": 1.14685399252093e-08, "logits/chosen": -1.2843676805496216, "logits/rejected": -0.37819939851760864, "logps/chosen": -639.9012451171875, "logps/rejected": -1667.867431640625, "loss": 0.1334, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05129992961883545, "rewards/margins": 0.30902066826820374, "rewards/rejected": -0.3603206276893616, "step": 2480 }, { "epoch": 0.98, "learning_rate": 8.427576920763957e-09, "logits/chosen": -1.2090356349945068, "logits/rejected": -0.08205322176218033, "logps/chosen": -759.0253295898438, "logps/rejected": -1694.987060546875, "loss": 0.2816, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13451895117759705, "rewards/margins": 0.25275808572769165, "rewards/rejected": -0.3872770071029663, "step": 2490 }, { "epoch": 0.98, "learning_rate": 5.853489432556536e-09, "logits/chosen": -1.5106983184814453, "logits/rejected": -0.8639631271362305, "logps/chosen": -654.6641845703125, "logps/rejected": -1713.3929443359375, "loss": 0.1831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06343318521976471, "rewards/margins": 0.31247463822364807, "rewards/rejected": -0.3759078085422516, "step": 2500 }, { "epoch": 0.98, "learning_rate": 3.746759797931265e-09, "logits/chosen": -1.4619848728179932, "logits/rejected": 0.3511095643043518, "logps/chosen": -736.3690795898438, "logps/rejected": -1626.9466552734375, "loss": 0.1604, "rewards/accuracies": 0.875, "rewards/chosen": -0.06985476613044739, "rewards/margins": 0.2860848307609558, "rewards/rejected": -0.3559395968914032, "step": 2510 }, { "epoch": 0.99, "learning_rate": 2.1077827798404728e-09, "logits/chosen": -1.3730641603469849, "logits/rejected": -0.6747050881385803, "logps/chosen": -546.4849853515625, "logps/rejected": -1590.315673828125, "loss": 0.1671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.036424748599529266, "rewards/margins": 0.34394755959510803, "rewards/rejected": -0.3803723454475403, "step": 2520 }, { "epoch": 0.99, "learning_rate": 9.368654928731958e-10, "logits/chosen": -1.3955776691436768, "logits/rejected": -0.6365998983383179, "logps/chosen": -608.5187377929688, "logps/rejected": -1592.2816162109375, "loss": 0.1843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08722618967294693, "rewards/margins": 0.30947092175483704, "rewards/rejected": -0.39669710397720337, "step": 2530 }, { "epoch": 1.0, "learning_rate": 2.3422734570816006e-10, "logits/chosen": -1.4981211423873901, "logits/rejected": -0.8375118374824524, "logps/chosen": -656.091796875, "logps/rejected": -1573.434814453125, "loss": 0.1836, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05932006239891052, "rewards/margins": 0.28673693537712097, "rewards/rejected": -0.3460569679737091, "step": 2540 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -1.4274616241455078, "logits/rejected": 0.46426883339881897, "logps/chosen": -776.08154296875, "logps/rejected": -1525.145751953125, "loss": 0.1924, "rewards/accuracies": 0.75, "rewards/chosen": -0.08221141993999481, "rewards/margins": 0.24572968482971191, "rewards/rejected": -0.32794108986854553, "step": 2550 }, { "epoch": 1.0, "step": 2550, "total_flos": 0.0, "train_loss": 0.19951762257837782, "train_runtime": 10798.5669, "train_samples_per_second": 0.945, "train_steps_per_second": 0.236 } ], "logging_steps": 10, "max_steps": 2550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }