diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3614 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.9607843137254902e-08, + "logits/chosen": -0.505158543586731, + "logits/rejected": 1.1344256401062012, + "logps/chosen": -534.2272338867188, + "logps/rejected": -995.0223388671875, + "loss": 0.21, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -1.4771511554718018, + "logits/rejected": -0.7203052043914795, + "logps/chosen": -653.9701538085938, + "logps/rejected": -1290.11083984375, + "loss": 0.2983, + "rewards/accuracies": 0.3055555522441864, + "rewards/chosen": -0.00023890436568763107, + "rewards/margins": -0.0006189702544361353, + "rewards/rejected": 0.00038006596150808036, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -1.5881028175354004, + "logits/rejected": -0.847257137298584, + "logps/chosen": -677.5276489257812, + "logps/rejected": -1343.302978515625, + "loss": 0.34, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0005764259840361774, + "rewards/margins": 0.0008251671679317951, + "rewards/rejected": -0.0002487411838956177, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -1.5565259456634521, + "logits/rejected": -0.9040892720222473, + "logps/chosen": -587.6061401367188, + "logps/rejected": -1259.46630859375, + "loss": 0.3992, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0015199712943285704, + "rewards/margins": 0.002795459469780326, + "rewards/rejected": -0.004315430298447609, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -1.3543564081192017, + "logits/rejected": -0.5594847798347473, + "logps/chosen": -660.8809814453125, + "logps/rejected": -1349.8839111328125, + "loss": 0.3377, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.007950540632009506, + "rewards/margins": 0.009673960506916046, + "rewards/rejected": -0.017624501138925552, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -1.4439340829849243, + "logits/rejected": -0.9004542231559753, + "logps/chosen": -625.8778076171875, + "logps/rejected": -1303.6329345703125, + "loss": 0.3665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01639598235487938, + "rewards/margins": 0.029322799295186996, + "rewards/rejected": -0.04571877792477608, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -1.5793389081954956, + "logits/rejected": -0.6903096437454224, + "logps/chosen": -691.1597290039062, + "logps/rejected": -1354.8695068359375, + "loss": 0.3259, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04962822049856186, + "rewards/margins": 0.04500482603907585, + "rewards/rejected": -0.0946330577135086, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -1.2960580587387085, + "logits/rejected": -0.5226901173591614, + "logps/chosen": -677.5730590820312, + "logps/rejected": -1611.273681640625, + "loss": 0.2328, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09385339170694351, + "rewards/margins": 0.11672432720661163, + "rewards/rejected": -0.21057769656181335, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -1.0945719480514526, + "logits/rejected": -0.5267337560653687, + "logps/chosen": -776.620849609375, + "logps/rejected": -1658.595703125, + "loss": 0.1974, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19967763125896454, + "rewards/margins": 0.2444140613079071, + "rewards/rejected": -0.44409170746803284, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -1.4510087966918945, + "logits/rejected": -0.023749172687530518, + "logps/chosen": -911.9953002929688, + "logps/rejected": -1725.72265625, + "loss": 0.2263, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2247885912656784, + "rewards/margins": 0.1656641662120819, + "rewards/rejected": -0.3904527723789215, + "step": 90 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -1.3619906902313232, + "logits/rejected": -0.15897789597511292, + "logps/chosen": -822.1832275390625, + "logps/rejected": -1571.0025634765625, + "loss": 0.2765, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16276445984840393, + "rewards/margins": 0.13095514476299286, + "rewards/rejected": -0.2937195897102356, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -1.1530998945236206, + "logits/rejected": -0.40491175651550293, + "logps/chosen": -854.7205200195312, + "logps/rejected": -1822.2965087890625, + "loss": 0.2099, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15811415016651154, + "rewards/margins": 0.23178577423095703, + "rewards/rejected": -0.3898999094963074, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -1.5039284229278564, + "logits/rejected": -0.5590807199478149, + "logps/chosen": -726.2496337890625, + "logps/rejected": -1728.7135009765625, + "loss": 0.2438, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13852688670158386, + "rewards/margins": 0.27724406123161316, + "rewards/rejected": -0.415770947933197, + "step": 120 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -1.47978937625885, + "logits/rejected": -0.7583194971084595, + "logps/chosen": -777.576416015625, + "logps/rejected": -1722.5013427734375, + "loss": 0.169, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19047263264656067, + "rewards/margins": 0.221228688955307, + "rewards/rejected": -0.41170138120651245, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -1.3619725704193115, + "logits/rejected": -0.45514482259750366, + "logps/chosen": -859.7752685546875, + "logps/rejected": -1761.045654296875, + "loss": 0.2, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2766234278678894, + "rewards/margins": 0.1959143877029419, + "rewards/rejected": -0.4725378155708313, + "step": 140 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -1.2937225103378296, + "logits/rejected": -0.18269118666648865, + "logps/chosen": -1000.1593627929688, + "logps/rejected": -1983.649169921875, + "loss": 0.2365, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3481788635253906, + "rewards/margins": 0.32042697072029114, + "rewards/rejected": -0.6686058640480042, + "step": 150 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -1.3573691844940186, + "logits/rejected": -0.8902850151062012, + "logps/chosen": -908.5567626953125, + "logps/rejected": -1647.058349609375, + "loss": 0.2157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2951143980026245, + "rewards/margins": 0.15847407281398773, + "rewards/rejected": -0.45358848571777344, + "step": 160 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.657151222229004, + "logits/rejected": -0.9709945917129517, + "logps/chosen": -817.062744140625, + "logps/rejected": -1781.638671875, + "loss": 0.1573, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21228685975074768, + "rewards/margins": 0.23603840172290802, + "rewards/rejected": -0.4483252465724945, + "step": 170 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -1.4512741565704346, + "logits/rejected": -0.1740313172340393, + "logps/chosen": -889.5105590820312, + "logps/rejected": -2051.538818359375, + "loss": 0.2172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25935059785842896, + "rewards/margins": 0.3617965579032898, + "rewards/rejected": -0.6211471557617188, + "step": 180 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -1.1388862133026123, + "logits/rejected": -0.18607623875141144, + "logps/chosen": -821.3484497070312, + "logps/rejected": -1919.426513671875, + "loss": 0.1605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21933992207050323, + "rewards/margins": 0.3276643455028534, + "rewards/rejected": -0.547004222869873, + "step": 190 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -1.348503828048706, + "logits/rejected": -0.5367448925971985, + "logps/chosen": -625.4483032226562, + "logps/rejected": -1496.484130859375, + "loss": 0.2089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10934285074472427, + "rewards/margins": 0.23809942603111267, + "rewards/rejected": -0.34744226932525635, + "step": 200 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -1.3590004444122314, + "logits/rejected": -0.8172636032104492, + "logps/chosen": -815.4562377929688, + "logps/rejected": -1780.2926025390625, + "loss": 0.2143, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1335686445236206, + "rewards/margins": 0.2463621348142624, + "rewards/rejected": -0.3799307644367218, + "step": 210 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -1.5793583393096924, + "logits/rejected": -0.32534486055374146, + "logps/chosen": -928.5148315429688, + "logps/rejected": -1738.1536865234375, + "loss": 0.2501, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20052528381347656, + "rewards/margins": 0.21043157577514648, + "rewards/rejected": -0.41095685958862305, + "step": 220 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -0.9972221255302429, + "logits/rejected": -0.37468206882476807, + "logps/chosen": -708.7088623046875, + "logps/rejected": -1586.076416015625, + "loss": 0.205, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14892444014549255, + "rewards/margins": 0.24091584980487823, + "rewards/rejected": -0.3898402750492096, + "step": 230 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -1.3743550777435303, + "logits/rejected": -0.13277244567871094, + "logps/chosen": -719.425537109375, + "logps/rejected": -1675.90234375, + "loss": 0.2009, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1083696037530899, + "rewards/margins": 0.3259289562702179, + "rewards/rejected": -0.4342985153198242, + "step": 240 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -1.1646835803985596, + "logits/rejected": -0.5943381786346436, + "logps/chosen": -621.46630859375, + "logps/rejected": -1612.871826171875, + "loss": 0.1643, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17562244832515717, + "rewards/margins": 0.2645077705383301, + "rewards/rejected": -0.44013017416000366, + "step": 250 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941442477777e-06, + "logits/chosen": -1.2978475093841553, + "logits/rejected": -0.576497495174408, + "logps/chosen": -937.4520263671875, + "logps/rejected": -1737.780029296875, + "loss": 0.2432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25836849212646484, + "rewards/margins": 0.241961270570755, + "rewards/rejected": -0.5003297924995422, + "step": 260 + }, + { + "epoch": 0.11, + "learning_rate": 4.999472998758979e-06, + "logits/chosen": -1.4330791234970093, + "logits/rejected": -0.8838942646980286, + "logps/chosen": -877.1728515625, + "logps/rejected": -1793.1947021484375, + "loss": 0.1393, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25408753752708435, + "rewards/margins": 0.2761463522911072, + "rewards/rejected": -0.5302339792251587, + "step": 270 + }, + { + "epoch": 0.11, + "learning_rate": 4.998536199099246e-06, + "logits/chosen": -1.3899977207183838, + "logits/rejected": 0.03836112096905708, + "logps/chosen": -923.8590087890625, + "logps/rejected": -1724.1558837890625, + "loss": 0.1851, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1945658177137375, + "rewards/margins": 0.2358773946762085, + "rewards/rejected": -0.4304431974887848, + "step": 280 + }, + { + "epoch": 0.11, + "learning_rate": 4.997131219037856e-06, + "logits/chosen": -1.186488389968872, + "logits/rejected": -0.389091819524765, + "logps/chosen": -757.4147338867188, + "logps/rejected": -1886.984130859375, + "loss": 0.1841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21733467280864716, + "rewards/margins": 0.3353338837623596, + "rewards/rejected": -0.5526684522628784, + "step": 290 + }, + { + "epoch": 0.12, + "learning_rate": 4.995258321842611e-06, + "logits/chosen": -1.1964404582977295, + "logits/rejected": -0.06750938296318054, + "logps/chosen": -907.2109375, + "logps/rejected": -1809.0191650390625, + "loss": 0.1834, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2906198799610138, + "rewards/margins": 0.279925137758255, + "rewards/rejected": -0.5705450177192688, + "step": 300 + }, + { + "epoch": 0.12, + "learning_rate": 4.9929178584605e-06, + "logits/chosen": -1.649431586265564, + "logits/rejected": -0.20804986357688904, + "logps/chosen": -891.9801635742188, + "logps/rejected": -1733.181884765625, + "loss": 0.1278, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.22755351662635803, + "rewards/margins": 0.2664097547531128, + "rewards/rejected": -0.49396324157714844, + "step": 310 + }, + { + "epoch": 0.13, + "learning_rate": 4.9901102674519446e-06, + "logits/chosen": -1.4958832263946533, + "logits/rejected": -0.3006078004837036, + "logps/chosen": -951.6578369140625, + "logps/rejected": -1706.25390625, + "loss": 0.2295, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2715206742286682, + "rewards/margins": 0.25203150510787964, + "rewards/rejected": -0.5235521793365479, + "step": 320 + }, + { + "epoch": 0.13, + "learning_rate": 4.986836074908616e-06, + "logits/chosen": -1.3995481729507446, + "logits/rejected": 0.009560632519423962, + "logps/chosen": -718.5650634765625, + "logps/rejected": -1350.846923828125, + "loss": 0.2471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20702452957630157, + "rewards/margins": 0.14036989212036133, + "rewards/rejected": -0.3473944067955017, + "step": 330 + }, + { + "epoch": 0.13, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -1.5904731750488281, + "logits/rejected": -0.14893893897533417, + "logps/chosen": -855.9501953125, + "logps/rejected": -1916.079345703125, + "loss": 0.2131, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24469470977783203, + "rewards/margins": 0.22492530941963196, + "rewards/rejected": -0.4696199893951416, + "step": 340 + }, + { + "epoch": 0.14, + "learning_rate": 4.9788904266327206e-06, + "logits/chosen": -1.6823375225067139, + "logits/rejected": -0.4657576084136963, + "logps/chosen": -784.65234375, + "logps/rejected": -1751.244873046875, + "loss": 0.1888, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1359667181968689, + "rewards/margins": 0.29217660427093506, + "rewards/rejected": -0.42814335227012634, + "step": 350 + }, + { + "epoch": 0.14, + "learning_rate": 4.9742204597706386e-06, + "logits/chosen": -1.5003750324249268, + "logits/rejected": -0.001354557229205966, + "logps/chosen": -755.9137573242188, + "logps/rejected": -1653.0166015625, + "loss": 0.1933, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12234246730804443, + "rewards/margins": 0.2765265107154846, + "rewards/rejected": -0.39886897802352905, + "step": 360 + }, + { + "epoch": 0.15, + "learning_rate": 4.9690868688357655e-06, + "logits/chosen": -1.3799958229064941, + "logits/rejected": -0.4311766028404236, + "logps/chosen": -724.7586059570312, + "logps/rejected": -1667.642822265625, + "loss": 0.1828, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1624334752559662, + "rewards/margins": 0.2481921911239624, + "rewards/rejected": -0.4106256365776062, + "step": 370 + }, + { + "epoch": 0.15, + "learning_rate": 4.963490615770003e-06, + "logits/chosen": -1.295836329460144, + "logits/rejected": -0.5849100947380066, + "logps/chosen": -835.3861083984375, + "logps/rejected": -1846.414794921875, + "loss": 0.2211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23319277167320251, + "rewards/margins": 0.3248142898082733, + "rewards/rejected": -0.5580071210861206, + "step": 380 + }, + { + "epoch": 0.15, + "learning_rate": 4.957432749209755e-06, + "logits/chosen": -1.4312934875488281, + "logits/rejected": 0.31627362966537476, + "logps/chosen": -939.7803955078125, + "logps/rejected": -1674.4808349609375, + "loss": 0.2533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2971717119216919, + "rewards/margins": 0.19919905066490173, + "rewards/rejected": -0.496370792388916, + "step": 390 + }, + { + "epoch": 0.16, + "learning_rate": 4.950914404289423e-06, + "logits/chosen": -1.3529198169708252, + "logits/rejected": -0.19551090896129608, + "logps/chosen": -940.6759643554688, + "logps/rejected": -1822.1956787109375, + "loss": 0.2262, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3031768500804901, + "rewards/margins": 0.22445103526115417, + "rewards/rejected": -0.5276279449462891, + "step": 400 + }, + { + "epoch": 0.16, + "learning_rate": 4.943936802428712e-06, + "logits/chosen": -1.1721961498260498, + "logits/rejected": 0.37075644731521606, + "logps/chosen": -702.531005859375, + "logps/rejected": -1698.3720703125, + "loss": 0.1711, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18371441960334778, + "rewards/margins": 0.269645094871521, + "rewards/rejected": -0.4533595144748688, + "step": 410 + }, + { + "epoch": 0.16, + "learning_rate": 4.936501251103751e-06, + "logits/chosen": -1.1501245498657227, + "logits/rejected": -0.04669635370373726, + "logps/chosen": -934.7687377929688, + "logps/rejected": -1762.8375244140625, + "loss": 0.2049, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.27317720651626587, + "rewards/margins": 0.24144259095191956, + "rewards/rejected": -0.5146198272705078, + "step": 420 + }, + { + "epoch": 0.17, + "learning_rate": 4.928609143602102e-06, + "logits/chosen": -1.3455841541290283, + "logits/rejected": -0.689312219619751, + "logps/chosen": -953.3030395507812, + "logps/rejected": -2143.519775390625, + "loss": 0.1132, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3812543451786041, + "rewards/margins": 0.47345447540283203, + "rewards/rejected": -0.8547086715698242, + "step": 430 + }, + { + "epoch": 0.17, + "learning_rate": 4.920261958761677e-06, + "logits/chosen": -1.1954295635223389, + "logits/rejected": 0.1524878442287445, + "logps/chosen": -988.5673828125, + "logps/rejected": -1907.625, + "loss": 0.2181, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3235063850879669, + "rewards/margins": 0.30497947335243225, + "rewards/rejected": -0.6284858584403992, + "step": 440 + }, + { + "epoch": 0.18, + "learning_rate": 4.911461260693639e-06, + "logits/chosen": -1.384975552558899, + "logits/rejected": -0.3957231938838959, + "logps/chosen": -864.88623046875, + "logps/rejected": -1796.1107177734375, + "loss": 0.1692, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24604813754558563, + "rewards/margins": 0.25146228075027466, + "rewards/rejected": -0.4975104331970215, + "step": 450 + }, + { + "epoch": 0.18, + "learning_rate": 4.902208698489302e-06, + "logits/chosen": -1.0432078838348389, + "logits/rejected": -0.16131794452667236, + "logps/chosen": -885.232421875, + "logps/rejected": -1651.9114990234375, + "loss": 0.2494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22546634078025818, + "rewards/margins": 0.1930330991744995, + "rewards/rejected": -0.4184994697570801, + "step": 460 + }, + { + "epoch": 0.18, + "learning_rate": 4.89250600591114e-06, + "logits/chosen": -1.3176567554473877, + "logits/rejected": -0.0033722042571753263, + "logps/chosen": -723.5933837890625, + "logps/rejected": -1598.0091552734375, + "loss": 0.2398, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15554097294807434, + "rewards/margins": 0.26405271887779236, + "rewards/rejected": -0.4195936620235443, + "step": 470 + }, + { + "epoch": 0.19, + "learning_rate": 4.882355001067892e-06, + "logits/chosen": -1.188307523727417, + "logits/rejected": 0.14929169416427612, + "logps/chosen": -815.7213134765625, + "logps/rejected": -1634.1407470703125, + "loss": 0.2558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16603827476501465, + "rewards/margins": 0.21612891554832458, + "rewards/rejected": -0.38216716051101685, + "step": 480 + }, + { + "epoch": 0.19, + "learning_rate": 4.871757586073897e-06, + "logits/chosen": -1.3035974502563477, + "logits/rejected": 0.26524829864501953, + "logps/chosen": -763.2244262695312, + "logps/rejected": -1522.682861328125, + "loss": 0.2258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1318461149930954, + "rewards/margins": 0.23059546947479248, + "rewards/rejected": -0.3624415993690491, + "step": 490 + }, + { + "epoch": 0.2, + "learning_rate": 4.860715746692661e-06, + "logits/chosen": -1.1487717628479004, + "logits/rejected": 0.05942107364535332, + "logps/chosen": -886.2254638671875, + "logps/rejected": -1841.0814208984375, + "loss": 0.1885, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18671779334545135, + "rewards/margins": 0.27246180176734924, + "rewards/rejected": -0.4591795802116394, + "step": 500 + }, + { + "epoch": 0.2, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -1.2474385499954224, + "logits/rejected": -0.14498676359653473, + "logps/chosen": -778.3880615234375, + "logps/rejected": -1752.142333984375, + "loss": 0.1754, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18546968698501587, + "rewards/margins": 0.3222576379776001, + "rewards/rejected": -0.5077272653579712, + "step": 510 + }, + { + "epoch": 0.2, + "learning_rate": 4.837307153820184e-06, + "logits/chosen": -1.1251775026321411, + "logits/rejected": 0.15637345612049103, + "logps/chosen": -924.3635864257812, + "logps/rejected": -2070.327392578125, + "loss": 0.1343, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3112488090991974, + "rewards/margins": 0.38895484805107117, + "rewards/rejected": -0.7002035975456238, + "step": 520 + }, + { + "epoch": 0.21, + "learning_rate": 4.824944786675003e-06, + "logits/chosen": -1.3947086334228516, + "logits/rejected": 0.045419882982969284, + "logps/chosen": -856.5111083984375, + "logps/rejected": -1587.355712890625, + "loss": 0.1704, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2732272148132324, + "rewards/margins": 0.24021320044994354, + "rewards/rejected": -0.5134404301643372, + "step": 530 + }, + { + "epoch": 0.21, + "learning_rate": 4.81214676701278e-06, + "logits/chosen": -1.2445639371871948, + "logits/rejected": 0.1435929536819458, + "logps/chosen": -935.2590942382812, + "logps/rejected": -1872.558349609375, + "loss": 0.1603, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2728428244590759, + "rewards/margins": 0.303517609834671, + "rewards/rejected": -0.5763604044914246, + "step": 540 + }, + { + "epoch": 0.22, + "learning_rate": 4.798915492950456e-06, + "logits/chosen": -1.3926843404769897, + "logits/rejected": -0.8224552273750305, + "logps/chosen": -930.3948364257812, + "logps/rejected": -1831.987060546875, + "loss": 0.2094, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22408756613731384, + "rewards/margins": 0.306917279958725, + "rewards/rejected": -0.5310048460960388, + "step": 550 + }, + { + "epoch": 0.22, + "learning_rate": 4.785253443788997e-06, + "logits/chosen": -1.452789306640625, + "logits/rejected": -0.08553876727819443, + "logps/chosen": -834.9271240234375, + "logps/rejected": -1715.3486328125, + "loss": 0.2066, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1931687295436859, + "rewards/margins": 0.24109697341918945, + "rewards/rejected": -0.43426570296287537, + "step": 560 + }, + { + "epoch": 0.22, + "learning_rate": 4.771163179548809e-06, + "logits/chosen": -1.2075916528701782, + "logits/rejected": -0.4084923267364502, + "logps/chosen": -895.1989135742188, + "logps/rejected": -1892.2545166015625, + "loss": 0.1562, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.27637767791748047, + "rewards/margins": 0.3559107184410095, + "rewards/rejected": -0.6322883367538452, + "step": 570 + }, + { + "epoch": 0.23, + "learning_rate": 4.75664734049005e-06, + "logits/chosen": -1.4612247943878174, + "logits/rejected": -0.4000505805015564, + "logps/chosen": -838.0030517578125, + "logps/rejected": -1836.1865234375, + "loss": 0.1866, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27399009466171265, + "rewards/margins": 0.3403601050376892, + "rewards/rejected": -0.6143501996994019, + "step": 580 + }, + { + "epoch": 0.23, + "learning_rate": 4.741708646617879e-06, + "logits/chosen": -1.4533047676086426, + "logits/rejected": -0.44210928678512573, + "logps/chosen": -826.44921875, + "logps/rejected": -1627.9482421875, + "loss": 0.1568, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18495787680149078, + "rewards/margins": 0.24494799971580505, + "rewards/rejected": -0.42990580201148987, + "step": 590 + }, + { + "epoch": 0.24, + "learning_rate": 4.726349897172791e-06, + "logits/chosen": -1.2161755561828613, + "logits/rejected": -0.4458787441253662, + "logps/chosen": -677.1725463867188, + "logps/rejected": -1372.3172607421875, + "loss": 0.2348, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11152330785989761, + "rewards/margins": 0.17953188717365265, + "rewards/rejected": -0.29105520248413086, + "step": 600 + }, + { + "epoch": 0.24, + "learning_rate": 4.710573970106076e-06, + "logits/chosen": -1.2787022590637207, + "logits/rejected": -0.5003519654273987, + "logps/chosen": -937.7862548828125, + "logps/rejected": -1879.7720947265625, + "loss": 0.2216, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.27286097407341003, + "rewards/margins": 0.2761802077293396, + "rewards/rejected": -0.5490411520004272, + "step": 610 + }, + { + "epoch": 0.24, + "learning_rate": 4.694383821540554e-06, + "logits/chosen": -1.4234240055084229, + "logits/rejected": -0.529420793056488, + "logps/chosen": -879.75830078125, + "logps/rejected": -1886.7099609375, + "loss": 0.1449, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.23544082045555115, + "rewards/margins": 0.3297392725944519, + "rewards/rejected": -0.5651801824569702, + "step": 620 + }, + { + "epoch": 0.25, + "learning_rate": 4.677782485216644e-06, + "logits/chosen": -1.5074328184127808, + "logits/rejected": 0.13324348628520966, + "logps/chosen": -894.3519287109375, + "logps/rejected": -1656.986328125, + "loss": 0.2306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2585051953792572, + "rewards/margins": 0.17639592289924622, + "rewards/rejected": -0.4349011480808258, + "step": 630 + }, + { + "epoch": 0.25, + "learning_rate": 4.660773071923901e-06, + "logits/chosen": -1.254246473312378, + "logits/rejected": -0.4503572881221771, + "logps/chosen": -743.8980712890625, + "logps/rejected": -1586.318603515625, + "loss": 0.2306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20321564376354218, + "rewards/margins": 0.28816673159599304, + "rewards/rejected": -0.49138230085372925, + "step": 640 + }, + { + "epoch": 0.25, + "learning_rate": 4.643358768918106e-06, + "logits/chosen": -1.2100741863250732, + "logits/rejected": -0.6602537631988525, + "logps/chosen": -866.4385986328125, + "logps/rejected": -1698.565185546875, + "loss": 0.2341, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24518051743507385, + "rewards/margins": 0.21809275448322296, + "rewards/rejected": -0.46327322721481323, + "step": 650 + }, + { + "epoch": 0.26, + "learning_rate": 4.625542839324036e-06, + "logits/chosen": -1.2801318168640137, + "logits/rejected": -0.20570655167102814, + "logps/chosen": -696.7703857421875, + "logps/rejected": -1810.203857421875, + "loss": 0.1443, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1753673553466797, + "rewards/margins": 0.33784395456314087, + "rewards/rejected": -0.5132113099098206, + "step": 660 + }, + { + "epoch": 0.26, + "learning_rate": 4.6073286215240105e-06, + "logits/chosen": -1.573704719543457, + "logits/rejected": -0.5480459928512573, + "logps/chosen": -698.1392211914062, + "logps/rejected": -1760.8834228515625, + "loss": 3.1142, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11380796134471893, + "rewards/margins": 0.48678064346313477, + "rewards/rejected": -0.6005885601043701, + "step": 670 + }, + { + "epoch": 0.27, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -1.5839512348175049, + "logits/rejected": -0.7513319253921509, + "logps/chosen": -620.78955078125, + "logps/rejected": -1401.8199462890625, + "loss": 0.2422, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02382536605000496, + "rewards/margins": 0.09586119651794434, + "rewards/rejected": -0.119686558842659, + "step": 680 + }, + { + "epoch": 0.27, + "learning_rate": 4.569719047355795e-06, + "logits/chosen": -1.5924733877182007, + "logits/rejected": -0.816574215888977, + "logps/chosen": -557.929931640625, + "logps/rejected": -1159.7681884765625, + "loss": 0.292, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.010283837094902992, + "rewards/margins": 0.04927302524447441, + "rewards/rejected": -0.059556860476732254, + "step": 690 + }, + { + "epoch": 0.27, + "learning_rate": 4.550330738340189e-06, + "logits/chosen": -1.4926470518112183, + "logits/rejected": -0.8066496849060059, + "logps/chosen": -669.9822387695312, + "logps/rejected": -1387.30419921875, + "loss": 0.2635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06443636119365692, + "rewards/margins": 0.10144983232021332, + "rewards/rejected": -0.16588619351387024, + "step": 700 + }, + { + "epoch": 0.28, + "learning_rate": 4.530558234503252e-06, + "logits/chosen": -1.504148244857788, + "logits/rejected": -0.710750937461853, + "logps/chosen": -563.5753173828125, + "logps/rejected": -1385.9373779296875, + "loss": 0.1933, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02186558023095131, + "rewards/margins": 0.15772321820259094, + "rewards/rejected": -0.17958880960941315, + "step": 710 + }, + { + "epoch": 0.28, + "learning_rate": 4.5104052408538545e-06, + "logits/chosen": -1.3532848358154297, + "logits/rejected": -0.17277280986309052, + "logps/chosen": -665.1290893554688, + "logps/rejected": -1473.070068359375, + "loss": 0.2188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0683923214673996, + "rewards/margins": 0.23080816864967346, + "rewards/rejected": -0.29920047521591187, + "step": 720 + }, + { + "epoch": 0.29, + "learning_rate": 4.489875533697767e-06, + "logits/chosen": -1.2411041259765625, + "logits/rejected": -0.6769916415214539, + "logps/chosen": -796.1107177734375, + "logps/rejected": -1779.8375244140625, + "loss": 0.2014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11370061337947845, + "rewards/margins": 0.2886132001876831, + "rewards/rejected": -0.40231385827064514, + "step": 730 + }, + { + "epoch": 0.29, + "learning_rate": 4.468972959930043e-06, + "logits/chosen": -1.4062107801437378, + "logits/rejected": -0.11251994222402573, + "logps/chosen": -810.1907958984375, + "logps/rejected": -1755.5439453125, + "loss": 0.207, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.138728067278862, + "rewards/margins": 0.25112494826316833, + "rewards/rejected": -0.38985303044319153, + "step": 740 + }, + { + "epoch": 0.29, + "learning_rate": 4.447701436314176e-06, + "logits/chosen": -1.1295002698898315, + "logits/rejected": -0.491716206073761, + "logps/chosen": -665.5704345703125, + "logps/rejected": -1605.226318359375, + "loss": 0.2432, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09421003609895706, + "rewards/margins": 0.22589227557182312, + "rewards/rejected": -0.3201023042201996, + "step": 750 + }, + { + "epoch": 0.3, + "learning_rate": 4.4260649487481835e-06, + "logits/chosen": -1.3528281450271606, + "logits/rejected": -0.8653984069824219, + "logps/chosen": -560.7476806640625, + "logps/rejected": -1564.6998291015625, + "loss": 0.1747, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06951048225164413, + "rewards/margins": 0.29366621375083923, + "rewards/rejected": -0.36317676305770874, + "step": 760 + }, + { + "epoch": 0.3, + "learning_rate": 4.404067551517704e-06, + "logits/chosen": -1.496765375137329, + "logits/rejected": -0.7339566349983215, + "logps/chosen": -559.6861572265625, + "logps/rejected": -1561.775634765625, + "loss": 0.1495, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0916418582201004, + "rewards/margins": 0.28826963901519775, + "rewards/rejected": -0.37991148233413696, + "step": 770 + }, + { + "epoch": 0.31, + "learning_rate": 4.381713366536312e-06, + "logits/chosen": -1.2229559421539307, + "logits/rejected": -0.3822958469390869, + "logps/chosen": -795.2717895507812, + "logps/rejected": -1646.298583984375, + "loss": 0.2385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16480056941509247, + "rewards/margins": 0.25636088848114014, + "rewards/rejected": -0.4211614727973938, + "step": 780 + }, + { + "epoch": 0.31, + "learning_rate": 4.359006582573138e-06, + "logits/chosen": -1.3127458095550537, + "logits/rejected": -0.6002156138420105, + "logps/chosen": -731.8434448242188, + "logps/rejected": -1655.339599609375, + "loss": 0.2386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1232454776763916, + "rewards/margins": 0.260085791349411, + "rewards/rejected": -0.3833312392234802, + "step": 790 + }, + { + "epoch": 0.31, + "learning_rate": 4.335951454467971e-06, + "logits/chosen": -1.4491212368011475, + "logits/rejected": -0.4968988299369812, + "logps/chosen": -708.8034057617188, + "logps/rejected": -1689.820068359375, + "loss": 0.1514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11892116069793701, + "rewards/margins": 0.311443567276001, + "rewards/rejected": -0.4303646981716156, + "step": 800 + }, + { + "epoch": 0.32, + "learning_rate": 4.3125523023339825e-06, + "logits/chosen": -1.532845377922058, + "logits/rejected": -0.5454439520835876, + "logps/chosen": -708.6060791015625, + "logps/rejected": -1473.7392578125, + "loss": 0.2365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0950343906879425, + "rewards/margins": 0.2247859686613083, + "rewards/rejected": -0.3198204040527344, + "step": 810 + }, + { + "epoch": 0.32, + "learning_rate": 4.288813510748207e-06, + "logits/chosen": -1.3746122121810913, + "logits/rejected": -0.3929213881492615, + "logps/chosen": -709.5933837890625, + "logps/rejected": -1493.141357421875, + "loss": 0.1891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05677127093076706, + "rewards/margins": 0.22434012591838837, + "rewards/rejected": -0.28111138939857483, + "step": 820 + }, + { + "epoch": 0.33, + "learning_rate": 4.264739527929959e-06, + "logits/chosen": -1.6062724590301514, + "logits/rejected": -0.8062151074409485, + "logps/chosen": -672.033447265625, + "logps/rejected": -1605.8253173828125, + "loss": 0.2076, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04862620308995247, + "rewards/margins": 0.277982234954834, + "rewards/rejected": -0.32660841941833496, + "step": 830 + }, + { + "epoch": 0.33, + "learning_rate": 4.240334864907317e-06, + "logits/chosen": -1.429529070854187, + "logits/rejected": -0.1541730761528015, + "logps/chosen": -751.5721435546875, + "logps/rejected": -1614.796875, + "loss": 0.1689, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07653092592954636, + "rewards/margins": 0.23011043667793274, + "rewards/rejected": -0.3066413402557373, + "step": 840 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.4942229986190796, + "logits/rejected": -0.5664646029472351, + "logps/chosen": -683.9749755859375, + "logps/rejected": -1751.165771484375, + "loss": 0.1305, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07548153400421143, + "rewards/margins": 0.3088182806968689, + "rewards/rejected": -0.3842998147010803, + "step": 850 + }, + { + "epoch": 0.34, + "learning_rate": 4.190551851321647e-06, + "logits/chosen": -1.5068459510803223, + "logits/rejected": -0.3654994070529938, + "logps/chosen": -753.12060546875, + "logps/rejected": -1841.876220703125, + "loss": 0.1256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10451909154653549, + "rewards/margins": 0.35891246795654297, + "rewards/rejected": -0.46343153715133667, + "step": 860 + }, + { + "epoch": 0.34, + "learning_rate": 4.165182829193126e-06, + "logits/chosen": -1.4504587650299072, + "logits/rejected": 0.0904449075460434, + "logps/chosen": -773.3833618164062, + "logps/rejected": -1582.4493408203125, + "loss": 0.2156, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06579799205064774, + "rewards/margins": 0.241295725107193, + "rewards/rejected": -0.3070937395095825, + "step": 870 + }, + { + "epoch": 0.35, + "learning_rate": 4.139501781981245e-06, + "logits/chosen": -1.5094424486160278, + "logits/rejected": -0.5480602383613586, + "logps/chosen": -672.755126953125, + "logps/rejected": -1651.116943359375, + "loss": 0.1111, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07722672820091248, + "rewards/margins": 0.26155346632003784, + "rewards/rejected": -0.33878016471862793, + "step": 880 + }, + { + "epoch": 0.35, + "learning_rate": 4.113513521848821e-06, + "logits/chosen": -1.594499111175537, + "logits/rejected": -0.5706368684768677, + "logps/chosen": -772.4927978515625, + "logps/rejected": -1745.507080078125, + "loss": 0.1475, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10902180522680283, + "rewards/margins": 0.3192656934261322, + "rewards/rejected": -0.4282875061035156, + "step": 890 + }, + { + "epoch": 0.35, + "learning_rate": 4.087222918524807e-06, + "logits/chosen": -1.297629952430725, + "logits/rejected": -0.6775213479995728, + "logps/chosen": -705.9368896484375, + "logps/rejected": -1540.4775390625, + "loss": 0.2268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1348380744457245, + "rewards/margins": 0.2274044305086136, + "rewards/rejected": -0.36224251985549927, + "step": 900 + }, + { + "epoch": 0.36, + "learning_rate": 4.0606348983917924e-06, + "logits/chosen": -1.3503175973892212, + "logits/rejected": -0.9185010194778442, + "logps/chosen": -610.7164306640625, + "logps/rejected": -1734.915771484375, + "loss": 0.1352, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10167312622070312, + "rewards/margins": 0.36634570360183716, + "rewards/rejected": -0.4680188298225403, + "step": 910 + }, + { + "epoch": 0.36, + "learning_rate": 4.03375444356288e-06, + "logits/chosen": -1.4071118831634521, + "logits/rejected": -0.8690752983093262, + "logps/chosen": -836.24169921875, + "logps/rejected": -1863.6539306640625, + "loss": 0.2307, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1224076971411705, + "rewards/margins": 0.3151033818721771, + "rewards/rejected": -0.43751105666160583, + "step": 920 + }, + { + "epoch": 0.36, + "learning_rate": 4.006586590948141e-06, + "logits/chosen": -1.3949382305145264, + "logits/rejected": -0.680055558681488, + "logps/chosen": -666.8121948242188, + "logps/rejected": -1796.386474609375, + "loss": 0.199, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07509482651948929, + "rewards/margins": 0.31507402658462524, + "rewards/rejected": -0.39016884565353394, + "step": 930 + }, + { + "epoch": 0.37, + "learning_rate": 3.979136431310781e-06, + "logits/chosen": -1.4007041454315186, + "logits/rejected": -0.44923824071884155, + "logps/chosen": -629.3880615234375, + "logps/rejected": -1281.4881591796875, + "loss": 0.27, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06840632110834122, + "rewards/margins": 0.14729034900665283, + "rewards/rejected": -0.21569669246673584, + "step": 940 + }, + { + "epoch": 0.37, + "learning_rate": 3.951409108313223e-06, + "logits/chosen": -1.3141412734985352, + "logits/rejected": -0.3359532654285431, + "logps/chosen": -682.4598999023438, + "logps/rejected": -1479.57763671875, + "loss": 0.2002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06678664684295654, + "rewards/margins": 0.18387752771377563, + "rewards/rejected": -0.2506641745567322, + "step": 950 + }, + { + "epoch": 0.38, + "learning_rate": 3.923409817553284e-06, + "logits/chosen": -1.26377534866333, + "logits/rejected": -0.5578689575195312, + "logps/chosen": -753.383056640625, + "logps/rejected": -1470.470458984375, + "loss": 0.1909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08720171451568604, + "rewards/margins": 0.24566006660461426, + "rewards/rejected": -0.3328618109226227, + "step": 960 + }, + { + "epoch": 0.38, + "learning_rate": 3.895143805590609e-06, + "logits/chosen": -1.5301742553710938, + "logits/rejected": -0.33912280201911926, + "logps/chosen": -788.5135498046875, + "logps/rejected": -1906.780029296875, + "loss": 0.2098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10219583660364151, + "rewards/margins": 0.3592928946018219, + "rewards/rejected": -0.4614887833595276, + "step": 970 + }, + { + "epoch": 0.38, + "learning_rate": 3.8666163689635614e-06, + "logits/chosen": -1.4293967485427856, + "logits/rejected": -0.766064465045929, + "logps/chosen": -697.79443359375, + "logps/rejected": -1692.085693359375, + "loss": 0.2074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10171394050121307, + "rewards/margins": 0.3115464448928833, + "rewards/rejected": -0.41326045989990234, + "step": 980 + }, + { + "epoch": 0.39, + "learning_rate": 3.837832853196751e-06, + "logits/chosen": -1.4031484127044678, + "logits/rejected": -0.46277111768722534, + "logps/chosen": -741.0556030273438, + "logps/rejected": -1712.839111328125, + "loss": 0.1786, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10311299562454224, + "rewards/margins": 0.2798925042152405, + "rewards/rejected": -0.3830054700374603, + "step": 990 + }, + { + "epoch": 0.39, + "learning_rate": 3.808798651799377e-06, + "logits/chosen": -1.4064973592758179, + "logits/rejected": -0.5826825499534607, + "logps/chosen": -687.228271484375, + "logps/rejected": -1728.9072265625, + "loss": 0.1515, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10456766188144684, + "rewards/margins": 0.31050539016723633, + "rewards/rejected": -0.4150730073451996, + "step": 1000 + }, + { + "epoch": 0.4, + "learning_rate": 3.7795192052545805e-06, + "logits/chosen": -1.3606574535369873, + "logits/rejected": -0.26507607102394104, + "logps/chosen": -657.6034545898438, + "logps/rejected": -1711.599365234375, + "loss": 0.1027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11261602491140366, + "rewards/margins": 0.3513622283935547, + "rewards/rejected": -0.46397823095321655, + "step": 1010 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.3821978569030762, + "logits/rejected": -0.846422016620636, + "logps/chosen": -700.0028076171875, + "logps/rejected": -1700.7777099609375, + "loss": 0.188, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1201629489660263, + "rewards/margins": 0.2851884663105011, + "rewards/rejected": -0.4053514003753662, + "step": 1020 + }, + { + "epoch": 0.4, + "learning_rate": 3.7202465673997123e-06, + "logits/chosen": -1.327423334121704, + "logits/rejected": -0.4249703884124756, + "logps/chosen": -733.533935546875, + "logps/rejected": -1811.9857177734375, + "loss": 0.2335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15838567912578583, + "rewards/margins": 0.3160237669944763, + "rewards/rejected": -0.47440940141677856, + "step": 1030 + }, + { + "epoch": 0.41, + "learning_rate": 3.6902644827077504e-06, + "logits/chosen": -1.163883924484253, + "logits/rejected": -0.564578652381897, + "logps/chosen": -714.31591796875, + "logps/rejected": -1658.974609375, + "loss": 0.204, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1494341641664505, + "rewards/margins": 0.2751534581184387, + "rewards/rejected": -0.4245876669883728, + "step": 1040 + }, + { + "epoch": 0.41, + "learning_rate": 3.660059364023409e-06, + "logits/chosen": -1.1056033372879028, + "logits/rejected": -0.6749047040939331, + "logps/chosen": -836.0635986328125, + "logps/rejected": -1795.9320068359375, + "loss": 0.1381, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13029779493808746, + "rewards/margins": 0.3451148271560669, + "rewards/rejected": -0.47541260719299316, + "step": 1050 + }, + { + "epoch": 0.42, + "learning_rate": 3.6296368712385084e-06, + "logits/chosen": -1.2282450199127197, + "logits/rejected": 0.033928144723176956, + "logps/chosen": -668.1098022460938, + "logps/rejected": -1750.6011962890625, + "loss": 0.187, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12075567245483398, + "rewards/margins": 0.3676701486110687, + "rewards/rejected": -0.4884257912635803, + "step": 1060 + }, + { + "epoch": 0.42, + "learning_rate": 3.599002704976835e-06, + "logits/chosen": -1.513203501701355, + "logits/rejected": -0.3770269453525543, + "logps/chosen": -774.125244140625, + "logps/rejected": -1470.924072265625, + "loss": 0.2331, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0825313851237297, + "rewards/margins": 0.2151786983013153, + "rewards/rejected": -0.297710120677948, + "step": 1070 + }, + { + "epoch": 0.42, + "learning_rate": 3.5681626055259526e-06, + "logits/chosen": -1.351539134979248, + "logits/rejected": 0.01821332611143589, + "logps/chosen": -615.5689086914062, + "logps/rejected": -1394.30859375, + "loss": 0.1882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04903438687324524, + "rewards/margins": 0.17228753864765167, + "rewards/rejected": -0.22132191061973572, + "step": 1080 + }, + { + "epoch": 0.43, + "learning_rate": 3.5371223517615684e-06, + "logits/chosen": -1.1955583095550537, + "logits/rejected": -0.7964296340942383, + "logps/chosen": -650.0599365234375, + "logps/rejected": -1640.6591796875, + "loss": 0.1666, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.061278946697711945, + "rewards/margins": 0.23827362060546875, + "rewards/rejected": -0.2995525896549225, + "step": 1090 + }, + { + "epoch": 0.43, + "learning_rate": 3.5058877600646814e-06, + "logits/chosen": -1.5846580266952515, + "logits/rejected": -0.4390091896057129, + "logps/chosen": -774.6456298828125, + "logps/rejected": -1672.4420166015625, + "loss": 0.1899, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09766945987939835, + "rewards/margins": 0.26369303464889526, + "rewards/rejected": -0.3613625466823578, + "step": 1100 + }, + { + "epoch": 0.44, + "learning_rate": 3.4744646832316985e-06, + "logits/chosen": -1.1662776470184326, + "logits/rejected": -0.2102310210466385, + "logps/chosen": -793.6665649414062, + "logps/rejected": -1921.721923828125, + "loss": 0.1516, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.128018319606781, + "rewards/margins": 0.35574427247047424, + "rewards/rejected": -0.48376256227493286, + "step": 1110 + }, + { + "epoch": 0.44, + "learning_rate": 3.442859009377724e-06, + "logits/chosen": -1.2999095916748047, + "logits/rejected": -0.5450000762939453, + "logps/chosen": -756.6891479492188, + "logps/rejected": -1727.3140869140625, + "loss": 0.2095, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12410111725330353, + "rewards/margins": 0.2954896092414856, + "rewards/rejected": -0.41959071159362793, + "step": 1120 + }, + { + "epoch": 0.44, + "learning_rate": 3.4110766608332347e-06, + "logits/chosen": -1.3748492002487183, + "logits/rejected": -0.4282529950141907, + "logps/chosen": -715.91064453125, + "logps/rejected": -1581.970703125, + "loss": 0.2029, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10205087810754776, + "rewards/margins": 0.2136324942111969, + "rewards/rejected": -0.31568339467048645, + "step": 1130 + }, + { + "epoch": 0.45, + "learning_rate": 3.379123593034342e-06, + "logits/chosen": -1.4860260486602783, + "logits/rejected": -0.33013448119163513, + "logps/chosen": -715.021240234375, + "logps/rejected": -1671.137939453125, + "loss": 0.1657, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0988413542509079, + "rewards/margins": 0.2489662617444992, + "rewards/rejected": -0.3478075861930847, + "step": 1140 + }, + { + "epoch": 0.45, + "learning_rate": 3.3470057934068533e-06, + "logits/chosen": -1.4496772289276123, + "logits/rejected": -0.6596914529800415, + "logps/chosen": -673.6126098632812, + "logps/rejected": -1665.568603515625, + "loss": 0.1832, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08717682957649231, + "rewards/margins": 0.2865816652774811, + "rewards/rejected": -0.3737585246562958, + "step": 1150 + }, + { + "epoch": 0.45, + "learning_rate": 3.314729280244332e-06, + "logits/chosen": -1.5033951997756958, + "logits/rejected": -0.4424918591976166, + "logps/chosen": -715.0887451171875, + "logps/rejected": -1384.922119140625, + "loss": 0.2064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12197653949260712, + "rewards/margins": 0.2576510012149811, + "rewards/rejected": -0.3796275556087494, + "step": 1160 + }, + { + "epoch": 0.46, + "learning_rate": 3.2823001015803863e-06, + "logits/chosen": -1.3551867008209229, + "logits/rejected": -0.6100107431411743, + "logps/chosen": -750.599853515625, + "logps/rejected": -1853.4273681640625, + "loss": 0.1589, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09711600840091705, + "rewards/margins": 0.3520352840423584, + "rewards/rejected": -0.44915127754211426, + "step": 1170 + }, + { + "epoch": 0.46, + "learning_rate": 3.2497243340553675e-06, + "logits/chosen": -1.0115400552749634, + "logits/rejected": -0.17798957228660583, + "logps/chosen": -745.58984375, + "logps/rejected": -1906.7685546875, + "loss": 0.2539, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14974217116832733, + "rewards/margins": 0.3422713875770569, + "rewards/rejected": -0.4920136332511902, + "step": 1180 + }, + { + "epoch": 0.47, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -1.1727737188339233, + "logits/rejected": -0.37460917234420776, + "logps/chosen": -709.9483642578125, + "logps/rejected": -1686.753173828125, + "loss": 0.1683, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10941555351018906, + "rewards/margins": 0.2822516858577728, + "rewards/rejected": -0.3916672468185425, + "step": 1190 + }, + { + "epoch": 0.47, + "learning_rate": 3.184157475180208e-06, + "logits/chosen": -1.3031466007232666, + "logits/rejected": -0.5970622301101685, + "logps/chosen": -697.8651123046875, + "logps/rejected": -1595.6754150390625, + "loss": 0.2328, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10812550783157349, + "rewards/margins": 0.23647412657737732, + "rewards/rejected": -0.3445996046066284, + "step": 1200 + }, + { + "epoch": 0.47, + "learning_rate": 3.1511786698711226e-06, + "logits/chosen": -1.3314238786697388, + "logits/rejected": 0.48418712615966797, + "logps/chosen": -731.9833984375, + "logps/rejected": -1517.853271484375, + "loss": 0.2287, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12708911299705505, + "rewards/margins": 0.23969343304634094, + "rewards/rejected": -0.3667825162410736, + "step": 1210 + }, + { + "epoch": 0.48, + "learning_rate": 3.1180778454808973e-06, + "logits/chosen": -1.289541244506836, + "logits/rejected": -0.4609376788139343, + "logps/chosen": -746.2857666015625, + "logps/rejected": -1523.1092529296875, + "loss": 0.1886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09962339699268341, + "rewards/margins": 0.28612619638442993, + "rewards/rejected": -0.38574957847595215, + "step": 1220 + }, + { + "epoch": 0.48, + "learning_rate": 3.084861204504122e-06, + "logits/chosen": -1.0148189067840576, + "logits/rejected": -0.48453038930892944, + "logps/chosen": -778.4666748046875, + "logps/rejected": -1931.9429931640625, + "loss": 0.1131, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.08722411096096039, + "rewards/margins": 0.36971360445022583, + "rewards/rejected": -0.4569377303123474, + "step": 1230 + }, + { + "epoch": 0.49, + "learning_rate": 3.051534971137315e-06, + "logits/chosen": -1.2210582494735718, + "logits/rejected": -0.43022990226745605, + "logps/chosen": -752.8408813476562, + "logps/rejected": -1476.504638671875, + "loss": 0.2269, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09663649648427963, + "rewards/margins": 0.20909292995929718, + "rewards/rejected": -0.3057294487953186, + "step": 1240 + }, + { + "epoch": 0.49, + "learning_rate": 3.0181053901126243e-06, + "logits/chosen": -1.1169403791427612, + "logits/rejected": 0.2767347991466522, + "logps/chosen": -749.15673828125, + "logps/rejected": -1505.369140625, + "loss": 0.1992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09570419043302536, + "rewards/margins": 0.19496819376945496, + "rewards/rejected": -0.2906723916530609, + "step": 1250 + }, + { + "epoch": 0.49, + "learning_rate": 2.9845787255276753e-06, + "logits/chosen": -1.5088775157928467, + "logits/rejected": -0.9695127606391907, + "logps/chosen": -588.0244750976562, + "logps/rejected": -1467.4212646484375, + "loss": 0.1487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04604783654212952, + "rewards/margins": 0.28641366958618164, + "rewards/rejected": -0.33246147632598877, + "step": 1260 + }, + { + "epoch": 0.5, + "learning_rate": 2.950961259671793e-06, + "logits/chosen": -1.50933837890625, + "logits/rejected": -0.6869689226150513, + "logps/chosen": -710.8389892578125, + "logps/rejected": -1601.9041748046875, + "loss": 0.2032, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0675523430109024, + "rewards/margins": 0.27207452058792114, + "rewards/rejected": -0.33962687849998474, + "step": 1270 + }, + { + "epoch": 0.5, + "learning_rate": 2.917259291848814e-06, + "logits/chosen": -1.4775984287261963, + "logits/rejected": -0.3601114749908447, + "logps/chosen": -680.5808715820312, + "logps/rejected": -1640.981689453125, + "loss": 0.2072, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06015778332948685, + "rewards/margins": 0.2646317481994629, + "rewards/rejected": -0.32478955388069153, + "step": 1280 + }, + { + "epoch": 0.51, + "learning_rate": 2.883479137196714e-06, + "logits/chosen": -1.826909065246582, + "logits/rejected": -0.6638845801353455, + "logps/chosen": -696.27734375, + "logps/rejected": -1482.72119140625, + "loss": 0.184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05563250929117203, + "rewards/margins": 0.2452922761440277, + "rewards/rejected": -0.30092480778694153, + "step": 1290 + }, + { + "epoch": 0.51, + "learning_rate": 2.849627125504262e-06, + "logits/chosen": -1.374955415725708, + "logits/rejected": -0.20216119289398193, + "logps/chosen": -578.1390380859375, + "logps/rejected": -1516.6820068359375, + "loss": 0.1879, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0654110461473465, + "rewards/margins": 0.27813297510147095, + "rewards/rejected": -0.34354403614997864, + "step": 1300 + }, + { + "epoch": 0.51, + "learning_rate": 2.8157096000249334e-06, + "logits/chosen": -1.5065643787384033, + "logits/rejected": -0.7829849123954773, + "logps/chosen": -630.3825073242188, + "logps/rejected": -1537.6138916015625, + "loss": 0.2042, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.061698682606220245, + "rewards/margins": 0.2721042037010193, + "rewards/rejected": -0.33380287885665894, + "step": 1310 + }, + { + "epoch": 0.52, + "learning_rate": 2.7817329162883033e-06, + "logits/chosen": -1.471840500831604, + "logits/rejected": -0.21121864020824432, + "logps/chosen": -743.4503173828125, + "logps/rejected": -1590.5904541015625, + "loss": 0.1418, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.055032916367053986, + "rewards/margins": 0.24513819813728333, + "rewards/rejected": -0.3001710772514343, + "step": 1320 + }, + { + "epoch": 0.52, + "learning_rate": 2.747703440909128e-06, + "logits/chosen": -1.6148380041122437, + "logits/rejected": -0.6764585375785828, + "logps/chosen": -709.3273315429688, + "logps/rejected": -1805.447021484375, + "loss": 0.1334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.030007129535079002, + "rewards/margins": 0.361659973859787, + "rewards/rejected": -0.39166706800460815, + "step": 1330 + }, + { + "epoch": 0.53, + "learning_rate": 2.713627550394363e-06, + "logits/chosen": -1.3852078914642334, + "logits/rejected": -0.6749362945556641, + "logps/chosen": -686.713623046875, + "logps/rejected": -1515.2998046875, + "loss": 0.1806, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05257093161344528, + "rewards/margins": 0.26767003536224365, + "rewards/rejected": -0.3202410042285919, + "step": 1340 + }, + { + "epoch": 0.53, + "learning_rate": 2.679511629948319e-06, + "logits/chosen": -1.352468729019165, + "logits/rejected": -0.6524327993392944, + "logps/chosen": -796.8145751953125, + "logps/rejected": -1669.538330078125, + "loss": 0.2095, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10906052589416504, + "rewards/margins": 0.23847489058971405, + "rewards/rejected": -0.3475354313850403, + "step": 1350 + }, + { + "epoch": 0.53, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -1.554595708847046, + "logits/rejected": 0.08318161964416504, + "logps/chosen": -606.7230224609375, + "logps/rejected": -1430.286865234375, + "loss": 0.2034, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04838673770427704, + "rewards/margins": 0.2375943958759308, + "rewards/rejected": -0.28598111867904663, + "step": 1360 + }, + { + "epoch": 0.54, + "learning_rate": 2.6111852763861763e-06, + "logits/chosen": -1.3457515239715576, + "logits/rejected": -0.39270055294036865, + "logps/chosen": -752.8702392578125, + "logps/rejected": -1860.8333740234375, + "loss": 0.1234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.04639287292957306, + "rewards/margins": 0.38252198696136475, + "rewards/rejected": -0.4289148449897766, + "step": 1370 + }, + { + "epoch": 0.54, + "learning_rate": 2.576987646390426e-06, + "logits/chosen": -1.5459932088851929, + "logits/rejected": -0.5794991254806519, + "logps/chosen": -691.588134765625, + "logps/rejected": -1757.112548828125, + "loss": 0.1192, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04728538915514946, + "rewards/margins": 0.32668638229370117, + "rewards/rejected": -0.3739717900753021, + "step": 1380 + }, + { + "epoch": 0.55, + "learning_rate": 2.542775590305023e-06, + "logits/chosen": -1.304917573928833, + "logits/rejected": -0.4121823310852051, + "logps/chosen": -630.0661010742188, + "logps/rejected": -1441.1773681640625, + "loss": 0.2289, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03756406903266907, + "rewards/margins": 0.20802605152130127, + "rewards/rejected": -0.24559013545513153, + "step": 1390 + }, + { + "epoch": 0.55, + "learning_rate": 2.5085555188492384e-06, + "logits/chosen": -1.2159336805343628, + "logits/rejected": -0.3775702118873596, + "logps/chosen": -709.61376953125, + "logps/rejected": -1723.700927734375, + "loss": 0.1568, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10398067533969879, + "rewards/margins": 0.2780481278896332, + "rewards/rejected": -0.38202884793281555, + "step": 1400 + }, + { + "epoch": 0.55, + "learning_rate": 2.474333844244276e-06, + "logits/chosen": -1.2202876806259155, + "logits/rejected": -0.35152697563171387, + "logps/chosen": -818.2611083984375, + "logps/rejected": -1743.5579833984375, + "loss": 0.1788, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09500784426927567, + "rewards/margins": 0.300513356924057, + "rewards/rejected": -0.39552122354507446, + "step": 1410 + }, + { + "epoch": 0.56, + "learning_rate": 2.440116979011743e-06, + "logits/chosen": -1.4342302083969116, + "logits/rejected": -0.45796999335289, + "logps/chosen": -718.6922607421875, + "logps/rejected": -1725.5560302734375, + "loss": 0.197, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05935473367571831, + "rewards/margins": 0.324557363986969, + "rewards/rejected": -0.383912056684494, + "step": 1420 + }, + { + "epoch": 0.56, + "learning_rate": 2.4059113347720573e-06, + "logits/chosen": -1.5391137599945068, + "logits/rejected": -0.13381418585777283, + "logps/chosen": -690.8306884765625, + "logps/rejected": -1534.461181640625, + "loss": 0.1946, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10408179461956024, + "rewards/margins": 0.27237391471862793, + "rewards/rejected": -0.376455694437027, + "step": 1430 + }, + { + "epoch": 0.56, + "learning_rate": 2.3717233210430258e-06, + "logits/chosen": -1.308176875114441, + "logits/rejected": -0.5252507925033569, + "logps/chosen": -736.7550659179688, + "logps/rejected": -1814.572021484375, + "loss": 0.1699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1053951233625412, + "rewards/margins": 0.3335246741771698, + "rewards/rejected": -0.4389197826385498, + "step": 1440 + }, + { + "epoch": 0.57, + "learning_rate": 2.337559344038817e-06, + "logits/chosen": -1.2826203107833862, + "logits/rejected": 0.2594057321548462, + "logps/chosen": -654.9820556640625, + "logps/rejected": -1548.369384765625, + "loss": 0.1628, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10001038014888763, + "rewards/margins": 0.2573556900024414, + "rewards/rejected": -0.35736608505249023, + "step": 1450 + }, + { + "epoch": 0.57, + "learning_rate": 2.303425805469554e-06, + "logits/chosen": -1.2893702983856201, + "logits/rejected": -0.615670382976532, + "logps/chosen": -686.9696044921875, + "logps/rejected": -1765.0921630859375, + "loss": 0.1341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06719444692134857, + "rewards/margins": 0.3570956885814667, + "rewards/rejected": -0.42429018020629883, + "step": 1460 + }, + { + "epoch": 0.58, + "learning_rate": 2.269329101341745e-06, + "logits/chosen": -1.5257200002670288, + "logits/rejected": -0.8465067744255066, + "logps/chosen": -722.9954833984375, + "logps/rejected": -1763.6884765625, + "loss": 0.1296, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0741962268948555, + "rewards/margins": 0.36958009004592896, + "rewards/rejected": -0.44377630949020386, + "step": 1470 + }, + { + "epoch": 0.58, + "learning_rate": 2.235275620759797e-06, + "logits/chosen": -1.3611409664154053, + "logits/rejected": 0.612551748752594, + "logps/chosen": -703.1578979492188, + "logps/rejected": -1591.042236328125, + "loss": 0.1764, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10201771557331085, + "rewards/margins": 0.2424260824918747, + "rewards/rejected": -0.34444376826286316, + "step": 1480 + }, + { + "epoch": 0.58, + "learning_rate": 2.2012717447288037e-06, + "logits/chosen": -1.3054463863372803, + "logits/rejected": -0.7033378481864929, + "logps/chosen": -731.6030883789062, + "logps/rejected": -1814.713134765625, + "loss": 0.1576, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0751316249370575, + "rewards/margins": 0.3522658348083496, + "rewards/rejected": -0.42739754915237427, + "step": 1490 + }, + { + "epoch": 0.59, + "learning_rate": 2.167323844958867e-06, + "logits/chosen": -1.524957299232483, + "logits/rejected": -0.6119885444641113, + "logps/chosen": -701.2098388671875, + "logps/rejected": -1545.368896484375, + "loss": 0.14, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10369672626256943, + "rewards/margins": 0.28280869126319885, + "rewards/rejected": -0.3865054249763489, + "step": 1500 + }, + { + "epoch": 0.59, + "learning_rate": 2.133438282671149e-06, + "logits/chosen": -1.2132611274719238, + "logits/rejected": -0.7082799673080444, + "logps/chosen": -762.6727294921875, + "logps/rejected": -1658.924072265625, + "loss": 0.1803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1439850628376007, + "rewards/margins": 0.27077409625053406, + "rewards/rejected": -0.41475915908813477, + "step": 1510 + }, + { + "epoch": 0.6, + "learning_rate": 2.0996214074059033e-06, + "logits/chosen": -1.6239715814590454, + "logits/rejected": -0.5037415623664856, + "logps/chosen": -786.1912841796875, + "logps/rejected": -1638.0843505859375, + "loss": 0.2179, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07825516164302826, + "rewards/margins": 0.2860822379589081, + "rewards/rejected": -0.36433738470077515, + "step": 1520 + }, + { + "epoch": 0.6, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.3029212951660156, + "logits/rejected": -0.10125327110290527, + "logps/chosen": -724.8988647460938, + "logps/rejected": -1521.581787109375, + "loss": 0.1988, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05552230030298233, + "rewards/margins": 0.29605624079704285, + "rewards/rejected": -0.3515785336494446, + "step": 1530 + }, + { + "epoch": 0.6, + "learning_rate": 2.0322190505629297e-06, + "logits/chosen": -1.1891577243804932, + "logits/rejected": -0.263233482837677, + "logps/chosen": -726.5543212890625, + "logps/rejected": -1851.503662109375, + "loss": 0.1454, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.10788372904062271, + "rewards/margins": 0.327489972114563, + "rewards/rejected": -0.4353737235069275, + "step": 1540 + }, + { + "epoch": 0.61, + "learning_rate": 1.998646198965312e-06, + "logits/chosen": -1.376450777053833, + "logits/rejected": -0.22948014736175537, + "logps/chosen": -596.0374755859375, + "logps/rejected": -1520.2818603515625, + "loss": 0.2496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.062327928841114044, + "rewards/margins": 0.3048885762691498, + "rewards/rejected": -0.3672165274620056, + "step": 1550 + }, + { + "epoch": 0.61, + "learning_rate": 1.965167291983757e-06, + "logits/chosen": -1.6274656057357788, + "logits/rejected": -0.2617906928062439, + "logps/chosen": -786.1827392578125, + "logps/rejected": -1801.614990234375, + "loss": 0.1203, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10416553169488907, + "rewards/margins": 0.327767550945282, + "rewards/rejected": -0.43193307518959045, + "step": 1560 + }, + { + "epoch": 0.62, + "learning_rate": 1.931788602958678e-06, + "logits/chosen": -0.9874919652938843, + "logits/rejected": 0.055336445569992065, + "logps/chosen": -801.8827514648438, + "logps/rejected": -1887.7252197265625, + "loss": 0.1647, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1263236552476883, + "rewards/margins": 0.3318944573402405, + "rewards/rejected": -0.4582180976867676, + "step": 1570 + }, + { + "epoch": 0.62, + "learning_rate": 1.8985163864514644e-06, + "logits/chosen": -1.4952738285064697, + "logits/rejected": -0.03670965135097504, + "logps/chosen": -776.7321166992188, + "logps/rejected": -1846.3646240234375, + "loss": 0.1433, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11145295947790146, + "rewards/margins": 0.3160027265548706, + "rewards/rejected": -0.42745572328567505, + "step": 1580 + }, + { + "epoch": 0.62, + "learning_rate": 1.8653568770724805e-06, + "logits/chosen": -1.352738618850708, + "logits/rejected": -0.2683241367340088, + "logps/chosen": -648.5192260742188, + "logps/rejected": -1464.099365234375, + "loss": 0.185, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.08493933826684952, + "rewards/margins": 0.2524186968803406, + "rewards/rejected": -0.3373579978942871, + "step": 1590 + }, + { + "epoch": 0.63, + "learning_rate": 1.8323162883128211e-06, + "logits/chosen": -1.419662356376648, + "logits/rejected": -0.4111382067203522, + "logps/chosen": -699.5247802734375, + "logps/rejected": -1743.6064453125, + "loss": 0.1541, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08453786373138428, + "rewards/margins": 0.296464741230011, + "rewards/rejected": -0.38100260496139526, + "step": 1600 + }, + { + "epoch": 0.63, + "learning_rate": 1.7994008113800105e-06, + "logits/chosen": -1.5189629793167114, + "logits/rejected": -0.9077790975570679, + "logps/chosen": -701.3331298828125, + "logps/rejected": -1603.5174560546875, + "loss": 0.1429, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08178045600652695, + "rewards/margins": 0.3028547167778015, + "rewards/rejected": -0.38463518023490906, + "step": 1610 + }, + { + "epoch": 0.64, + "learning_rate": 1.7666166140378853e-06, + "logits/chosen": -1.169510841369629, + "logits/rejected": 0.19725301861763, + "logps/chosen": -734.7293090820312, + "logps/rejected": -1564.332763671875, + "loss": 0.1528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09660240262746811, + "rewards/margins": 0.28595981001853943, + "rewards/rejected": -0.38256219029426575, + "step": 1620 + }, + { + "epoch": 0.64, + "learning_rate": 1.7339698394508632e-06, + "logits/chosen": -1.266775369644165, + "logits/rejected": -0.6185767650604248, + "logps/chosen": -627.6648559570312, + "logps/rejected": -1780.268310546875, + "loss": 0.1694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0766761377453804, + "rewards/margins": 0.3623715043067932, + "rewards/rejected": -0.43904757499694824, + "step": 1630 + }, + { + "epoch": 0.64, + "learning_rate": 1.7014666050328325e-06, + "logits/chosen": -1.5317351818084717, + "logits/rejected": -0.46623557806015015, + "logps/chosen": -639.0328369140625, + "logps/rejected": -1635.7354736328125, + "loss": 0.126, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07408356666564941, + "rewards/margins": 0.34189721941947937, + "rewards/rejected": -0.41598081588745117, + "step": 1640 + }, + { + "epoch": 0.65, + "learning_rate": 1.6691130013008514e-06, + "logits/chosen": -1.421917200088501, + "logits/rejected": -0.19839780032634735, + "logps/chosen": -837.2825317382812, + "logps/rejected": -1678.8179931640625, + "loss": 0.1956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08556106686592102, + "rewards/margins": 0.2512792646884918, + "rewards/rejected": -0.33684033155441284, + "step": 1650 + }, + { + "epoch": 0.65, + "learning_rate": 1.6369150907339007e-06, + "logits/chosen": -1.195821762084961, + "logits/rejected": -0.20372645556926727, + "logps/chosen": -709.2095336914062, + "logps/rejected": -1652.1871337890625, + "loss": 0.1906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07075698673725128, + "rewards/margins": 0.289537250995636, + "rewards/rejected": -0.36029425263404846, + "step": 1660 + }, + { + "epoch": 0.65, + "learning_rate": 1.6048789066368858e-06, + "logits/chosen": -1.354961633682251, + "logits/rejected": -0.20124280452728271, + "logps/chosen": -728.2799072265625, + "logps/rejected": -1569.3551025390625, + "loss": 0.1916, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08767645061016083, + "rewards/margins": 0.2705709636211395, + "rewards/rejected": -0.35824739933013916, + "step": 1670 + }, + { + "epoch": 0.66, + "learning_rate": 1.5730104520100984e-06, + "logits/chosen": -1.496524453163147, + "logits/rejected": -0.8575867414474487, + "logps/chosen": -612.16650390625, + "logps/rejected": -1632.1365966796875, + "loss": 0.1279, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06737435609102249, + "rewards/margins": 0.3229941725730896, + "rewards/rejected": -0.3903685212135315, + "step": 1680 + }, + { + "epoch": 0.66, + "learning_rate": 1.5413156984243715e-06, + "logits/chosen": -1.3209052085876465, + "logits/rejected": -0.12577922642230988, + "logps/chosen": -759.8672485351562, + "logps/rejected": -1498.656494140625, + "loss": 0.1552, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10295001417398453, + "rewards/margins": 0.21431489288806915, + "rewards/rejected": -0.3172649145126343, + "step": 1690 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.1863139867782593, + "logits/rejected": -0.08450505882501602, + "logps/chosen": -838.8494873046875, + "logps/rejected": -1536.5277099609375, + "loss": 0.167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12472305446863174, + "rewards/margins": 0.24718424677848816, + "rewards/rejected": -0.3719072937965393, + "step": 1700 + }, + { + "epoch": 0.67, + "learning_rate": 1.4784710168044215e-06, + "logits/chosen": -1.369985818862915, + "logits/rejected": -0.5248149037361145, + "logps/chosen": -883.6121826171875, + "logps/rejected": -1617.128662109375, + "loss": 0.1984, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1338253617286682, + "rewards/margins": 0.2511526942253113, + "rewards/rejected": -0.3849780857563019, + "step": 1710 + }, + { + "epoch": 0.67, + "learning_rate": 1.4473328647245726e-06, + "logits/chosen": -1.624087929725647, + "logits/rejected": -0.42871198058128357, + "logps/chosen": -694.0233764648438, + "logps/rejected": -1572.922119140625, + "loss": 0.2198, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1324300318956375, + "rewards/margins": 0.2687898874282837, + "rewards/rejected": -0.4012199342250824, + "step": 1720 + }, + { + "epoch": 0.68, + "learning_rate": 1.4163919633879325e-06, + "logits/chosen": -1.4249976873397827, + "logits/rejected": -0.46216440200805664, + "logps/chosen": -831.1329956054688, + "logps/rejected": -1623.590087890625, + "loss": 0.2073, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09118635952472687, + "rewards/margins": 0.26622968912124634, + "rewards/rejected": -0.357416033744812, + "step": 1730 + }, + { + "epoch": 0.68, + "learning_rate": 1.3856541105586545e-06, + "logits/chosen": -1.5596380233764648, + "logits/rejected": -0.4608355462551117, + "logps/chosen": -826.0984497070312, + "logps/rejected": -1898.5416259765625, + "loss": 0.1421, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1307828575372696, + "rewards/margins": 0.34290483593940735, + "rewards/rejected": -0.47368764877319336, + "step": 1740 + }, + { + "epoch": 0.69, + "learning_rate": 1.3551250659532853e-06, + "logits/chosen": -1.492356300354004, + "logits/rejected": -0.7112780809402466, + "logps/chosen": -699.1672973632812, + "logps/rejected": -1537.228271484375, + "loss": 0.1776, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06596329063177109, + "rewards/margins": 0.2567977011203766, + "rewards/rejected": -0.32276099920272827, + "step": 1750 + }, + { + "epoch": 0.69, + "learning_rate": 1.3248105501614897e-06, + "logits/chosen": -1.2990128993988037, + "logits/rejected": -0.7208808660507202, + "logps/chosen": -714.08544921875, + "logps/rejected": -1732.0875244140625, + "loss": 0.2147, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05500803142786026, + "rewards/margins": 0.26819437742233276, + "rewards/rejected": -0.32320234179496765, + "step": 1760 + }, + { + "epoch": 0.69, + "learning_rate": 1.2947162435741278e-06, + "logits/chosen": -1.1586157083511353, + "logits/rejected": 0.03688998147845268, + "logps/chosen": -734.365966796875, + "logps/rejected": -1622.6265869140625, + "loss": 0.2471, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1007305383682251, + "rewards/margins": 0.197604700922966, + "rewards/rejected": -0.2983352243900299, + "step": 1770 + }, + { + "epoch": 0.7, + "learning_rate": 1.2648477853188395e-06, + "logits/chosen": -1.412379503250122, + "logits/rejected": -0.5264952778816223, + "logps/chosen": -698.6842651367188, + "logps/rejected": -1511.8642578125, + "loss": 0.1865, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.042982958257198334, + "rewards/margins": 0.26162266731262207, + "rewards/rejected": -0.304605633020401, + "step": 1780 + }, + { + "epoch": 0.7, + "learning_rate": 1.2352107722033842e-06, + "logits/chosen": -1.2586696147918701, + "logits/rejected": -0.15170638263225555, + "logps/chosen": -653.3026123046875, + "logps/rejected": -1529.8104248046875, + "loss": 0.1549, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0475340336561203, + "rewards/margins": 0.275061696767807, + "rewards/rejected": -0.3225957453250885, + "step": 1790 + }, + { + "epoch": 0.71, + "learning_rate": 1.205810757666894e-06, + "logits/chosen": -1.3673145771026611, + "logits/rejected": -0.4642263948917389, + "logps/chosen": -588.0513916015625, + "logps/rejected": -1447.431396484375, + "loss": 0.1613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07587876915931702, + "rewards/margins": 0.24527780711650848, + "rewards/rejected": -0.3211565613746643, + "step": 1800 + }, + { + "epoch": 0.71, + "learning_rate": 1.176653250739265e-06, + "logits/chosen": -1.4524450302124023, + "logits/rejected": -0.21896734833717346, + "logps/chosen": -831.2824096679688, + "logps/rejected": -1819.2064208984375, + "loss": 0.1362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09457580000162125, + "rewards/margins": 0.29631510376930237, + "rewards/rejected": -0.390890896320343, + "step": 1810 + }, + { + "epoch": 0.71, + "learning_rate": 1.1477437150088599e-06, + "logits/chosen": -1.112823247909546, + "logits/rejected": -0.731514573097229, + "logps/chosen": -659.6626586914062, + "logps/rejected": -1812.48828125, + "loss": 0.1304, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05299247428774834, + "rewards/margins": 0.3928179442882538, + "rewards/rejected": -0.4458104074001312, + "step": 1820 + }, + { + "epoch": 0.72, + "learning_rate": 1.1190875675987355e-06, + "logits/chosen": -1.3094470500946045, + "logits/rejected": -0.5637291073799133, + "logps/chosen": -753.520263671875, + "logps/rejected": -1700.703857421875, + "loss": 0.1502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09490607678890228, + "rewards/margins": 0.31419411301612854, + "rewards/rejected": -0.409100204706192, + "step": 1830 + }, + { + "epoch": 0.72, + "learning_rate": 1.0906901781515695e-06, + "logits/chosen": -1.550244927406311, + "logits/rejected": -0.08849823474884033, + "logps/chosen": -724.5099487304688, + "logps/rejected": -1681.033447265625, + "loss": 0.1606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08625416457653046, + "rewards/margins": 0.31124037504196167, + "rewards/rejected": -0.39749449491500854, + "step": 1840 + }, + { + "epoch": 0.73, + "learning_rate": 1.0625568678234839e-06, + "logits/chosen": -1.0879476070404053, + "logits/rejected": -0.13099336624145508, + "logps/chosen": -671.8837280273438, + "logps/rejected": -1590.70068359375, + "loss": 0.1721, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06376481801271439, + "rewards/margins": 0.2916422486305237, + "rewards/rejected": -0.35540705919265747, + "step": 1850 + }, + { + "epoch": 0.73, + "learning_rate": 1.034692908286964e-06, + "logits/chosen": -1.3455946445465088, + "logits/rejected": -0.2840282917022705, + "logps/chosen": -611.4814453125, + "logps/rejected": -1663.345703125, + "loss": 0.2039, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07838527858257294, + "rewards/margins": 0.31080400943756104, + "rewards/rejected": -0.3891892731189728, + "step": 1860 + }, + { + "epoch": 0.73, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -1.2556473016738892, + "logits/rejected": -0.011271673254668713, + "logps/chosen": -753.8445434570312, + "logps/rejected": -1636.611083984375, + "loss": 0.2112, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11006224155426025, + "rewards/margins": 0.24791212379932404, + "rewards/rejected": -0.3579743504524231, + "step": 1870 + }, + { + "epoch": 0.74, + "learning_rate": 9.797938749429088e-07, + "logits/chosen": -1.2267249822616577, + "logits/rejected": -0.35565489530563354, + "logps/chosen": -690.4405517578125, + "logps/rejected": -1600.0665283203125, + "loss": 0.1862, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11180742084980011, + "rewards/margins": 0.24852195382118225, + "rewards/rejected": -0.36032935976982117, + "step": 1880 + }, + { + "epoch": 0.74, + "learning_rate": 9.527690882192636e-07, + "logits/chosen": -1.2072794437408447, + "logits/rejected": 0.457929790019989, + "logps/chosen": -697.0407104492188, + "logps/rejected": -1490.8367919921875, + "loss": 0.1672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08405301719903946, + "rewards/margins": 0.30805063247680664, + "rewards/rejected": -0.3921036422252655, + "step": 1890 + }, + { + "epoch": 0.75, + "learning_rate": 9.260342245273507e-07, + "logits/chosen": -1.3990890979766846, + "logits/rejected": -0.6794065237045288, + "logps/chosen": -618.4937744140625, + "logps/rejected": -1800.4622802734375, + "loss": 0.1376, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07932893931865692, + "rewards/margins": 0.37783947587013245, + "rewards/rejected": -0.4571684002876282, + "step": 1900 + }, + { + "epoch": 0.75, + "learning_rate": 8.995942934960964e-07, + "logits/chosen": -1.4945213794708252, + "logits/rejected": -0.18756787478923798, + "logps/chosen": -803.509521484375, + "logps/rejected": -1819.3349609375, + "loss": 0.1544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09666319191455841, + "rewards/margins": 0.36414963006973267, + "rewards/rejected": -0.4608128070831299, + "step": 1910 + }, + { + "epoch": 0.75, + "learning_rate": 8.734542494893955e-07, + "logits/chosen": -1.431398868560791, + "logits/rejected": -0.4752410352230072, + "logps/chosen": -792.5185546875, + "logps/rejected": -1632.630126953125, + "loss": 0.2053, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09532758593559265, + "rewards/margins": 0.30739787220954895, + "rewards/rejected": -0.4027254581451416, + "step": 1920 + }, + { + "epoch": 0.76, + "learning_rate": 8.476189906777457e-07, + "logits/chosen": -1.3982821702957153, + "logits/rejected": -0.08427709341049194, + "logps/chosen": -703.8153076171875, + "logps/rejected": -1600.6046142578125, + "loss": 0.1632, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08243191242218018, + "rewards/margins": 0.2651790678501129, + "rewards/rejected": -0.3476109802722931, + "step": 1930 + }, + { + "epoch": 0.76, + "learning_rate": 8.220933581204257e-07, + "logits/chosen": -1.2576748132705688, + "logits/rejected": 0.40268439054489136, + "logps/chosen": -528.5084228515625, + "logps/rejected": -1385.6802978515625, + "loss": 0.1183, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.042114533483982086, + "rewards/margins": 0.2899821698665619, + "rewards/rejected": -0.3320966958999634, + "step": 1940 + }, + { + "epoch": 0.76, + "learning_rate": 7.968821348583644e-07, + "logits/chosen": -1.3039714097976685, + "logits/rejected": -0.34471797943115234, + "logps/chosen": -695.2639770507812, + "logps/rejected": -1490.329345703125, + "loss": 0.1969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08580182492733002, + "rewards/margins": 0.252492755651474, + "rewards/rejected": -0.3382945656776428, + "step": 1950 + }, + { + "epoch": 0.77, + "learning_rate": 7.719900450178882e-07, + "logits/chosen": -1.2936707735061646, + "logits/rejected": 0.12274640798568726, + "logps/chosen": -856.3453979492188, + "logps/rejected": -1843.365966796875, + "loss": 0.1424, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11615820974111557, + "rewards/margins": 0.33368679881095886, + "rewards/rejected": -0.44984501600265503, + "step": 1960 + }, + { + "epoch": 0.77, + "learning_rate": 7.474217529255018e-07, + "logits/chosen": -1.611425757408142, + "logits/rejected": -0.11960859596729279, + "logps/chosen": -636.3781127929688, + "logps/rejected": -1416.641357421875, + "loss": 0.1836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03691656142473221, + "rewards/margins": 0.26324373483657837, + "rewards/rejected": -0.30016031861305237, + "step": 1970 + }, + { + "epoch": 0.78, + "learning_rate": 7.231818622338824e-07, + "logits/chosen": -1.616742730140686, + "logits/rejected": -0.024957846850156784, + "logps/chosen": -676.9722900390625, + "logps/rejected": -1823.395751953125, + "loss": 0.1225, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.06905169785022736, + "rewards/margins": 0.3410964906215668, + "rewards/rejected": -0.4101482033729553, + "step": 1980 + }, + { + "epoch": 0.78, + "learning_rate": 6.992749150592343e-07, + "logits/chosen": -1.2690980434417725, + "logits/rejected": -0.1918954849243164, + "logps/chosen": -866.05029296875, + "logps/rejected": -1606.396240234375, + "loss": 0.1865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.102320097386837, + "rewards/margins": 0.27621665596961975, + "rewards/rejected": -0.37853676080703735, + "step": 1990 + }, + { + "epoch": 0.78, + "learning_rate": 6.75705391130183e-07, + "logits/chosen": -1.2711069583892822, + "logits/rejected": -0.00027151108952239156, + "logps/chosen": -804.7188720703125, + "logps/rejected": -1668.5374755859375, + "loss": 0.166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07086384296417236, + "rewards/margins": 0.2957269251346588, + "rewards/rejected": -0.3665907680988312, + "step": 2000 + }, + { + "epoch": 0.79, + "learning_rate": 6.524777069483526e-07, + "logits/chosen": -1.225556492805481, + "logits/rejected": 0.41769227385520935, + "logps/chosen": -634.6071166992188, + "logps/rejected": -1566.803466796875, + "loss": 0.1916, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.020447371527552605, + "rewards/margins": 0.31025153398513794, + "rewards/rejected": -0.3306989073753357, + "step": 2010 + }, + { + "epoch": 0.79, + "learning_rate": 6.29596214960792e-07, + "logits/chosen": -1.3543643951416016, + "logits/rejected": -0.1612066775560379, + "logps/chosen": -731.138671875, + "logps/rejected": -1658.898193359375, + "loss": 0.1954, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.078438401222229, + "rewards/margins": 0.2861797511577606, + "rewards/rejected": -0.36461812257766724, + "step": 2020 + }, + { + "epoch": 0.8, + "learning_rate": 6.070652027444102e-07, + "logits/chosen": -1.5058627128601074, + "logits/rejected": -0.940344512462616, + "logps/chosen": -629.819580078125, + "logps/rejected": -1781.6654052734375, + "loss": 0.1992, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.061135418713092804, + "rewards/margins": 0.3336263597011566, + "rewards/rejected": -0.39476174116134644, + "step": 2030 + }, + { + "epoch": 0.8, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -1.524287462234497, + "logits/rejected": -0.8633726239204407, + "logps/chosen": -602.5631103515625, + "logps/rejected": -1604.6434326171875, + "loss": 0.1721, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05612843483686447, + "rewards/margins": 0.33471354842185974, + "rewards/rejected": -0.3908420205116272, + "step": 2040 + }, + { + "epoch": 0.8, + "learning_rate": 5.63071438773913e-07, + "logits/chosen": -1.4894258975982666, + "logits/rejected": -0.14880971610546112, + "logps/chosen": -642.1497802734375, + "logps/rejected": -1459.4459228515625, + "loss": 0.2064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0682179257273674, + "rewards/margins": 0.21762903034687042, + "rewards/rejected": -0.2858469486236572, + "step": 2050 + }, + { + "epoch": 0.81, + "learning_rate": 5.416169306538485e-07, + "logits/chosen": -1.3140041828155518, + "logits/rejected": 0.3596586287021637, + "logps/chosen": -820.9474487304688, + "logps/rejected": -1682.409912109375, + "loss": 0.2355, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09827397763729095, + "rewards/margins": 0.281690388917923, + "rewards/rejected": -0.3799643814563751, + "step": 2060 + }, + { + "epoch": 0.81, + "learning_rate": 5.205293880283552e-07, + "logits/chosen": -1.5573115348815918, + "logits/rejected": -0.13623039424419403, + "logps/chosen": -671.4677124023438, + "logps/rejected": -1707.608642578125, + "loss": 0.1752, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05831771343946457, + "rewards/margins": 0.3186204433441162, + "rewards/rejected": -0.37693825364112854, + "step": 2070 + }, + { + "epoch": 0.82, + "learning_rate": 4.998127623207404e-07, + "logits/chosen": -1.2270171642303467, + "logits/rejected": -0.16427640616893768, + "logps/chosen": -636.1573486328125, + "logps/rejected": -1320.9652099609375, + "loss": 0.1501, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04943504184484482, + "rewards/margins": 0.23794174194335938, + "rewards/rejected": -0.2873767912387848, + "step": 2080 + }, + { + "epoch": 0.82, + "learning_rate": 4.794709354512073e-07, + "logits/chosen": -1.4142221212387085, + "logits/rejected": -0.6630762219429016, + "logps/chosen": -694.4979858398438, + "logps/rejected": -1861.2236328125, + "loss": 0.1027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0681043490767479, + "rewards/margins": 0.33745378255844116, + "rewards/rejected": -0.40555816888809204, + "step": 2090 + }, + { + "epoch": 0.82, + "learning_rate": 4.5950771910944603e-07, + "logits/chosen": -1.386041522026062, + "logits/rejected": -0.4771800637245178, + "logps/chosen": -552.6729736328125, + "logps/rejected": -1493.256103515625, + "loss": 0.1758, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06818665564060211, + "rewards/margins": 0.2598266899585724, + "rewards/rejected": -0.3280133306980133, + "step": 2100 + }, + { + "epoch": 0.83, + "learning_rate": 4.399268540403975e-07, + "logits/chosen": -1.6429307460784912, + "logits/rejected": -0.7215126752853394, + "logps/chosen": -692.6094970703125, + "logps/rejected": -1617.4793701171875, + "loss": 0.1561, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04670856520533562, + "rewards/margins": 0.3174007534980774, + "rewards/rejected": -0.3641093373298645, + "step": 2110 + }, + { + "epoch": 0.83, + "learning_rate": 4.2073200934330316e-07, + "logits/chosen": -1.318565011024475, + "logits/rejected": 0.31595462560653687, + "logps/chosen": -688.9269409179688, + "logps/rejected": -1576.12939453125, + "loss": 0.1494, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05915086343884468, + "rewards/margins": 0.2843713164329529, + "rewards/rejected": -0.34352222084999084, + "step": 2120 + }, + { + "epoch": 0.84, + "learning_rate": 4.019267817841835e-07, + "logits/chosen": -1.4014190435409546, + "logits/rejected": 0.06803856045007706, + "logps/chosen": -661.041015625, + "logps/rejected": -1782.5843505859375, + "loss": 0.1339, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.060147546231746674, + "rewards/margins": 0.367543488740921, + "rewards/rejected": -0.4276910424232483, + "step": 2130 + }, + { + "epoch": 0.84, + "learning_rate": 3.8351469512186656e-07, + "logits/chosen": -1.293666124343872, + "logits/rejected": 0.01516579370945692, + "logps/chosen": -703.8981323242188, + "logps/rejected": -1585.350830078125, + "loss": 0.2612, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07257186621427536, + "rewards/margins": 0.24214370548725128, + "rewards/rejected": -0.31471556425094604, + "step": 2140 + }, + { + "epoch": 0.84, + "learning_rate": 3.654991994477039e-07, + "logits/chosen": -1.4482967853546143, + "logits/rejected": -0.5136088132858276, + "logps/chosen": -739.101318359375, + "logps/rejected": -1636.627685546875, + "loss": 0.2446, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.08362185955047607, + "rewards/margins": 0.2510630488395691, + "rewards/rejected": -0.33468490839004517, + "step": 2150 + }, + { + "epoch": 0.85, + "learning_rate": 3.4788367053908087e-07, + "logits/chosen": -1.464727520942688, + "logits/rejected": -0.6895856261253357, + "logps/chosen": -649.2825927734375, + "logps/rejected": -1706.675048828125, + "loss": 0.1222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0619744174182415, + "rewards/margins": 0.3182070851325989, + "rewards/rejected": -0.3801814913749695, + "step": 2160 + }, + { + "epoch": 0.85, + "learning_rate": 3.3067140922686175e-07, + "logits/chosen": -1.2893580198287964, + "logits/rejected": -0.02746570110321045, + "logps/chosen": -637.2613525390625, + "logps/rejected": -1635.1025390625, + "loss": 0.1475, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06607834994792938, + "rewards/margins": 0.306417852640152, + "rewards/rejected": -0.37249621748924255, + "step": 2170 + }, + { + "epoch": 0.85, + "learning_rate": 3.1386564077687115e-07, + "logits/chosen": -1.2429146766662598, + "logits/rejected": -0.5083945989608765, + "logps/chosen": -689.4899291992188, + "logps/rejected": -1385.622802734375, + "loss": 0.2019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08055596053600311, + "rewards/margins": 0.19417758285999298, + "rewards/rejected": -0.2747335135936737, + "step": 2180 + }, + { + "epoch": 0.86, + "learning_rate": 2.9746951428553884e-07, + "logits/chosen": -1.2200576066970825, + "logits/rejected": 0.4126719534397125, + "logps/chosen": -697.4050903320312, + "logps/rejected": -1761.609375, + "loss": 0.1621, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.057256706058979034, + "rewards/margins": 0.3544352650642395, + "rewards/rejected": -0.41169196367263794, + "step": 2190 + }, + { + "epoch": 0.86, + "learning_rate": 2.814861020898146e-07, + "logits/chosen": -1.5707600116729736, + "logits/rejected": -0.5523526668548584, + "logps/chosen": -807.9168090820312, + "logps/rejected": -1893.062255859375, + "loss": 0.12, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.036843474954366684, + "rewards/margins": 0.38658010959625244, + "rewards/rejected": -0.4234235882759094, + "step": 2200 + }, + { + "epoch": 0.87, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -1.3549106121063232, + "logits/rejected": -0.0543874129652977, + "logps/chosen": -659.9330444335938, + "logps/rejected": -1517.608642578125, + "loss": 0.1865, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08612764626741409, + "rewards/margins": 0.2554120123386383, + "rewards/rejected": -0.3415396809577942, + "step": 2210 + }, + { + "epoch": 0.87, + "learning_rate": 2.507693226958871e-07, + "logits/chosen": -1.5055897235870361, + "logits/rejected": -0.7960633635520935, + "logps/chosen": -594.6507568359375, + "logps/rejected": -1544.100341796875, + "loss": 0.1835, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05378856509923935, + "rewards/margins": 0.26992538571357727, + "rewards/rejected": -0.3237139582633972, + "step": 2220 + }, + { + "epoch": 0.87, + "learning_rate": 2.360417112654481e-07, + "logits/chosen": -1.3403241634368896, + "logits/rejected": -0.036334630101919174, + "logps/chosen": -747.6497802734375, + "logps/rejected": -1497.166748046875, + "loss": 0.2369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09375442564487457, + "rewards/margins": 0.2080894410610199, + "rewards/rejected": -0.30184388160705566, + "step": 2230 + }, + { + "epoch": 0.88, + "learning_rate": 2.2173832458762146e-07, + "logits/chosen": -1.3305310010910034, + "logits/rejected": 0.5647405385971069, + "logps/chosen": -708.2887573242188, + "logps/rejected": -1672.27734375, + "loss": 0.1525, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0794595405459404, + "rewards/margins": 0.27278995513916016, + "rewards/rejected": -0.35224950313568115, + "step": 2240 + }, + { + "epoch": 0.88, + "learning_rate": 2.07861842857843e-07, + "logits/chosen": -1.3758533000946045, + "logits/rejected": -0.3346394896507263, + "logps/chosen": -641.7481689453125, + "logps/rejected": -1659.6363525390625, + "loss": 0.1304, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05183352157473564, + "rewards/margins": 0.3061942458152771, + "rewards/rejected": -0.35802772641181946, + "step": 2250 + }, + { + "epoch": 0.89, + "learning_rate": 1.9441486627729987e-07, + "logits/chosen": -1.2939542531967163, + "logits/rejected": -0.2226782590150833, + "logps/chosen": -574.517822265625, + "logps/rejected": -1345.4554443359375, + "loss": 0.2427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.044371530413627625, + "rewards/margins": 0.25636622309684753, + "rewards/rejected": -0.30073776841163635, + "step": 2260 + }, + { + "epoch": 0.89, + "learning_rate": 1.8139991456569694e-07, + "logits/chosen": -1.5377174615859985, + "logits/rejected": -0.4445236623287201, + "logps/chosen": -666.4637451171875, + "logps/rejected": -1826.706298828125, + "loss": 0.1407, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04511731117963791, + "rewards/margins": 0.3437032103538513, + "rewards/rejected": -0.3888205587863922, + "step": 2270 + }, + { + "epoch": 0.89, + "learning_rate": 1.6881942648911077e-07, + "logits/chosen": -1.1588428020477295, + "logits/rejected": -0.33162426948547363, + "logps/chosen": -692.0267333984375, + "logps/rejected": -1666.332763671875, + "loss": 0.1417, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08260687440633774, + "rewards/margins": 0.27366960048675537, + "rewards/rejected": -0.3562764525413513, + "step": 2280 + }, + { + "epoch": 0.9, + "learning_rate": 1.5667575940300384e-07, + "logits/chosen": -1.2564775943756104, + "logits/rejected": 0.01690312661230564, + "logps/chosen": -673.8323974609375, + "logps/rejected": -1666.777099609375, + "loss": 0.1716, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0681915134191513, + "rewards/margins": 0.31038275361061096, + "rewards/rejected": -0.3785742521286011, + "step": 2290 + }, + { + "epoch": 0.9, + "learning_rate": 1.449711888105046e-07, + "logits/chosen": -1.518640160560608, + "logits/rejected": -0.6442452669143677, + "logps/chosen": -570.983154296875, + "logps/rejected": -1282.307373046875, + "loss": 0.2478, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0646091103553772, + "rewards/margins": 0.1985008716583252, + "rewards/rejected": -0.2631099820137024, + "step": 2300 + }, + { + "epoch": 0.91, + "learning_rate": 1.3370790793601373e-07, + "logits/chosen": -1.3143935203552246, + "logits/rejected": -0.862291157245636, + "logps/chosen": -554.1536254882812, + "logps/rejected": -1571.299560546875, + "loss": 0.1953, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04557369276881218, + "rewards/margins": 0.30963796377182007, + "rewards/rejected": -0.35521167516708374, + "step": 2310 + }, + { + "epoch": 0.91, + "learning_rate": 1.2288802731423882e-07, + "logits/chosen": -1.0400464534759521, + "logits/rejected": -0.2425573766231537, + "logps/chosen": -563.1722412109375, + "logps/rejected": -1669.356201171875, + "loss": 0.1733, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.032249629497528076, + "rewards/margins": 0.35338449478149414, + "rewards/rejected": -0.38563409447669983, + "step": 2320 + }, + { + "epoch": 0.91, + "learning_rate": 1.125135743947145e-07, + "logits/chosen": -1.392665982246399, + "logits/rejected": -0.2731800079345703, + "logps/chosen": -636.9364013671875, + "logps/rejected": -1649.939453125, + "loss": 0.1732, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05704592913389206, + "rewards/margins": 0.2943916916847229, + "rewards/rejected": -0.35143759846687317, + "step": 2330 + }, + { + "epoch": 0.92, + "learning_rate": 1.0258649316189722e-07, + "logits/chosen": -1.448233723640442, + "logits/rejected": -0.09116245806217194, + "logps/chosen": -595.1334838867188, + "logps/rejected": -1466.4798583984375, + "loss": 0.1563, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.029435504227876663, + "rewards/margins": 0.2879069745540619, + "rewards/rejected": -0.31734246015548706, + "step": 2340 + }, + { + "epoch": 0.92, + "learning_rate": 9.310864377089696e-08, + "logits/chosen": -1.298662543296814, + "logits/rejected": 0.7022291421890259, + "logps/chosen": -692.7830810546875, + "logps/rejected": -1592.398193359375, + "loss": 0.1976, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06722499430179596, + "rewards/margins": 0.26029545068740845, + "rewards/rejected": -0.327520489692688, + "step": 2350 + }, + { + "epoch": 0.93, + "learning_rate": 8.408180219891899e-08, + "logits/chosen": -1.0684707164764404, + "logits/rejected": -0.727800726890564, + "logps/chosen": -588.0787353515625, + "logps/rejected": -1701.8707275390625, + "loss": 0.1205, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.026762153953313828, + "rewards/margins": 0.3723045885562897, + "rewards/rejected": -0.3990667462348938, + "step": 2360 + }, + { + "epoch": 0.93, + "learning_rate": 7.550765991247655e-08, + "logits/chosen": -1.3413885831832886, + "logits/rejected": -0.5428125262260437, + "logps/chosen": -576.0411376953125, + "logps/rejected": -1852.245361328125, + "loss": 0.1253, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.046153269708156586, + "rewards/margins": 0.3955245614051819, + "rewards/rejected": -0.44167786836624146, + "step": 2370 + }, + { + "epoch": 0.93, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -1.3843357563018799, + "logits/rejected": -0.3473323881626129, + "logps/chosen": -653.3553466796875, + "logps/rejected": -1551.5245361328125, + "loss": 0.2134, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07392759621143341, + "rewards/margins": 0.27120834589004517, + "rewards/rejected": -0.3451359272003174, + "step": 2380 + }, + { + "epoch": 0.94, + "learning_rate": 5.972381462298643e-08, + "logits/chosen": -1.446597695350647, + "logits/rejected": -0.7707004547119141, + "logps/chosen": -587.423095703125, + "logps/rejected": -1533.5751953125, + "loss": 0.1378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048751670867204666, + "rewards/margins": 0.2718280255794525, + "rewards/rejected": -0.3205797076225281, + "step": 2390 + }, + { + "epoch": 0.94, + "learning_rate": 5.2517069226488694e-08, + "logits/chosen": -1.3285058736801147, + "logits/rejected": 0.6017956733703613, + "logps/chosen": -635.4934692382812, + "logps/rejected": -1658.809326171875, + "loss": 0.1218, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.054670076817274094, + "rewards/margins": 0.33853715658187866, + "rewards/rejected": -0.39320722222328186, + "step": 2400 + }, + { + "epoch": 0.95, + "learning_rate": 4.576893777442415e-08, + "logits/chosen": -1.455540418624878, + "logits/rejected": -0.42580240964889526, + "logps/chosen": -567.0203857421875, + "logps/rejected": -1439.871826171875, + "loss": 0.1791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04314614459872246, + "rewards/margins": 0.26481324434280396, + "rewards/rejected": -0.3079594075679779, + "step": 2410 + }, + { + "epoch": 0.95, + "learning_rate": 3.9480684744327145e-08, + "logits/chosen": -0.8030007481575012, + "logits/rejected": -0.6547081470489502, + "logps/chosen": -714.3633422851562, + "logps/rejected": -1776.2916259765625, + "loss": 0.135, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.07487257570028305, + "rewards/margins": 0.3637450039386749, + "rewards/rejected": -0.4386175274848938, + "step": 2420 + }, + { + "epoch": 0.95, + "learning_rate": 3.3653488440851255e-08, + "logits/chosen": -1.4338642358779907, + "logits/rejected": -0.2775370478630066, + "logps/chosen": -522.8925170898438, + "logps/rejected": -1403.964599609375, + "loss": 0.1429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.028084104880690575, + "rewards/margins": 0.2945864796638489, + "rewards/rejected": -0.322670578956604, + "step": 2430 + }, + { + "epoch": 0.96, + "learning_rate": 2.82884407749745e-08, + "logits/chosen": -1.548905611038208, + "logits/rejected": -0.1580895483493805, + "logps/chosen": -721.1185913085938, + "logps/rejected": -1815.273681640625, + "loss": 0.168, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.058462172746658325, + "rewards/margins": 0.334224134683609, + "rewards/rejected": -0.39268630743026733, + "step": 2440 + }, + { + "epoch": 0.96, + "learning_rate": 2.3386547059396634e-08, + "logits/chosen": -1.3936518430709839, + "logits/rejected": -0.42037662863731384, + "logps/chosen": -727.2462768554688, + "logps/rejected": -1849.8060302734375, + "loss": 0.1504, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.053724952042102814, + "rewards/margins": 0.34565088152885437, + "rewards/rejected": -0.3993757963180542, + "step": 2450 + }, + { + "epoch": 0.96, + "learning_rate": 1.8948725820160663e-08, + "logits/chosen": -1.5373561382293701, + "logits/rejected": -0.5124548673629761, + "logps/chosen": -707.9256591796875, + "logps/rejected": -1602.8634033203125, + "loss": 0.152, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06985752284526825, + "rewards/margins": 0.3106308579444885, + "rewards/rejected": -0.3804883658885956, + "step": 2460 + }, + { + "epoch": 0.97, + "learning_rate": 1.497580862453829e-08, + "logits/chosen": -1.3587336540222168, + "logits/rejected": 0.12183968722820282, + "logps/chosen": -682.3876342773438, + "logps/rejected": -1501.242431640625, + "loss": 0.179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07504203170537949, + "rewards/margins": 0.25899559259414673, + "rewards/rejected": -0.3340376317501068, + "step": 2470 + }, + { + "epoch": 0.97, + "learning_rate": 1.14685399252093e-08, + "logits/chosen": -1.2843676805496216, + "logits/rejected": -0.37819939851760864, + "logps/chosen": -639.9012451171875, + "logps/rejected": -1667.867431640625, + "loss": 0.1334, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05129992961883545, + "rewards/margins": 0.30902066826820374, + "rewards/rejected": -0.3603206276893616, + "step": 2480 + }, + { + "epoch": 0.98, + "learning_rate": 8.427576920763957e-09, + "logits/chosen": -1.2090356349945068, + "logits/rejected": -0.08205322176218033, + "logps/chosen": -759.0253295898438, + "logps/rejected": -1694.987060546875, + "loss": 0.2816, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13451895117759705, + "rewards/margins": 0.25275808572769165, + "rewards/rejected": -0.3872770071029663, + "step": 2490 + }, + { + "epoch": 0.98, + "learning_rate": 5.853489432556536e-09, + "logits/chosen": -1.5106983184814453, + "logits/rejected": -0.8639631271362305, + "logps/chosen": -654.6641845703125, + "logps/rejected": -1713.3929443359375, + "loss": 0.1831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06343318521976471, + "rewards/margins": 0.31247463822364807, + "rewards/rejected": -0.3759078085422516, + "step": 2500 + }, + { + "epoch": 0.98, + "learning_rate": 3.746759797931265e-09, + "logits/chosen": -1.4619848728179932, + "logits/rejected": 0.3511095643043518, + "logps/chosen": -736.3690795898438, + "logps/rejected": -1626.9466552734375, + "loss": 0.1604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06985476613044739, + "rewards/margins": 0.2860848307609558, + "rewards/rejected": -0.3559395968914032, + "step": 2510 + }, + { + "epoch": 0.99, + "learning_rate": 2.1077827798404728e-09, + "logits/chosen": -1.3730641603469849, + "logits/rejected": -0.6747050881385803, + "logps/chosen": -546.4849853515625, + "logps/rejected": -1590.315673828125, + "loss": 0.1671, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.036424748599529266, + "rewards/margins": 0.34394755959510803, + "rewards/rejected": -0.3803723454475403, + "step": 2520 + }, + { + "epoch": 0.99, + "learning_rate": 9.368654928731958e-10, + "logits/chosen": -1.3955776691436768, + "logits/rejected": -0.6365998983383179, + "logps/chosen": -608.5187377929688, + "logps/rejected": -1592.2816162109375, + "loss": 0.1843, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08722618967294693, + "rewards/margins": 0.30947092175483704, + "rewards/rejected": -0.39669710397720337, + "step": 2530 + }, + { + "epoch": 1.0, + "learning_rate": 2.3422734570816006e-10, + "logits/chosen": -1.4981211423873901, + "logits/rejected": -0.8375118374824524, + "logps/chosen": -656.091796875, + "logps/rejected": -1573.434814453125, + "loss": 0.1836, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05932006239891052, + "rewards/margins": 0.28673693537712097, + "rewards/rejected": -0.3460569679737091, + "step": 2540 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.4274616241455078, + "logits/rejected": 0.46426883339881897, + "logps/chosen": -776.08154296875, + "logps/rejected": -1525.145751953125, + "loss": 0.1924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08221141993999481, + "rewards/margins": 0.24572968482971191, + "rewards/rejected": -0.32794108986854553, + "step": 2550 + }, + { + "epoch": 1.0, + "step": 2550, + "total_flos": 0.0, + "train_loss": 0.19951762257837782, + "train_runtime": 10798.5669, + "train_samples_per_second": 0.945, + "train_steps_per_second": 0.236 + } + ], + "logging_steps": 10, + "max_steps": 2550, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}