{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 9883, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010118385105737125, "grad_norm": 130675.59889267603, "learning_rate": 5.055611729019211e-10, "logits/chosen": -3.332463264465332, "logits/rejected": -3.2607693672180176, "logps/chosen": -13.094183921813965, "logps/rejected": -55.58576583862305, "loss": 44508.2539, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0010118385105737124, "grad_norm": 246684.02039024935, "learning_rate": 5.055611729019211e-09, "logits/chosen": -4.188122749328613, "logits/rejected": -4.114339351654053, "logps/chosen": -72.08418273925781, "logps/rejected": -78.6976318359375, "loss": 103931.7292, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 6.935038254596293e-05, "rewards/margins": 0.00010217401722911745, "rewards/rejected": -3.282363832113333e-05, "step": 10 }, { "epoch": 0.0020236770211474247, "grad_norm": 181931.41954866686, "learning_rate": 1.0111223458038422e-08, "logits/chosen": -4.329117298126221, "logits/rejected": -4.3843793869018555, "logps/chosen": -131.36007690429688, "logps/rejected": -106.38752746582031, "loss": 114895.8125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -4.7947676648618653e-05, "rewards/margins": -0.000132886809296906, "rewards/rejected": 8.493913628626615e-05, "step": 20 }, { "epoch": 0.0030355155317211375, "grad_norm": 185237.75822008488, "learning_rate": 1.5166835187057634e-08, "logits/chosen": -4.288362979888916, "logits/rejected": -4.358180999755859, "logps/chosen": -76.87004852294922, "logps/rejected": -123.9841537475586, "loss": 100438.9438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.8260385331814177e-05, "rewards/margins": 8.6171567090787e-05, "rewards/rejected": -6.791118357796222e-05, "step": 30 }, { "epoch": 0.0040473540422948494, "grad_norm": 125620.1349149526, "learning_rate": 2.0222446916076843e-08, "logits/chosen": -4.082097053527832, "logits/rejected": -4.233819007873535, "logps/chosen": -85.59774017333984, "logps/rejected": -78.10315704345703, "loss": 102958.3438, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 6.667420529993251e-05, "rewards/margins": 9.528406371828169e-05, "rewards/rejected": -2.8609856599359773e-05, "step": 40 }, { "epoch": 0.005059192552868562, "grad_norm": 266895.2239217902, "learning_rate": 2.5278058645096056e-08, "logits/chosen": -4.083619594573975, "logits/rejected": -4.1032609939575195, "logps/chosen": -82.80220031738281, "logps/rejected": -126.37025451660156, "loss": 101648.3438, "rewards/accuracies": 0.25, "rewards/chosen": -5.494149081641808e-05, "rewards/margins": -0.00013328036584425718, "rewards/rejected": 7.833886775188148e-05, "step": 50 }, { "epoch": 0.006071031063442275, "grad_norm": 288426.6117820567, "learning_rate": 3.033367037411527e-08, "logits/chosen": -4.355353355407715, "logits/rejected": -4.348907947540283, "logps/chosen": -134.69390869140625, "logps/rejected": -109.37464904785156, "loss": 129448.625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -8.355580212082714e-05, "rewards/margins": 5.314249938237481e-05, "rewards/rejected": -0.00013669830514118075, "step": 60 }, { "epoch": 0.007082869574015987, "grad_norm": 1445827.3608766322, "learning_rate": 3.538928210313448e-08, "logits/chosen": -3.562352418899536, "logits/rejected": -3.5048129558563232, "logps/chosen": -63.22965621948242, "logps/rejected": -65.2130355834961, "loss": 97678.0625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00013282778672873974, "rewards/margins": 0.00014843029202893376, "rewards/rejected": -1.560251803311985e-05, "step": 70 }, { "epoch": 0.008094708084589699, "grad_norm": 73408.01587929162, "learning_rate": 4.044489383215369e-08, "logits/chosen": -4.808438301086426, "logits/rejected": -4.741355895996094, "logps/chosen": -63.851112365722656, "logps/rejected": -74.86654663085938, "loss": 108547.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.58021484821802e-05, "rewards/margins": 7.164253474911675e-05, "rewards/rejected": -8.744467777432874e-05, "step": 80 }, { "epoch": 0.009106546595163412, "grad_norm": 215833.74613630836, "learning_rate": 4.55005055611729e-08, "logits/chosen": -4.6879987716674805, "logits/rejected": -4.695956230163574, "logps/chosen": -60.84783935546875, "logps/rejected": -79.3995590209961, "loss": 119718.2625, "rewards/accuracies": 0.5, "rewards/chosen": -2.0168552509858273e-05, "rewards/margins": 4.896746395388618e-05, "rewards/rejected": -6.913601100677624e-05, "step": 90 }, { "epoch": 0.010118385105737124, "grad_norm": 276530.95944559335, "learning_rate": 5.055611729019211e-08, "logits/chosen": -4.245387077331543, "logits/rejected": -4.254336357116699, "logps/chosen": -60.32569122314453, "logps/rejected": -67.73658752441406, "loss": 120106.1, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -2.3961827537277713e-05, "rewards/margins": -0.0001041849609464407, "rewards/rejected": 8.022312977118418e-05, "step": 100 }, { "epoch": 0.011130223616310837, "grad_norm": 348785.61390782887, "learning_rate": 5.561172901921132e-08, "logits/chosen": -4.400993824005127, "logits/rejected": -4.2609968185424805, "logps/chosen": -98.20852661132812, "logps/rejected": -109.89472961425781, "loss": 124653.725, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3725609278480988e-05, "rewards/margins": 3.550672408891842e-05, "rewards/rejected": -2.1781113900942728e-05, "step": 110 }, { "epoch": 0.01214206212688455, "grad_norm": 374907.9316991323, "learning_rate": 6.066734074823054e-08, "logits/chosen": -3.707151412963867, "logits/rejected": -3.708599805831909, "logps/chosen": -62.166969299316406, "logps/rejected": -79.69816589355469, "loss": 109043.4125, "rewards/accuracies": 0.5, "rewards/chosen": -1.677275577094406e-05, "rewards/margins": 7.811565592419356e-05, "rewards/rejected": -9.488839714322239e-05, "step": 120 }, { "epoch": 0.013153900637458261, "grad_norm": 260581.7367597193, "learning_rate": 6.572295247724974e-08, "logits/chosen": -3.178565502166748, "logits/rejected": -3.1010868549346924, "logps/chosen": -241.57479858398438, "logps/rejected": -271.00982666015625, "loss": 110479.2125, "rewards/accuracies": 0.5, "rewards/chosen": 3.957765784434741e-06, "rewards/margins": -3.187589027220383e-05, "rewards/rejected": 3.5833651054417714e-05, "step": 130 }, { "epoch": 0.014165739148031974, "grad_norm": 68376.30320387064, "learning_rate": 7.077856420626896e-08, "logits/chosen": -4.180408477783203, "logits/rejected": -4.4351911544799805, "logps/chosen": -112.57125091552734, "logps/rejected": -109.1659927368164, "loss": 132588.8125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 4.777870708494447e-05, "rewards/margins": 6.480945012299344e-05, "rewards/rejected": -1.7030721210176125e-05, "step": 140 }, { "epoch": 0.015177577658605687, "grad_norm": 160133.59015572155, "learning_rate": 7.583417593528817e-08, "logits/chosen": -4.1573591232299805, "logits/rejected": -4.044037818908691, "logps/chosen": -77.17411804199219, "logps/rejected": -98.65723419189453, "loss": 126298.125, "rewards/accuracies": 0.5, "rewards/chosen": -8.898177475202829e-05, "rewards/margins": 8.25377501314506e-05, "rewards/rejected": -0.0001715195394353941, "step": 150 }, { "epoch": 0.016189416169179398, "grad_norm": 348627.9031050746, "learning_rate": 8.088978766430737e-08, "logits/chosen": -4.038428783416748, "logits/rejected": -3.8097851276397705, "logps/chosen": -119.74607849121094, "logps/rejected": -133.05685424804688, "loss": 115321.45, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 4.590778917190619e-05, "rewards/margins": -3.3422925298509654e-06, "rewards/rejected": 4.925008397549391e-05, "step": 160 }, { "epoch": 0.01720125467975311, "grad_norm": 250266.4152638944, "learning_rate": 8.594539939332659e-08, "logits/chosen": -4.0218329429626465, "logits/rejected": -4.214414119720459, "logps/chosen": -83.14879608154297, "logps/rejected": -79.21330261230469, "loss": 102231.7125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 9.309604138252325e-06, "rewards/margins": 1.933842577273026e-05, "rewards/rejected": -1.0028795259131584e-05, "step": 170 }, { "epoch": 0.018213093190326823, "grad_norm": 165872.78730164474, "learning_rate": 9.10010111223458e-08, "logits/chosen": -4.20759391784668, "logits/rejected": -4.046136856079102, "logps/chosen": -56.49616241455078, "logps/rejected": -68.46202087402344, "loss": 113184.175, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -3.246061896788888e-05, "rewards/margins": 5.398775329013006e-07, "rewards/rejected": -3.300049502286129e-05, "step": 180 }, { "epoch": 0.019224931700900536, "grad_norm": 171301.05024547508, "learning_rate": 9.605662285136501e-08, "logits/chosen": -4.20707893371582, "logits/rejected": -4.128954887390137, "logps/chosen": -66.51692962646484, "logps/rejected": -97.65692901611328, "loss": 126858.075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -5.221440369496122e-05, "rewards/margins": -0.00011791993165388703, "rewards/rejected": 6.57055206829682e-05, "step": 190 }, { "epoch": 0.02023677021147425, "grad_norm": 236198.98121327988, "learning_rate": 1.0111223458038422e-07, "logits/chosen": -4.207874774932861, "logits/rejected": -4.218282222747803, "logps/chosen": -131.6715545654297, "logps/rejected": -128.51194763183594, "loss": 124119.5125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -5.316782699082978e-05, "rewards/margins": -8.030439494177699e-05, "rewards/rejected": 2.7136566131957807e-05, "step": 200 }, { "epoch": 0.021248608722047962, "grad_norm": 111177.68425950008, "learning_rate": 1.0616784630940344e-07, "logits/chosen": -4.391724109649658, "logits/rejected": -4.5016770362854, "logps/chosen": -84.17726135253906, "logps/rejected": -89.42311096191406, "loss": 118269.0625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -2.6094878194271587e-05, "rewards/margins": -7.128424385882681e-06, "rewards/rejected": -1.896645153465215e-05, "step": 210 }, { "epoch": 0.022260447232621675, "grad_norm": 205759.18272204694, "learning_rate": 1.1122345803842264e-07, "logits/chosen": -4.2671918869018555, "logits/rejected": -4.235365867614746, "logps/chosen": -73.93778991699219, "logps/rejected": -90.3472900390625, "loss": 130410.3375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00010480267519596964, "rewards/margins": -1.958228494913783e-05, "rewards/rejected": -8.52203811518848e-05, "step": 220 }, { "epoch": 0.023272285743195387, "grad_norm": 68155.04310495066, "learning_rate": 1.1627906976744186e-07, "logits/chosen": -4.059153079986572, "logits/rejected": -3.9588394165039062, "logps/chosen": -50.34394454956055, "logps/rejected": -58.03104782104492, "loss": 113785.4625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 5.274124487186782e-05, "rewards/margins": 0.00013244811270851642, "rewards/rejected": -7.970685692271218e-05, "step": 230 }, { "epoch": 0.0242841242537691, "grad_norm": 313831.78802323464, "learning_rate": 1.2133468149646107e-07, "logits/chosen": -4.180145263671875, "logits/rejected": -4.019026756286621, "logps/chosen": -72.72502136230469, "logps/rejected": -93.2811050415039, "loss": 99971.5437, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.7753208996728063e-05, "rewards/margins": 8.947304741013795e-05, "rewards/rejected": -6.171983841340989e-05, "step": 240 }, { "epoch": 0.02529596276434281, "grad_norm": 400849.8030837691, "learning_rate": 1.263902932254803e-07, "logits/chosen": -4.0756449699401855, "logits/rejected": -4.023280143737793, "logps/chosen": -108.12876892089844, "logps/rejected": -156.34463500976562, "loss": 122431.65, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -6.876101542729884e-05, "rewards/margins": -5.784331733593717e-05, "rewards/rejected": -1.0917714462266304e-05, "step": 250 }, { "epoch": 0.026307801274916522, "grad_norm": 3090.0667697755316, "learning_rate": 1.3144590495449948e-07, "logits/chosen": -4.172642707824707, "logits/rejected": -4.273802280426025, "logps/chosen": -110.00355529785156, "logps/rejected": -122.1195068359375, "loss": 109109.8625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.058090704958886e-05, "rewards/margins": 1.1655441085167695e-05, "rewards/rejected": 5.892547051189467e-05, "step": 260 }, { "epoch": 0.027319639785490235, "grad_norm": 203286.8689302739, "learning_rate": 1.365015166835187e-07, "logits/chosen": -4.383858680725098, "logits/rejected": -4.393070220947266, "logps/chosen": -62.46295928955078, "logps/rejected": -67.74301147460938, "loss": 110608.2875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.4111870516208e-05, "rewards/margins": 4.255682142684236e-05, "rewards/rejected": -6.666869012406096e-05, "step": 270 }, { "epoch": 0.028331478296063948, "grad_norm": 219239.62812444416, "learning_rate": 1.415571284125379e-07, "logits/chosen": -4.062226295471191, "logits/rejected": -3.8921420574188232, "logps/chosen": -80.3340835571289, "logps/rejected": -73.1257553100586, "loss": 123251.275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3757085071119945e-05, "rewards/margins": 3.593442670535296e-05, "rewards/rejected": -2.217734800069593e-05, "step": 280 }, { "epoch": 0.02934331680663766, "grad_norm": 192855.42106606776, "learning_rate": 1.466127401415571e-07, "logits/chosen": -3.927004337310791, "logits/rejected": -3.965533494949341, "logps/chosen": -80.45733642578125, "logps/rejected": -106.53145599365234, "loss": 127830.7875, "rewards/accuracies": 0.5, "rewards/chosen": -6.10698334639892e-05, "rewards/margins": -0.00016501438221894205, "rewards/rejected": 0.00010394454147899523, "step": 290 }, { "epoch": 0.030355155317211373, "grad_norm": 263410.0953273921, "learning_rate": 1.5166835187057634e-07, "logits/chosen": -4.216609001159668, "logits/rejected": -4.230715751647949, "logps/chosen": -104.42545318603516, "logps/rejected": -107.36568450927734, "loss": 128553.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.75340174388839e-06, "rewards/margins": 7.252732757478952e-05, "rewards/rejected": -8.128073386615142e-05, "step": 300 }, { "epoch": 0.031366993827785086, "grad_norm": 162347.33685246325, "learning_rate": 1.5672396359959556e-07, "logits/chosen": -4.288687229156494, "logits/rejected": -4.24789571762085, "logps/chosen": -103.89166259765625, "logps/rejected": -108.08259582519531, "loss": 122233.8875, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -7.75146036176011e-05, "rewards/margins": -4.761310628964566e-05, "rewards/rejected": -2.990149732795544e-05, "step": 310 }, { "epoch": 0.032378832338358796, "grad_norm": 207459.07800562834, "learning_rate": 1.6177957532861475e-07, "logits/chosen": -3.9720618724823, "logits/rejected": -4.004500389099121, "logps/chosen": -62.90068817138672, "logps/rejected": -82.53974914550781, "loss": 108088.75, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00020353223953861743, "rewards/margins": -0.0001431352720828727, "rewards/rejected": -6.039698564563878e-05, "step": 320 }, { "epoch": 0.03339067084893251, "grad_norm": 197972.05687974067, "learning_rate": 1.6683518705763396e-07, "logits/chosen": -4.002933502197266, "logits/rejected": -3.7585086822509766, "logps/chosen": -77.411376953125, "logps/rejected": -108.38993835449219, "loss": 137764.575, "rewards/accuracies": 0.5, "rewards/chosen": -8.893576159607619e-05, "rewards/margins": -2.0317453163443133e-05, "rewards/rejected": -6.861830479465425e-05, "step": 330 }, { "epoch": 0.03440250935950622, "grad_norm": 191340.3639310882, "learning_rate": 1.7189079878665318e-07, "logits/chosen": -4.003809928894043, "logits/rejected": -3.859312057495117, "logps/chosen": -54.80177688598633, "logps/rejected": -75.59323120117188, "loss": 131509.125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.985093957860954e-05, "rewards/margins": -3.8393918657675385e-05, "rewards/rejected": 9.82448473223485e-05, "step": 340 }, { "epoch": 0.03541434787007994, "grad_norm": 146312.11556025446, "learning_rate": 1.769464105156724e-07, "logits/chosen": -3.906332492828369, "logits/rejected": -4.063315391540527, "logps/chosen": -79.22567749023438, "logps/rejected": -119.48348236083984, "loss": 123989.7125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.4007726096897386e-05, "rewards/margins": 6.328203198791016e-06, "rewards/rejected": -3.033592292922549e-05, "step": 350 }, { "epoch": 0.03642618638065365, "grad_norm": 391674.40525206213, "learning_rate": 1.820020222446916e-07, "logits/chosen": -4.456021785736084, "logits/rejected": -4.524321556091309, "logps/chosen": -83.03251647949219, "logps/rejected": -99.01311492919922, "loss": 119734.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 7.101938535925001e-05, "rewards/margins": 0.00010033122089225799, "rewards/rejected": -2.931182098109275e-05, "step": 360 }, { "epoch": 0.03743802489122736, "grad_norm": 411594.147692985, "learning_rate": 1.870576339737108e-07, "logits/chosen": -4.200623512268066, "logits/rejected": -4.043582916259766, "logps/chosen": -77.21430206298828, "logps/rejected": -120.98372650146484, "loss": 113076.4125, "rewards/accuracies": 0.5, "rewards/chosen": 5.871327448403463e-05, "rewards/margins": 0.0001382279151584953, "rewards/rejected": -7.95146479504183e-05, "step": 370 }, { "epoch": 0.03844986340180107, "grad_norm": 158512.1758764559, "learning_rate": 1.9211324570273002e-07, "logits/chosen": -4.352120399475098, "logits/rejected": -4.428531646728516, "logps/chosen": -105.45347595214844, "logps/rejected": -107.72145080566406, "loss": 117179.0625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1454385457909666e-05, "rewards/margins": -7.518515485571697e-05, "rewards/rejected": 8.663953485665843e-05, "step": 380 }, { "epoch": 0.03946170191237478, "grad_norm": 267795.17101275804, "learning_rate": 1.9716885743174923e-07, "logits/chosen": -4.25710391998291, "logits/rejected": -4.207553863525391, "logps/chosen": -93.67794036865234, "logps/rejected": -102.38844299316406, "loss": 107432.3375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3614962881547399e-05, "rewards/margins": 3.587862738640979e-05, "rewards/rejected": -2.2263668142841198e-05, "step": 390 }, { "epoch": 0.0404735404229485, "grad_norm": 367229.9998835191, "learning_rate": 2.0222446916076845e-07, "logits/chosen": -4.232907772064209, "logits/rejected": -4.205918312072754, "logps/chosen": -101.13680267333984, "logps/rejected": -111.1734848022461, "loss": 119572.875, "rewards/accuracies": 0.5, "rewards/chosen": 3.490962990326807e-05, "rewards/margins": 0.00014453369658440351, "rewards/rejected": -0.0001096240448532626, "step": 400 }, { "epoch": 0.04148537893352221, "grad_norm": 222044.09324011495, "learning_rate": 2.0728008088978766e-07, "logits/chosen": -3.795947313308716, "logits/rejected": -3.8180782794952393, "logps/chosen": -43.13126754760742, "logps/rejected": -53.095741271972656, "loss": 113554.6625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -9.573295392328873e-05, "rewards/margins": -7.306464249268174e-05, "rewards/rejected": -2.2668318706564605e-05, "step": 410 }, { "epoch": 0.042497217444095924, "grad_norm": 265849.48894903413, "learning_rate": 2.1233569261880688e-07, "logits/chosen": -4.235118865966797, "logits/rejected": -4.1318793296813965, "logps/chosen": -77.5746078491211, "logps/rejected": -92.60408020019531, "loss": 113236.825, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00016302018775604665, "rewards/margins": -0.00011850716691697016, "rewards/rejected": -4.4513039028970525e-05, "step": 420 }, { "epoch": 0.04350905595466963, "grad_norm": 241998.8771684406, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -4.27359676361084, "logits/rejected": -4.3082733154296875, "logps/chosen": -55.873291015625, "logps/rejected": -55.68131637573242, "loss": 111678.3, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -4.9143236537929624e-05, "rewards/margins": 4.143899332120782e-06, "rewards/rejected": -5.328714178176597e-05, "step": 430 }, { "epoch": 0.04452089446524335, "grad_norm": 265462.07466423576, "learning_rate": 2.2244691607684528e-07, "logits/chosen": -4.060147762298584, "logits/rejected": -4.102156162261963, "logps/chosen": -64.58871459960938, "logps/rejected": -93.20319366455078, "loss": 102235.6, "rewards/accuracies": 0.75, "rewards/chosen": 0.00011470810568425804, "rewards/margins": 0.00016805611085146666, "rewards/rejected": -5.3348001529229805e-05, "step": 440 }, { "epoch": 0.04553273297581706, "grad_norm": 250145.93075953092, "learning_rate": 2.2750252780586447e-07, "logits/chosen": -4.094784259796143, "logits/rejected": -4.056227207183838, "logps/chosen": -67.34135437011719, "logps/rejected": -98.86811065673828, "loss": 122771.9, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.008484472071359e-07, "rewards/margins": 0.0001057122353813611, "rewards/rejected": -0.00010551139712333679, "step": 450 }, { "epoch": 0.046544571486390775, "grad_norm": 299147.9614541462, "learning_rate": 2.3255813953488372e-07, "logits/chosen": -4.029704570770264, "logits/rejected": -3.9427123069763184, "logps/chosen": -72.84172821044922, "logps/rejected": -97.85685729980469, "loss": 133632.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.591473821667023e-05, "rewards/margins": 0.0002320517087355256, "rewards/rejected": -0.0002061369887087494, "step": 460 }, { "epoch": 0.047556409996964484, "grad_norm": 282823.2974999728, "learning_rate": 2.3761375126390293e-07, "logits/chosen": -4.090446472167969, "logits/rejected": -4.085078239440918, "logps/chosen": -111.3682632446289, "logps/rejected": -131.857421875, "loss": 128675.0, "rewards/accuracies": 0.75, "rewards/chosen": 1.3980845324113034e-05, "rewards/margins": 7.680008275201544e-05, "rewards/rejected": -6.2819235608913e-05, "step": 470 }, { "epoch": 0.0485682485075382, "grad_norm": 318894.469264257, "learning_rate": 2.4266936299292215e-07, "logits/chosen": -4.48383903503418, "logits/rejected": -4.533525466918945, "logps/chosen": -58.193763732910156, "logps/rejected": -69.75321197509766, "loss": 133138.9375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2398764738463797e-05, "rewards/margins": 0.0001332695537712425, "rewards/rejected": -0.0001208707908517681, "step": 480 }, { "epoch": 0.04958008701811191, "grad_norm": 204447.34682806593, "learning_rate": 2.4772497472194136e-07, "logits/chosen": -4.168319225311279, "logits/rejected": -4.095081806182861, "logps/chosen": -111.54915618896484, "logps/rejected": -115.64688873291016, "loss": 114209.1875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.983090209658258e-05, "rewards/margins": 0.00011614644608926028, "rewards/rejected": -7.631555490661412e-05, "step": 490 }, { "epoch": 0.05059192552868562, "grad_norm": 247564.7497283286, "learning_rate": 2.527805864509606e-07, "logits/chosen": -4.436467170715332, "logits/rejected": -4.406083583831787, "logps/chosen": -78.80534362792969, "logps/rejected": -85.81243133544922, "loss": 109709.125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00011426885612308979, "rewards/margins": -8.780457574175671e-05, "rewards/rejected": -2.6464296752237715e-05, "step": 500 }, { "epoch": 0.051603764039259335, "grad_norm": 327256.8364514171, "learning_rate": 2.5783619817997974e-07, "logits/chosen": -4.199801445007324, "logits/rejected": -4.204330921173096, "logps/chosen": -105.13301086425781, "logps/rejected": -88.58314514160156, "loss": 118763.45, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00013961232616566122, "rewards/margins": -5.879976015421562e-05, "rewards/rejected": -8.08125696494244e-05, "step": 510 }, { "epoch": 0.052615602549833045, "grad_norm": 198989.22051740385, "learning_rate": 2.6289180990899896e-07, "logits/chosen": -3.8059699535369873, "logits/rejected": -3.8173305988311768, "logps/chosen": -43.953651428222656, "logps/rejected": -95.66313171386719, "loss": 117801.525, "rewards/accuracies": 0.5, "rewards/chosen": -0.00012080957822035998, "rewards/margins": 5.709574179491028e-05, "rewards/rejected": -0.00017790532729122788, "step": 520 }, { "epoch": 0.05362744106040676, "grad_norm": 344594.12242389756, "learning_rate": 2.679474216380182e-07, "logits/chosen": -3.735436201095581, "logits/rejected": -3.808124542236328, "logps/chosen": -99.54512023925781, "logps/rejected": -108.14998626708984, "loss": 138225.9375, "rewards/accuracies": 0.5, "rewards/chosen": -3.086147262365557e-05, "rewards/margins": 4.7710705985082313e-05, "rewards/rejected": -7.857219316065311e-05, "step": 530 }, { "epoch": 0.05463927957098047, "grad_norm": 227961.48501828575, "learning_rate": 2.730030333670374e-07, "logits/chosen": -4.092320442199707, "logits/rejected": -3.997737407684326, "logps/chosen": -45.971153259277344, "logps/rejected": -99.94884490966797, "loss": 109650.125, "rewards/accuracies": 0.75, "rewards/chosen": -7.710925274295732e-05, "rewards/margins": 0.00029912195168435574, "rewards/rejected": -0.0003762312117032707, "step": 540 }, { "epoch": 0.055651118081554186, "grad_norm": 234928.12498469127, "learning_rate": 2.780586450960566e-07, "logits/chosen": -4.362364292144775, "logits/rejected": -4.342806339263916, "logps/chosen": -87.87278747558594, "logps/rejected": -87.86195373535156, "loss": 120884.7125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0001846780360210687, "rewards/margins": 6.11522700637579e-05, "rewards/rejected": -0.00024583033518865705, "step": 550 }, { "epoch": 0.056662956592127896, "grad_norm": 132025.25379995382, "learning_rate": 2.831142568250758e-07, "logits/chosen": -3.955767869949341, "logits/rejected": -4.1907639503479, "logps/chosen": -91.56539154052734, "logps/rejected": -101.23915100097656, "loss": 131657.3125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00012422776489984244, "rewards/margins": -2.5379082217114046e-05, "rewards/rejected": -9.884869359666482e-05, "step": 560 }, { "epoch": 0.05767479510270161, "grad_norm": 286671.2873833456, "learning_rate": 2.8816986855409504e-07, "logits/chosen": -4.035500526428223, "logits/rejected": -4.204102993011475, "logps/chosen": -76.91935729980469, "logps/rejected": -76.7132568359375, "loss": 114683.3875, "rewards/accuracies": 0.5, "rewards/chosen": -9.585612133378163e-05, "rewards/margins": 3.435999678913504e-05, "rewards/rejected": -0.00013021611084695905, "step": 570 }, { "epoch": 0.05868663361327532, "grad_norm": 376195.7227128406, "learning_rate": 2.932254802831142e-07, "logits/chosen": -3.989431381225586, "logits/rejected": -3.9996447563171387, "logps/chosen": -78.96307373046875, "logps/rejected": -91.03147888183594, "loss": 114766.9375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00028401362942531705, "rewards/margins": -8.376798177778255e-06, "rewards/rejected": -0.0002756367903202772, "step": 580 }, { "epoch": 0.05969847212384903, "grad_norm": 285783.4867723129, "learning_rate": 2.9828109201213347e-07, "logits/chosen": -4.225393295288086, "logits/rejected": -4.16351842880249, "logps/chosen": -130.45774841308594, "logps/rejected": -147.64181518554688, "loss": 124768.55, "rewards/accuracies": 0.5, "rewards/chosen": -0.0003105011419393122, "rewards/margins": 2.2350413928506896e-05, "rewards/rejected": -0.00033285151585005224, "step": 590 }, { "epoch": 0.06071031063442275, "grad_norm": 59296.889768400906, "learning_rate": 3.033367037411527e-07, "logits/chosen": -4.0909295082092285, "logits/rejected": -3.9738807678222656, "logps/chosen": -68.32615661621094, "logps/rejected": -87.63771057128906, "loss": 120852.1625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00018324334814678878, "rewards/margins": 2.723810939642135e-05, "rewards/rejected": -0.00021048146300017834, "step": 600 }, { "epoch": 0.061722149144996456, "grad_norm": 206471.645394112, "learning_rate": 3.0839231547017185e-07, "logits/chosen": -4.498924255371094, "logits/rejected": -4.461515426635742, "logps/chosen": -77.2944564819336, "logps/rejected": -97.91581726074219, "loss": 107941.15, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0002564065798651427, "rewards/margins": 3.958632441936061e-05, "rewards/rejected": -0.00029599288245663047, "step": 610 }, { "epoch": 0.06273398765557017, "grad_norm": 160680.42033539797, "learning_rate": 3.134479271991911e-07, "logits/chosen": -3.8811497688293457, "logits/rejected": -3.8669190406799316, "logps/chosen": -66.71482849121094, "logps/rejected": -114.29515075683594, "loss": 112053.2625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0002459154056850821, "rewards/margins": 0.00019958156917709857, "rewards/rejected": -0.0004454969894140959, "step": 620 }, { "epoch": 0.06374582616614388, "grad_norm": 279078.84447787574, "learning_rate": 3.185035389282103e-07, "logits/chosen": -4.32260799407959, "logits/rejected": -4.4003167152404785, "logps/chosen": -92.05845642089844, "logps/rejected": -85.54139709472656, "loss": 107647.475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00017451116582378745, "rewards/margins": 0.00019710566266439855, "rewards/rejected": -0.00037161679938435555, "step": 630 }, { "epoch": 0.06475766467671759, "grad_norm": 306927.8991793007, "learning_rate": 3.235591506572295e-07, "logits/chosen": -3.9913978576660156, "logits/rejected": -3.8945717811584473, "logps/chosen": -221.60122680664062, "logps/rejected": -248.7289581298828, "loss": 112767.6125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00014936232764739543, "rewards/margins": 0.00027651485288515687, "rewards/rejected": -0.00042587719508446753, "step": 640 }, { "epoch": 0.06576950318729131, "grad_norm": 209851.04084178555, "learning_rate": 3.2861476238624876e-07, "logits/chosen": -4.10396671295166, "logits/rejected": -4.40726375579834, "logps/chosen": -52.275718688964844, "logps/rejected": -81.21295166015625, "loss": 125850.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0002986097242683172, "rewards/margins": 0.00010185830615228042, "rewards/rejected": -0.0004004680085927248, "step": 650 }, { "epoch": 0.06678134169786502, "grad_norm": 272775.7170894341, "learning_rate": 3.336703741152679e-07, "logits/chosen": -4.294694423675537, "logits/rejected": -4.437451362609863, "logps/chosen": -88.2703857421875, "logps/rejected": -83.26488494873047, "loss": 110147.5625, "rewards/accuracies": 0.5, "rewards/chosen": -0.00032427330734208226, "rewards/margins": 3.0951810913393274e-05, "rewards/rejected": -0.00035522511461749673, "step": 660 }, { "epoch": 0.06779318020843873, "grad_norm": 209295.66202698936, "learning_rate": 3.3872598584428714e-07, "logits/chosen": -4.617118835449219, "logits/rejected": -4.332075595855713, "logps/chosen": -95.84889221191406, "logps/rejected": -113.36787414550781, "loss": 135496.4125, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0005084475851617754, "rewards/margins": -1.6850308384164236e-05, "rewards/rejected": -0.0004915973404422402, "step": 670 }, { "epoch": 0.06880501871901244, "grad_norm": 308013.6066852334, "learning_rate": 3.4378159757330636e-07, "logits/chosen": -3.8772666454315186, "logits/rejected": -3.9374232292175293, "logps/chosen": -104.16230773925781, "logps/rejected": -130.36439514160156, "loss": 114386.1125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003361152485013008, "rewards/margins": 0.00013667097664438188, "rewards/rejected": -0.0004727862833533436, "step": 680 }, { "epoch": 0.06981685722958615, "grad_norm": 375028.5905462513, "learning_rate": 3.4883720930232557e-07, "logits/chosen": -4.191579341888428, "logits/rejected": -4.041550636291504, "logps/chosen": -76.59661865234375, "logps/rejected": -81.14154052734375, "loss": 110808.5625, "rewards/accuracies": 0.25, "rewards/chosen": -0.00032104557612910867, "rewards/margins": -3.4021872124867514e-05, "rewards/rejected": -0.00028702375129796565, "step": 690 }, { "epoch": 0.07082869574015987, "grad_norm": 251796.81385128747, "learning_rate": 3.538928210313448e-07, "logits/chosen": -4.171774864196777, "logits/rejected": -4.2111711502075195, "logps/chosen": -65.45740509033203, "logps/rejected": -84.54772186279297, "loss": 126839.7375, "rewards/accuracies": 0.5, "rewards/chosen": -0.0003757965750992298, "rewards/margins": -1.1553353033377789e-05, "rewards/rejected": -0.00036424322752282023, "step": 700 }, { "epoch": 0.07184053425073358, "grad_norm": 266693.8909757348, "learning_rate": 3.5894843276036395e-07, "logits/chosen": -3.841721773147583, "logits/rejected": -3.7526583671569824, "logps/chosen": -82.08513641357422, "logps/rejected": -120.21766662597656, "loss": 112644.2625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00047400445328094065, "rewards/margins": 0.0002652026014402509, "rewards/rejected": -0.0007392070256173611, "step": 710 }, { "epoch": 0.0728523727613073, "grad_norm": 433138.75749086303, "learning_rate": 3.640040444893832e-07, "logits/chosen": -3.947869062423706, "logits/rejected": -3.7925686836242676, "logps/chosen": -89.23374938964844, "logps/rejected": -96.95865631103516, "loss": 127356.025, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0004088584682904184, "rewards/margins": -0.00012834268272854388, "rewards/rejected": -0.00028051575645804405, "step": 720 }, { "epoch": 0.073864211271881, "grad_norm": 393126.80254603014, "learning_rate": 3.690596562184024e-07, "logits/chosen": -4.451858997344971, "logits/rejected": -4.468457221984863, "logps/chosen": -112.4037857055664, "logps/rejected": -101.23915100097656, "loss": 104478.8938, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0004056698235217482, "rewards/margins": -1.3732968000113033e-05, "rewards/rejected": -0.00039193680277094245, "step": 730 }, { "epoch": 0.07487604978245473, "grad_norm": 182491.5916781474, "learning_rate": 3.741152679474216e-07, "logits/chosen": -4.439207077026367, "logits/rejected": -4.437518119812012, "logps/chosen": -47.778038024902344, "logps/rejected": -60.95914840698242, "loss": 107928.35, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00025101759820245206, "rewards/margins": 7.540622027590871e-05, "rewards/rejected": -0.00032642381847836077, "step": 740 }, { "epoch": 0.07588788829302844, "grad_norm": 12.987326060962737, "learning_rate": 3.7917087967644087e-07, "logits/chosen": -4.487083435058594, "logits/rejected": -4.534282684326172, "logps/chosen": -91.82740783691406, "logps/rejected": -106.0748291015625, "loss": 127407.0625, "rewards/accuracies": 0.5, "rewards/chosen": -0.00031008836231194437, "rewards/margins": 9.193028381559998e-05, "rewards/rejected": -0.00040201866067945957, "step": 750 }, { "epoch": 0.07689972680360214, "grad_norm": 310382.2685628708, "learning_rate": 3.8422649140546003e-07, "logits/chosen": -4.359978675842285, "logits/rejected": -4.260481834411621, "logps/chosen": -86.91165924072266, "logps/rejected": -126.35318756103516, "loss": 127596.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00011125938181066886, "rewards/margins": 0.00020849080465268344, "rewards/rejected": -0.00031975016463547945, "step": 760 }, { "epoch": 0.07791156531417585, "grad_norm": 226916.46225030549, "learning_rate": 3.8928210313447925e-07, "logits/chosen": -4.408380031585693, "logits/rejected": -4.429563045501709, "logps/chosen": -102.02629089355469, "logps/rejected": -125.5072021484375, "loss": 138177.5375, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0005772575968876481, "rewards/margins": 0.00030261004576459527, "rewards/rejected": -0.0008798675844445825, "step": 770 }, { "epoch": 0.07892340382474956, "grad_norm": 379886.4401032071, "learning_rate": 3.9433771486349846e-07, "logits/chosen": -4.098866939544678, "logits/rejected": -4.200720310211182, "logps/chosen": -137.054443359375, "logps/rejected": -145.7952880859375, "loss": 106410.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0004144218401052058, "rewards/margins": 3.9120553992688656e-05, "rewards/rejected": -0.00045354239409789443, "step": 780 }, { "epoch": 0.07993524233532329, "grad_norm": 167363.37898726985, "learning_rate": 3.993933265925177e-07, "logits/chosen": -4.247289180755615, "logits/rejected": -4.150348663330078, "logps/chosen": -75.54122924804688, "logps/rejected": -102.0378646850586, "loss": 127640.35, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005550560890696943, "rewards/margins": -5.9023032008553855e-06, "rewards/rejected": -0.0005491537740454078, "step": 790 }, { "epoch": 0.080947080845897, "grad_norm": 300594.436850355, "learning_rate": 4.044489383215369e-07, "logits/chosen": -3.6476833820343018, "logits/rejected": -3.6540863513946533, "logps/chosen": -460.6437072753906, "logps/rejected": -476.91412353515625, "loss": 110343.4875, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00038041570223867893, "rewards/margins": 9.091954416362569e-05, "rewards/rejected": -0.0004713352245744318, "step": 800 }, { "epoch": 0.0819589193564707, "grad_norm": 224399.24976094803, "learning_rate": 4.095045500505561e-07, "logits/chosen": -4.06188440322876, "logits/rejected": -4.031253814697266, "logps/chosen": -90.99593353271484, "logps/rejected": -102.22840881347656, "loss": 118085.05, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00047040978097356856, "rewards/margins": 4.811291091755265e-06, "rewards/rejected": -0.0004752210224978626, "step": 810 }, { "epoch": 0.08297075786704441, "grad_norm": 179005.70650462346, "learning_rate": 4.145601617795753e-07, "logits/chosen": -3.972946882247925, "logits/rejected": -4.114864349365234, "logps/chosen": -80.93687438964844, "logps/rejected": -109.09355163574219, "loss": 124951.3875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005539607373066247, "rewards/margins": 7.179776730481535e-05, "rewards/rejected": -0.0006257584900595248, "step": 820 }, { "epoch": 0.08398259637761814, "grad_norm": 170213.2953033029, "learning_rate": 4.196157735085945e-07, "logits/chosen": -4.35772705078125, "logits/rejected": -4.332313060760498, "logps/chosen": -53.49726486206055, "logps/rejected": -64.27909851074219, "loss": 111748.025, "rewards/accuracies": 0.75, "rewards/chosen": -0.00029984163120388985, "rewards/margins": 0.0002470030158292502, "rewards/rejected": -0.0005468447343446314, "step": 830 }, { "epoch": 0.08499443488819185, "grad_norm": 250076.34834013722, "learning_rate": 4.2467138523761376e-07, "logits/chosen": -4.2786478996276855, "logits/rejected": -4.246649742126465, "logps/chosen": -79.6366958618164, "logps/rejected": -66.25493621826172, "loss": 123805.4375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0005498457467183471, "rewards/margins": 0.00014871449093334377, "rewards/rejected": -0.0006985602667555213, "step": 840 }, { "epoch": 0.08600627339876556, "grad_norm": 279093.0971446919, "learning_rate": 4.2972699696663297e-07, "logits/chosen": -3.3573520183563232, "logits/rejected": -3.3823413848876953, "logps/chosen": -98.96321868896484, "logps/rejected": -93.07720184326172, "loss": 100464.0125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0003005825565196574, "rewards/margins": 0.00013041349302511662, "rewards/rejected": -0.0004309960058890283, "step": 850 }, { "epoch": 0.08701811190933927, "grad_norm": 213700.5084776966, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -3.815347194671631, "logits/rejected": -3.757537841796875, "logps/chosen": -105.80558013916016, "logps/rejected": -90.20960998535156, "loss": 110706.325, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00048279273323714733, "rewards/margins": 7.939124770928174e-05, "rewards/rejected": -0.0005621839663945138, "step": 860 }, { "epoch": 0.08802995041991298, "grad_norm": 273471.33535714017, "learning_rate": 4.398382204246714e-07, "logits/chosen": -3.721285343170166, "logits/rejected": -3.528754711151123, "logps/chosen": -99.2734375, "logps/rejected": -110.80770111083984, "loss": 112672.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0005123598966747522, "rewards/margins": 0.0001581446995260194, "rewards/rejected": -0.0006705046398565173, "step": 870 }, { "epoch": 0.0890417889304867, "grad_norm": 188312.04680058546, "learning_rate": 4.4489383215369057e-07, "logits/chosen": -3.996007204055786, "logits/rejected": -4.096622467041016, "logps/chosen": -47.138328552246094, "logps/rejected": -67.0001449584961, "loss": 114993.2625, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004639817925635725, "rewards/margins": 4.699070268543437e-05, "rewards/rejected": -0.0005109725170768797, "step": 880 }, { "epoch": 0.09005362744106041, "grad_norm": 254507.5901441956, "learning_rate": 4.499494438827098e-07, "logits/chosen": -3.657151699066162, "logits/rejected": -3.575319766998291, "logps/chosen": -69.49859619140625, "logps/rejected": -89.43305969238281, "loss": 114915.425, "rewards/accuracies": 0.75, "rewards/chosen": -0.00047133973566815257, "rewards/margins": 0.00014643651957158, "rewards/rejected": -0.0006177762988954782, "step": 890 }, { "epoch": 0.09106546595163412, "grad_norm": 85341.22143177563, "learning_rate": 4.5500505561172895e-07, "logits/chosen": -4.018014430999756, "logits/rejected": -4.1633524894714355, "logps/chosen": -78.35000610351562, "logps/rejected": -74.81382751464844, "loss": 114435.325, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.000522762187756598, "rewards/margins": -5.5874021199997514e-05, "rewards/rejected": -0.0004668882174883038, "step": 900 }, { "epoch": 0.09207730446220783, "grad_norm": 374282.74457530444, "learning_rate": 4.600606673407482e-07, "logits/chosen": -4.08697509765625, "logits/rejected": -4.136386871337891, "logps/chosen": -122.10968017578125, "logps/rejected": -132.62310791015625, "loss": 117278.4, "rewards/accuracies": 0.5, "rewards/chosen": -0.000500369758810848, "rewards/margins": -7.479538908228278e-05, "rewards/rejected": -0.0004255743697285652, "step": 910 }, { "epoch": 0.09308914297278155, "grad_norm": 190364.88573811995, "learning_rate": 4.6511627906976743e-07, "logits/chosen": -4.119508743286133, "logits/rejected": -4.053428649902344, "logps/chosen": -72.90638732910156, "logps/rejected": -62.0927848815918, "loss": 105714.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0004819636233150959, "rewards/margins": 0.00018739425286184996, "rewards/rejected": -0.000669357948936522, "step": 920 }, { "epoch": 0.09410098148335526, "grad_norm": 4772890.149995768, "learning_rate": 4.701718907987866e-07, "logits/chosen": -3.911916732788086, "logits/rejected": -3.873152256011963, "logps/chosen": -90.25139617919922, "logps/rejected": -106.4377212524414, "loss": 119826.75, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0005738490144722164, "rewards/margins": 0.0007181967375800014, "rewards/rejected": -0.0012920459266752005, "step": 930 }, { "epoch": 0.09511281999392897, "grad_norm": 188912.61183273976, "learning_rate": 4.7522750252780586e-07, "logits/chosen": -3.7851920127868652, "logits/rejected": -3.829699754714966, "logps/chosen": -93.10279846191406, "logps/rejected": -97.0986099243164, "loss": 119543.5125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0006792129133827984, "rewards/margins": 7.087735866662115e-05, "rewards/rejected": -0.0007500902866013348, "step": 940 }, { "epoch": 0.09612465850450268, "grad_norm": 243219.84078530275, "learning_rate": 4.802831142568251e-07, "logits/chosen": -4.285732746124268, "logits/rejected": -4.319547176361084, "logps/chosen": -103.0975112915039, "logps/rejected": -97.14813232421875, "loss": 116598.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0007636704249307513, "rewards/margins": 0.00020572729408740997, "rewards/rejected": -0.0009693976608105004, "step": 950 }, { "epoch": 0.0971364970150764, "grad_norm": 314583.84685758024, "learning_rate": 4.853387259858443e-07, "logits/chosen": -3.9145922660827637, "logits/rejected": -4.074250221252441, "logps/chosen": -123.77119445800781, "logps/rejected": -110.5106430053711, "loss": 121326.025, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005679583991877735, "rewards/margins": 0.0003315097710583359, "rewards/rejected": -0.0008994681993499398, "step": 960 }, { "epoch": 0.09814833552565011, "grad_norm": 116804.78517690097, "learning_rate": 4.903943377148635e-07, "logits/chosen": -3.8666579723358154, "logits/rejected": -3.9234185218811035, "logps/chosen": -108.4072494506836, "logps/rejected": -130.3223419189453, "loss": 126734.075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0007635747897438705, "rewards/margins": 0.00022209924645721912, "rewards/rejected": -0.0009856739779934287, "step": 970 }, { "epoch": 0.09916017403622382, "grad_norm": 217634.34485126633, "learning_rate": 4.954499494438827e-07, "logits/chosen": -3.9971466064453125, "logits/rejected": -3.8893535137176514, "logps/chosen": -104.3780288696289, "logps/rejected": -118.33424377441406, "loss": 112446.8625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0006970642716623843, "rewards/margins": 8.479790994897485e-05, "rewards/rejected": -0.00078186223981902, "step": 980 }, { "epoch": 0.10017201254679753, "grad_norm": 136676.02292513885, "learning_rate": 4.99943782325163e-07, "logits/chosen": -4.009482383728027, "logits/rejected": -4.206858158111572, "logps/chosen": -56.4992790222168, "logps/rejected": -94.554931640625, "loss": 111529.275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.000787440687417984, "rewards/margins": 0.00043015400297008455, "rewards/rejected": -0.001217594719491899, "step": 990 }, { "epoch": 0.10118385105737124, "grad_norm": 282076.66178895254, "learning_rate": 4.993816055767933e-07, "logits/chosen": -4.315939903259277, "logits/rejected": -4.416481018066406, "logps/chosen": -84.02754974365234, "logps/rejected": -82.01793670654297, "loss": 90632.8813, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008166494080796838, "rewards/margins": 0.00015698290371801704, "rewards/rejected": -0.0009736322681419551, "step": 1000 }, { "epoch": 0.10219568956794496, "grad_norm": 270495.250815213, "learning_rate": 4.988194288284237e-07, "logits/chosen": -4.028949737548828, "logits/rejected": -3.908412456512451, "logps/chosen": -70.65345764160156, "logps/rejected": -98.29560852050781, "loss": 124507.2, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0010287861805409193, "rewards/margins": 9.499984298599884e-05, "rewards/rejected": -0.001123785856179893, "step": 1010 }, { "epoch": 0.10320752807851867, "grad_norm": 220543.64081696144, "learning_rate": 4.982572520800539e-07, "logits/chosen": -4.438923358917236, "logits/rejected": -4.359255790710449, "logps/chosen": -68.58352661132812, "logps/rejected": -78.6388168334961, "loss": 129903.1125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0009885670151561499, "rewards/margins": 0.00033462810097262263, "rewards/rejected": -0.001323195407167077, "step": 1020 }, { "epoch": 0.10421936658909238, "grad_norm": 429325.69175542536, "learning_rate": 4.976950753316843e-07, "logits/chosen": -4.095893859863281, "logits/rejected": -4.111529350280762, "logps/chosen": -146.13893127441406, "logps/rejected": -161.24984741210938, "loss": 119516.1375, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0010872108396142721, "rewards/margins": 0.00011753063881769776, "rewards/rejected": -0.001204741420224309, "step": 1030 }, { "epoch": 0.10523120509966609, "grad_norm": 119213.43941778033, "learning_rate": 4.971328985833146e-07, "logits/chosen": -4.227572441101074, "logits/rejected": -4.261704921722412, "logps/chosen": -70.33900451660156, "logps/rejected": -86.19837951660156, "loss": 120785.4125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0009618139592930675, "rewards/margins": 0.0003808088949881494, "rewards/rejected": -0.001342622796073556, "step": 1040 }, { "epoch": 0.10624304361023981, "grad_norm": 334815.43817025534, "learning_rate": 4.965707218349449e-07, "logits/chosen": -3.819000720977783, "logits/rejected": -3.752131223678589, "logps/chosen": -99.50486755371094, "logps/rejected": -122.65846252441406, "loss": 113318.5625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0010588231962174177, "rewards/margins": -0.00012557515583466738, "rewards/rejected": -0.0009332479676231742, "step": 1050 }, { "epoch": 0.10725488212081352, "grad_norm": 346241.3470961811, "learning_rate": 4.960085450865752e-07, "logits/chosen": -4.043279647827148, "logits/rejected": -3.9398789405822754, "logps/chosen": -73.0541000366211, "logps/rejected": -93.44039916992188, "loss": 102616.6938, "rewards/accuracies": 0.75, "rewards/chosen": -0.0011278989259153605, "rewards/margins": 0.0007243291474878788, "rewards/rejected": -0.0018522279569879174, "step": 1060 }, { "epoch": 0.10826672063138723, "grad_norm": 272545.27257603017, "learning_rate": 4.954463683382056e-07, "logits/chosen": -4.283538818359375, "logits/rejected": -4.265091896057129, "logps/chosen": -116.859130859375, "logps/rejected": -95.63941192626953, "loss": 112764.4, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0011635724222287536, "rewards/margins": 8.715410513104871e-05, "rewards/rejected": -0.0012507265200838447, "step": 1070 }, { "epoch": 0.10927855914196094, "grad_norm": 311380.85901774967, "learning_rate": 4.948841915898358e-07, "logits/chosen": -4.164912223815918, "logits/rejected": -4.245726585388184, "logps/chosen": -80.79125213623047, "logps/rejected": -95.61489868164062, "loss": 125844.6125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0013993822503834963, "rewards/margins": 0.00045929610496386886, "rewards/rejected": -0.0018586782971397042, "step": 1080 }, { "epoch": 0.11029039765253465, "grad_norm": 256568.04874269618, "learning_rate": 4.943220148414661e-07, "logits/chosen": -4.06484842300415, "logits/rejected": -4.096969127655029, "logps/chosen": -71.0582275390625, "logps/rejected": -103.97459411621094, "loss": 113698.3625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0015740857925266027, "rewards/margins": 8.061521657509729e-05, "rewards/rejected": -0.0016547009581699967, "step": 1090 }, { "epoch": 0.11130223616310837, "grad_norm": 333589.2976045888, "learning_rate": 4.937598380930965e-07, "logits/chosen": -3.9437575340270996, "logits/rejected": -3.8375630378723145, "logps/chosen": -79.3320541381836, "logps/rejected": -120.8829345703125, "loss": 123360.925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0014920301036909223, "rewards/margins": 0.00032946199644356966, "rewards/rejected": -0.001821492100134492, "step": 1100 }, { "epoch": 0.11231407467368208, "grad_norm": 186678.55177408687, "learning_rate": 4.931976613447267e-07, "logits/chosen": -3.7408955097198486, "logits/rejected": -3.802854061126709, "logps/chosen": -100.07715606689453, "logps/rejected": -96.84384155273438, "loss": 117806.3875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0018541684839874506, "rewards/margins": 0.0003147349343635142, "rewards/rejected": -0.002168903360143304, "step": 1110 }, { "epoch": 0.11332591318425579, "grad_norm": 165923.63825263642, "learning_rate": 4.926354845963571e-07, "logits/chosen": -3.757504940032959, "logits/rejected": -3.7291667461395264, "logps/chosen": -67.39073181152344, "logps/rejected": -78.77232360839844, "loss": 120356.8, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0014546301681548357, "rewards/margins": -0.00016837325529195368, "rewards/rejected": -0.0012862568255513906, "step": 1120 }, { "epoch": 0.1143377516948295, "grad_norm": 413450.7184880102, "learning_rate": 4.920733078479874e-07, "logits/chosen": -4.2089996337890625, "logits/rejected": -4.219716548919678, "logps/chosen": -94.91521453857422, "logps/rejected": -121.3793716430664, "loss": 128221.0875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0013026389060541987, "rewards/margins": 0.0003200122155249119, "rewards/rejected": -0.0016226511215791106, "step": 1130 }, { "epoch": 0.11534959020540322, "grad_norm": 540.7081836913974, "learning_rate": 4.915111310996177e-07, "logits/chosen": -3.7364563941955566, "logits/rejected": -3.732656955718994, "logps/chosen": -78.72477722167969, "logps/rejected": -105.9988021850586, "loss": 108036.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.001381844049319625, "rewards/margins": 0.0003017765993718058, "rewards/rejected": -0.0016836205031722784, "step": 1140 }, { "epoch": 0.11636142871597693, "grad_norm": 160808.88059841492, "learning_rate": 4.90948954351248e-07, "logits/chosen": -4.029149055480957, "logits/rejected": -3.904705047607422, "logps/chosen": -60.20563507080078, "logps/rejected": -83.43788146972656, "loss": 124160.2375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0017772444989532232, "rewards/margins": 0.00013463865616358817, "rewards/rejected": -0.0019118832424283028, "step": 1150 }, { "epoch": 0.11737326722655064, "grad_norm": 291506.6891671458, "learning_rate": 4.903867776028783e-07, "logits/chosen": -4.102453708648682, "logits/rejected": -4.128325939178467, "logps/chosen": -58.828948974609375, "logps/rejected": -64.96825408935547, "loss": 105803.2125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0011480475077405572, "rewards/margins": 0.0004561108653433621, "rewards/rejected": -0.0016041581984609365, "step": 1160 }, { "epoch": 0.11838510573712435, "grad_norm": 95711.47575386387, "learning_rate": 4.898246008545087e-07, "logits/chosen": -3.5265846252441406, "logits/rejected": -3.551593065261841, "logps/chosen": -54.99864959716797, "logps/rejected": -80.023193359375, "loss": 109764.275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.001392826670780778, "rewards/margins": 0.0004531313315965235, "rewards/rejected": -0.0018459579441696405, "step": 1170 }, { "epoch": 0.11939694424769806, "grad_norm": 197915.17485030237, "learning_rate": 4.892624241061389e-07, "logits/chosen": -4.017637729644775, "logits/rejected": -4.118896007537842, "logps/chosen": -86.74040222167969, "logps/rejected": -99.92372131347656, "loss": 113113.4625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0012234419118613005, "rewards/margins": 0.0005018307128921151, "rewards/rejected": -0.0017252726247534156, "step": 1180 }, { "epoch": 0.12040878275827178, "grad_norm": 192499.0131241805, "learning_rate": 4.887002473577693e-07, "logits/chosen": -4.391817569732666, "logits/rejected": -4.432112693786621, "logps/chosen": -78.59744262695312, "logps/rejected": -316.09405517578125, "loss": 126020.0875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.002040825318545103, "rewards/margins": -0.00017552505596540868, "rewards/rejected": -0.001865300117060542, "step": 1190 }, { "epoch": 0.1214206212688455, "grad_norm": 133085.6145038344, "learning_rate": 4.881380706093996e-07, "logits/chosen": -4.013171195983887, "logits/rejected": -4.15698766708374, "logps/chosen": -78.74140930175781, "logps/rejected": -83.27415466308594, "loss": 90145.2812, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.001593960216268897, "rewards/margins": 0.00025647395523265004, "rewards/rejected": -0.001850434229709208, "step": 1200 }, { "epoch": 0.1224324597794192, "grad_norm": 233011.01459222703, "learning_rate": 4.875758938610299e-07, "logits/chosen": -4.009166717529297, "logits/rejected": -4.08519172668457, "logps/chosen": -52.803611755371094, "logps/rejected": -133.89816284179688, "loss": 114620.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.001535617047920823, "rewards/margins": 0.0008856039494276047, "rewards/rejected": -0.0024212209973484278, "step": 1210 }, { "epoch": 0.12344429828999291, "grad_norm": 189521.48263492508, "learning_rate": 4.870137171126602e-07, "logits/chosen": -3.868739366531372, "logits/rejected": -3.937281847000122, "logps/chosen": -60.612388610839844, "logps/rejected": -87.3714828491211, "loss": 116136.7875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0014301678165793419, "rewards/margins": 0.000758271780796349, "rewards/rejected": -0.002188439480960369, "step": 1220 }, { "epoch": 0.12445613680056664, "grad_norm": 1409078.3481591968, "learning_rate": 4.864515403642905e-07, "logits/chosen": -3.4285855293273926, "logits/rejected": -3.2295989990234375, "logps/chosen": -239.4199676513672, "logps/rejected": -278.4762878417969, "loss": 123550.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0011258742306381464, "rewards/margins": 0.0003020817239303142, "rewards/rejected": -0.0014279558090493083, "step": 1230 }, { "epoch": 0.12546797531114035, "grad_norm": 399364.98303044075, "learning_rate": 4.858893636159208e-07, "logits/chosen": -4.277926445007324, "logits/rejected": -4.270390510559082, "logps/chosen": -101.40743255615234, "logps/rejected": -130.08621215820312, "loss": 134019.85, "rewards/accuracies": 0.75, "rewards/chosen": -0.002458154922351241, "rewards/margins": 0.0006810495397076011, "rewards/rejected": -0.003139204578474164, "step": 1240 }, { "epoch": 0.12647981382171405, "grad_norm": 213471.25321079476, "learning_rate": 4.853271868675511e-07, "logits/chosen": -3.903860569000244, "logits/rejected": -3.985483169555664, "logps/chosen": -38.732704162597656, "logps/rejected": -91.85098266601562, "loss": 117426.2375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0013733479427173734, "rewards/margins": 0.0010582536924630404, "rewards/rejected": -0.0024316017515957355, "step": 1250 }, { "epoch": 0.12749165233228776, "grad_norm": 200238.85005882665, "learning_rate": 4.847650101191815e-07, "logits/chosen": -3.826470136642456, "logits/rejected": -3.885911464691162, "logps/chosen": -109.8686294555664, "logps/rejected": -135.4347381591797, "loss": 132820.6375, "rewards/accuracies": 0.5, "rewards/chosen": -0.002316161058843136, "rewards/margins": 0.0004742151068057865, "rewards/rejected": -0.002790376078337431, "step": 1260 }, { "epoch": 0.12850349084286147, "grad_norm": 400267.35946190543, "learning_rate": 4.842028333708117e-07, "logits/chosen": -4.259737014770508, "logits/rejected": -4.36344051361084, "logps/chosen": -70.9277572631836, "logps/rejected": -72.38328552246094, "loss": 111187.8875, "rewards/accuracies": 0.5, "rewards/chosen": -0.002010022522881627, "rewards/margins": 0.0002968419867102057, "rewards/rejected": -0.0023068648297339678, "step": 1270 }, { "epoch": 0.12951532935343518, "grad_norm": 207698.23962602197, "learning_rate": 4.836406566224421e-07, "logits/chosen": -3.9796853065490723, "logits/rejected": -4.01785945892334, "logps/chosen": -129.58004760742188, "logps/rejected": -117.6347427368164, "loss": 109503.4, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.002062011044472456, "rewards/margins": 0.000328147376421839, "rewards/rejected": -0.002390158362686634, "step": 1280 }, { "epoch": 0.1305271678640089, "grad_norm": 238638.90961228753, "learning_rate": 4.830784798740724e-07, "logits/chosen": -4.097420692443848, "logits/rejected": -4.236047744750977, "logps/chosen": -91.17997741699219, "logps/rejected": -91.8326644897461, "loss": 112670.625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0020105503499507904, "rewards/margins": 0.00019929236441385, "rewards/rejected": -0.002209842437878251, "step": 1290 }, { "epoch": 0.13153900637458263, "grad_norm": 178040.8644974504, "learning_rate": 4.825163031257026e-07, "logits/chosen": -4.072221755981445, "logits/rejected": -4.007314205169678, "logps/chosen": -69.40257263183594, "logps/rejected": -89.54090881347656, "loss": 118365.1375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002298684325069189, "rewards/margins": 0.0005499212420545518, "rewards/rejected": -0.002848605392500758, "step": 1300 }, { "epoch": 0.13255084488515634, "grad_norm": 230618.2687216013, "learning_rate": 4.81954126377333e-07, "logits/chosen": -4.178137302398682, "logits/rejected": -4.119972229003906, "logps/chosen": -60.28234100341797, "logps/rejected": -87.38555908203125, "loss": 124005.975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0022037059534341097, "rewards/margins": 0.0008377780904993415, "rewards/rejected": -0.0030414839275181293, "step": 1310 }, { "epoch": 0.13356268339573005, "grad_norm": 237121.2955546329, "learning_rate": 4.813919496289633e-07, "logits/chosen": -3.3955371379852295, "logits/rejected": -3.4964001178741455, "logps/chosen": -59.37626266479492, "logps/rejected": -77.12128448486328, "loss": 119031.7125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0024296201299875975, "rewards/margins": 0.0006563514471054077, "rewards/rejected": -0.003085971577093005, "step": 1320 }, { "epoch": 0.13457452190630376, "grad_norm": 96340.42200638888, "learning_rate": 4.808297728805936e-07, "logits/chosen": -4.250865459442139, "logits/rejected": -4.229079246520996, "logps/chosen": -75.57382202148438, "logps/rejected": -93.5732421875, "loss": 125522.4625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002417216543108225, "rewards/margins": 0.0005665208445861936, "rewards/rejected": -0.0029837372712790966, "step": 1330 }, { "epoch": 0.13558636041687747, "grad_norm": 253628.91997350715, "learning_rate": 4.802675961322239e-07, "logits/chosen": -4.142685413360596, "logits/rejected": -4.067065238952637, "logps/chosen": -65.46693420410156, "logps/rejected": -84.69117736816406, "loss": 123979.3, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021675152238458395, "rewards/margins": 0.000524309347383678, "rewards/rejected": -0.0026918251533061266, "step": 1340 }, { "epoch": 0.13659819892745118, "grad_norm": 396325.2150547264, "learning_rate": 4.797054193838543e-07, "logits/chosen": -4.27273416519165, "logits/rejected": -4.311891555786133, "logps/chosen": -80.87626647949219, "logps/rejected": -137.62350463867188, "loss": 114949.1375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0021735928021371365, "rewards/margins": 0.0009866777108982205, "rewards/rejected": -0.003160270396620035, "step": 1350 }, { "epoch": 0.13761003743802488, "grad_norm": 277055.3901111539, "learning_rate": 4.791432426354845e-07, "logits/chosen": -4.529768943786621, "logits/rejected": -4.605081558227539, "logps/chosen": -61.59954833984375, "logps/rejected": -81.42635345458984, "loss": 126453.1875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.002782776253297925, "rewards/margins": 0.00021325847774278373, "rewards/rejected": -0.002996035385876894, "step": 1360 }, { "epoch": 0.1386218759485986, "grad_norm": 109710.1589626028, "learning_rate": 4.785810658871149e-07, "logits/chosen": -4.267306327819824, "logits/rejected": -4.065680027008057, "logps/chosen": -52.71366500854492, "logps/rejected": -61.97577667236328, "loss": 114226.7, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0022667930461466312, "rewards/margins": -0.00013912706344854087, "rewards/rejected": -0.0021276657935231924, "step": 1370 }, { "epoch": 0.1396337144591723, "grad_norm": 212352.0516607091, "learning_rate": 4.780188891387452e-07, "logits/chosen": -3.5670928955078125, "logits/rejected": -3.8216958045959473, "logps/chosen": -86.52713012695312, "logps/rejected": -114.81269836425781, "loss": 109738.0875, "rewards/accuracies": 0.5, "rewards/chosen": -0.0028053114656358957, "rewards/margins": 0.00032988196471706033, "rewards/rejected": -0.003135193604975939, "step": 1380 }, { "epoch": 0.14064555296974604, "grad_norm": 173491.48331909833, "learning_rate": 4.774567123903755e-07, "logits/chosen": -4.366482734680176, "logits/rejected": -4.150587558746338, "logps/chosen": -89.85624694824219, "logps/rejected": -74.56192779541016, "loss": 122029.4125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0023434595204889774, "rewards/margins": -0.00020478358783293515, "rewards/rejected": -0.0021386758890002966, "step": 1390 }, { "epoch": 0.14165739148031975, "grad_norm": 340561.2438646425, "learning_rate": 4.768945356420058e-07, "logits/chosen": -4.532942771911621, "logits/rejected": -4.443455696105957, "logps/chosen": -59.0555305480957, "logps/rejected": -79.49029541015625, "loss": 108440.1125, "rewards/accuracies": 0.5, "rewards/chosen": -0.0022386261261999607, "rewards/margins": 0.00017971015768125653, "rewards/rejected": -0.0024183364585042, "step": 1400 }, { "epoch": 0.14266922999089346, "grad_norm": 127236.81737985682, "learning_rate": 4.763323588936362e-07, "logits/chosen": -3.9243946075439453, "logits/rejected": -3.9819324016571045, "logps/chosen": -68.21260070800781, "logps/rejected": -92.05513000488281, "loss": 113291.15, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0021132181864231825, "rewards/margins": 0.0006459690630435944, "rewards/rejected": -0.002759187016636133, "step": 1410 }, { "epoch": 0.14368106850146717, "grad_norm": 101463.18039177092, "learning_rate": 4.757701821452664e-07, "logits/chosen": -4.129014492034912, "logits/rejected": -3.9459919929504395, "logps/chosen": -63.69234085083008, "logps/rejected": -89.98728942871094, "loss": 114706.9875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002005478832870722, "rewards/margins": 0.0007894946029409766, "rewards/rejected": -0.002794973086565733, "step": 1420 }, { "epoch": 0.14469290701204088, "grad_norm": 337341.4604701612, "learning_rate": 4.7520800539689676e-07, "logits/chosen": -4.289007186889648, "logits/rejected": -4.233653545379639, "logps/chosen": -65.11180114746094, "logps/rejected": -92.5044937133789, "loss": 114210.4625, "rewards/accuracies": 0.5, "rewards/chosen": -0.003049281658604741, "rewards/margins": 0.000174131739186123, "rewards/rejected": -0.0032234136015176773, "step": 1430 }, { "epoch": 0.1457047455226146, "grad_norm": 314386.22507871786, "learning_rate": 4.746458286485271e-07, "logits/chosen": -4.579699516296387, "logits/rejected": -4.501997470855713, "logps/chosen": -59.3968391418457, "logps/rejected": -119.77976989746094, "loss": 108199.0375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002428931649774313, "rewards/margins": 0.0013033599825575948, "rewards/rejected": -0.0037322912830859423, "step": 1440 }, { "epoch": 0.1467165840331883, "grad_norm": 199753.91103362554, "learning_rate": 4.7408365190015735e-07, "logits/chosen": -4.411381721496582, "logits/rejected": -4.393097877502441, "logps/chosen": -78.14238739013672, "logps/rejected": -68.27903747558594, "loss": 122014.525, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0029385939706116915, "rewards/margins": -1.9673758288263343e-05, "rewards/rejected": -0.002918920246884227, "step": 1450 }, { "epoch": 0.147728422543762, "grad_norm": 195876.4388793357, "learning_rate": 4.735214751517877e-07, "logits/chosen": -4.847861289978027, "logits/rejected": -4.821971893310547, "logps/chosen": -68.38887023925781, "logps/rejected": -94.08131408691406, "loss": 102597.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0029975702054798603, "rewards/margins": 0.0008448130683973432, "rewards/rejected": -0.003842382924631238, "step": 1460 }, { "epoch": 0.14874026105433572, "grad_norm": 300194.5567438234, "learning_rate": 4.7295929840341804e-07, "logits/chosen": -4.440006256103516, "logits/rejected": -4.441173076629639, "logps/chosen": -46.514957427978516, "logps/rejected": -74.5523452758789, "loss": 115516.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00369494641199708, "rewards/margins": -0.0001918985799420625, "rewards/rejected": -0.0035030480939894915, "step": 1470 }, { "epoch": 0.14975209956490945, "grad_norm": 191384.44704145443, "learning_rate": 4.723971216550483e-07, "logits/chosen": -4.301224231719971, "logits/rejected": -4.37109375, "logps/chosen": -80.74003601074219, "logps/rejected": -86.65975189208984, "loss": 126105.1875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036367543507367373, "rewards/margins": 9.753355698194355e-05, "rewards/rejected": -0.0037342875730246305, "step": 1480 }, { "epoch": 0.15076393807548316, "grad_norm": 189308.3669063108, "learning_rate": 4.718349449066786e-07, "logits/chosen": -4.257768630981445, "logits/rejected": -4.276392936706543, "logps/chosen": -60.5390510559082, "logps/rejected": -65.59141540527344, "loss": 117925.1625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0022234520874917507, "rewards/margins": 0.0007476452738046646, "rewards/rejected": -0.0029710973612964153, "step": 1490 }, { "epoch": 0.15177577658605687, "grad_norm": 163470.5384006441, "learning_rate": 4.7127276815830897e-07, "logits/chosen": -4.2732038497924805, "logits/rejected": -4.159806251525879, "logps/chosen": -96.32408142089844, "logps/rejected": -80.67424774169922, "loss": 107414.125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002951942151412368, "rewards/margins": 0.0005791998119093478, "rewards/rejected": -0.0035311419051140547, "step": 1500 }, { "epoch": 0.15278761509663058, "grad_norm": 195112.9472936429, "learning_rate": 4.7071059140993926e-07, "logits/chosen": -4.4413862228393555, "logits/rejected": -4.511037349700928, "logps/chosen": -82.16609191894531, "logps/rejected": -85.10350799560547, "loss": 107049.975, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0029795649461448193, "rewards/margins": -0.0003335486399009824, "rewards/rejected": -0.0026460164226591587, "step": 1510 }, { "epoch": 0.1537994536072043, "grad_norm": 219819.97864166374, "learning_rate": 4.7014841466156955e-07, "logits/chosen": -4.145008563995361, "logits/rejected": -4.145150184631348, "logps/chosen": -72.13661193847656, "logps/rejected": -70.73995208740234, "loss": 100945.65, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0028055261354893446, "rewards/margins": -0.00012280834198463708, "rewards/rejected": -0.0026827179826796055, "step": 1520 }, { "epoch": 0.154811292117778, "grad_norm": 387176.9834502504, "learning_rate": 4.695862379131999e-07, "logits/chosen": -3.5974159240722656, "logits/rejected": -3.4801688194274902, "logps/chosen": -65.54615020751953, "logps/rejected": -131.7211151123047, "loss": 114512.175, "rewards/accuracies": 0.5, "rewards/chosen": -0.003558442695066333, "rewards/margins": 0.0005630885134451091, "rewards/rejected": -0.00412153173238039, "step": 1530 }, { "epoch": 0.1558231306283517, "grad_norm": 265093.7827472936, "learning_rate": 4.690240611648302e-07, "logits/chosen": -4.002167701721191, "logits/rejected": -4.017189979553223, "logps/chosen": -80.39664459228516, "logps/rejected": -114.8666000366211, "loss": 131032.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.003226993605494499, "rewards/margins": 0.000310246687149629, "rewards/rejected": -0.003537239972501993, "step": 1540 }, { "epoch": 0.15683496913892542, "grad_norm": 161304.9419064831, "learning_rate": 4.684618844164605e-07, "logits/chosen": -3.8528435230255127, "logits/rejected": -3.9004955291748047, "logps/chosen": -85.77623748779297, "logps/rejected": -81.51467895507812, "loss": 122413.5, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.002765300450846553, "rewards/margins": 0.00018229690613225102, "rewards/rejected": -0.002947597298771143, "step": 1550 }, { "epoch": 0.15784680764949913, "grad_norm": 385839.01853998844, "learning_rate": 4.6789970766809083e-07, "logits/chosen": -4.098940849304199, "logits/rejected": -3.9840729236602783, "logps/chosen": -110.5191879272461, "logps/rejected": -151.79396057128906, "loss": 117807.375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003114499617367983, "rewards/margins": 0.0005868591251783073, "rewards/rejected": -0.0037013590335845947, "step": 1560 }, { "epoch": 0.15885864616007286, "grad_norm": 109740.44486198448, "learning_rate": 4.673375309197211e-07, "logits/chosen": -4.600703716278076, "logits/rejected": -4.495879650115967, "logps/chosen": -67.69212341308594, "logps/rejected": -68.3251953125, "loss": 120191.025, "rewards/accuracies": 0.5, "rewards/chosen": -0.002769334940239787, "rewards/margins": 0.0002543455339036882, "rewards/rejected": -0.00302368076518178, "step": 1570 }, { "epoch": 0.15987048467064657, "grad_norm": 256369.554948462, "learning_rate": 4.6677535417135147e-07, "logits/chosen": -3.8854293823242188, "logits/rejected": -3.7836883068084717, "logps/chosen": -95.18055725097656, "logps/rejected": -125.2585220336914, "loss": 107545.7875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0031250473111867905, "rewards/margins": 0.0007601731922477484, "rewards/rejected": -0.0038852207362651825, "step": 1580 }, { "epoch": 0.16088232318122028, "grad_norm": 159227.52820885702, "learning_rate": 4.6621317742298176e-07, "logits/chosen": -4.170536518096924, "logits/rejected": -4.070335388183594, "logps/chosen": -107.2885513305664, "logps/rejected": -99.94802856445312, "loss": 122276.7875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0035220072604715824, "rewards/margins": -5.590973887592554e-05, "rewards/rejected": -0.0034660971723496914, "step": 1590 }, { "epoch": 0.161894161691794, "grad_norm": 286388.66657985724, "learning_rate": 4.656510006746121e-07, "logits/chosen": -3.6643612384796143, "logits/rejected": -3.7156288623809814, "logps/chosen": -68.43424224853516, "logps/rejected": -93.03721618652344, "loss": 113322.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0024577316362410784, "rewards/margins": 0.0009059511357918382, "rewards/rejected": -0.003363682422786951, "step": 1600 }, { "epoch": 0.1629060002023677, "grad_norm": 186931.33886511953, "learning_rate": 4.650888239262424e-07, "logits/chosen": -3.9161739349365234, "logits/rejected": -4.00060510635376, "logps/chosen": -64.67398834228516, "logps/rejected": -108.53431701660156, "loss": 130030.8625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.002350736176595092, "rewards/margins": 0.001963825896382332, "rewards/rejected": -0.00431456184014678, "step": 1610 }, { "epoch": 0.1639178387129414, "grad_norm": 165255.21560736455, "learning_rate": 4.645266471778727e-07, "logits/chosen": -4.0226569175720215, "logits/rejected": -4.066514492034912, "logps/chosen": -71.673095703125, "logps/rejected": -71.8808364868164, "loss": 106046.525, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00177630758844316, "rewards/margins": 0.00028537274920381606, "rewards/rejected": -0.0020616804249584675, "step": 1620 }, { "epoch": 0.16492967722351512, "grad_norm": 211266.3542937618, "learning_rate": 4.6396447042950303e-07, "logits/chosen": -3.8165085315704346, "logits/rejected": -3.895458936691284, "logps/chosen": -108.13105773925781, "logps/rejected": -101.7791748046875, "loss": 112921.0875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0033067017793655396, "rewards/margins": -8.912711928132921e-05, "rewards/rejected": -0.0032175746746361256, "step": 1630 }, { "epoch": 0.16594151573408883, "grad_norm": 325958.1449494181, "learning_rate": 4.634022936811333e-07, "logits/chosen": -4.315334320068359, "logits/rejected": -4.264525413513184, "logps/chosen": -62.4387321472168, "logps/rejected": -69.93623352050781, "loss": 120264.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00270082731731236, "rewards/margins": 0.0004997997893951833, "rewards/rejected": -0.0032006266992539167, "step": 1640 }, { "epoch": 0.16695335424466254, "grad_norm": 177781.92604398407, "learning_rate": 4.6284011693276367e-07, "logits/chosen": -4.315485954284668, "logits/rejected": -4.309866905212402, "logps/chosen": -102.03514862060547, "logps/rejected": -133.59927368164062, "loss": 122749.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002910839393734932, "rewards/margins": 0.0005180056323297322, "rewards/rejected": -0.0034288447350263596, "step": 1650 }, { "epoch": 0.16796519275523628, "grad_norm": 333600.11008013313, "learning_rate": 4.6227794018439396e-07, "logits/chosen": -4.160773754119873, "logits/rejected": -4.172965049743652, "logps/chosen": -58.295135498046875, "logps/rejected": -82.595458984375, "loss": 123003.7375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003491337178274989, "rewards/margins": 0.00022335420362651348, "rewards/rejected": -0.00371469184756279, "step": 1660 }, { "epoch": 0.16897703126580998, "grad_norm": 238373.40798182107, "learning_rate": 4.6171576343602426e-07, "logits/chosen": -4.578868389129639, "logits/rejected": -4.632213115692139, "logps/chosen": -88.15584564208984, "logps/rejected": -98.40928649902344, "loss": 104444.8125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.002960801823064685, "rewards/margins": 0.0008488811436109245, "rewards/rejected": -0.0038096830248832703, "step": 1670 }, { "epoch": 0.1699888697763837, "grad_norm": 221524.61266032586, "learning_rate": 4.611535866876546e-07, "logits/chosen": -4.36057186126709, "logits/rejected": -4.36978816986084, "logps/chosen": -78.83447265625, "logps/rejected": -101.93792724609375, "loss": 110996.7875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00374145177192986, "rewards/margins": 0.0003354550281073898, "rewards/rejected": -0.004076906479895115, "step": 1680 }, { "epoch": 0.1710007082869574, "grad_norm": 250857.65823711432, "learning_rate": 4.605914099392849e-07, "logits/chosen": -3.8065707683563232, "logits/rejected": -3.86968994140625, "logps/chosen": -114.9301986694336, "logps/rejected": -126.54632568359375, "loss": 121450.8375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.002999036107212305, "rewards/margins": 0.0015940092271193862, "rewards/rejected": -0.004593045450747013, "step": 1690 }, { "epoch": 0.1720125467975311, "grad_norm": 1530495.6817036716, "learning_rate": 4.600292331909152e-07, "logits/chosen": -4.130616188049316, "logits/rejected": -4.350535869598389, "logps/chosen": -101.69114685058594, "logps/rejected": -117.83624267578125, "loss": 112113.1, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0029338381718844175, "rewards/margins": 0.0011778674088418484, "rewards/rejected": -0.0041117058135569096, "step": 1700 }, { "epoch": 0.17302438530810482, "grad_norm": 237290.0630745643, "learning_rate": 4.5946705644254553e-07, "logits/chosen": -4.430675506591797, "logits/rejected": -4.4244771003723145, "logps/chosen": -89.71955871582031, "logps/rejected": -103.50233459472656, "loss": 116525.825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004373278934508562, "rewards/margins": 0.0005314871086739004, "rewards/rejected": -0.004904766567051411, "step": 1710 }, { "epoch": 0.17403622381867853, "grad_norm": 305397.9527423171, "learning_rate": 4.589048796941759e-07, "logits/chosen": -4.339646339416504, "logits/rejected": -4.275872707366943, "logps/chosen": -108.80708312988281, "logps/rejected": -149.89361572265625, "loss": 116866.3625, "rewards/accuracies": 0.5, "rewards/chosen": -0.003453589277341962, "rewards/margins": 0.00043439079308882356, "rewards/rejected": -0.003887980477884412, "step": 1720 }, { "epoch": 0.17504806232925224, "grad_norm": 220131.1997440626, "learning_rate": 4.583427029458061e-07, "logits/chosen": -4.036735534667969, "logits/rejected": -3.9514331817626953, "logps/chosen": -98.88370513916016, "logps/rejected": -90.13319396972656, "loss": 121913.6625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003441751003265381, "rewards/margins": 0.0005175228579901159, "rewards/rejected": -0.00395927345380187, "step": 1730 }, { "epoch": 0.17605990083982595, "grad_norm": 760226.0301937454, "learning_rate": 4.5778052619743646e-07, "logits/chosen": -3.5149264335632324, "logits/rejected": -3.462064743041992, "logps/chosen": -272.89373779296875, "logps/rejected": -305.7469787597656, "loss": 114547.25, "rewards/accuracies": 0.5, "rewards/chosen": -0.003498078091070056, "rewards/margins": 0.00080921093467623, "rewards/rejected": -0.004307289142161608, "step": 1740 }, { "epoch": 0.1770717393503997, "grad_norm": 117129.72333028629, "learning_rate": 4.572183494490668e-07, "logits/chosen": -3.8273658752441406, "logits/rejected": -3.682508945465088, "logps/chosen": -59.99755096435547, "logps/rejected": -59.99519729614258, "loss": 106971.9125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0031796179246157408, "rewards/margins": 0.0004405599902383983, "rewards/rejected": -0.0036201779730618, "step": 1750 }, { "epoch": 0.1780835778609734, "grad_norm": 272072.8367077394, "learning_rate": 4.5665617270069705e-07, "logits/chosen": -4.15725040435791, "logits/rejected": -4.136308193206787, "logps/chosen": -106.7020034790039, "logps/rejected": -102.91386413574219, "loss": 111329.9, "rewards/accuracies": 0.5, "rewards/chosen": -0.0041197557002305984, "rewards/margins": 5.376250555855222e-05, "rewards/rejected": -0.0041735186241567135, "step": 1760 }, { "epoch": 0.1790954163715471, "grad_norm": 229266.56081393073, "learning_rate": 4.560939959523274e-07, "logits/chosen": -4.315399646759033, "logits/rejected": -4.180730819702148, "logps/chosen": -85.40532684326172, "logps/rejected": -79.35671997070312, "loss": 112749.5375, "rewards/accuracies": 0.5, "rewards/chosen": -0.004056447651237249, "rewards/margins": 0.0002649584203027189, "rewards/rejected": -0.0043214065954089165, "step": 1770 }, { "epoch": 0.18010725488212082, "grad_norm": 323576.1793818995, "learning_rate": 4.5553181920395774e-07, "logits/chosen": -3.9041988849639893, "logits/rejected": -4.078243732452393, "logps/chosen": -118.8115234375, "logps/rejected": -123.0760269165039, "loss": 129648.325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.004096274264156818, "rewards/margins": 0.00011004717089235783, "rewards/rejected": -0.00420632166787982, "step": 1780 }, { "epoch": 0.18111909339269452, "grad_norm": 289130.4261111133, "learning_rate": 4.54969642455588e-07, "logits/chosen": -4.252322196960449, "logits/rejected": -4.418753623962402, "logps/chosen": -92.62886810302734, "logps/rejected": -114.2290267944336, "loss": 120883.025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0035058532375842333, "rewards/margins": 0.0005805862019769847, "rewards/rejected": -0.004086439497768879, "step": 1790 }, { "epoch": 0.18213093190326823, "grad_norm": 284493.01226751966, "learning_rate": 4.544074657072183e-07, "logits/chosen": -4.2020583152771, "logits/rejected": -4.198037147521973, "logps/chosen": -127.97209167480469, "logps/rejected": -124.34565734863281, "loss": 130530.325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005443344824016094, "rewards/margins": 9.451466758036986e-05, "rewards/rejected": -0.005537859629839659, "step": 1800 }, { "epoch": 0.18314277041384194, "grad_norm": 96146.49056216612, "learning_rate": 4.5384528895884867e-07, "logits/chosen": -4.182011604309082, "logits/rejected": -4.333140850067139, "logps/chosen": -133.1157989501953, "logps/rejected": -132.61439514160156, "loss": 125634.3875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004612096585333347, "rewards/margins": 0.0013182652182877064, "rewards/rejected": -0.005930361803621054, "step": 1810 }, { "epoch": 0.18415460892441565, "grad_norm": 614876.1536815623, "learning_rate": 4.5328311221047896e-07, "logits/chosen": -3.5780868530273438, "logits/rejected": -3.786602020263672, "logps/chosen": -148.16358947753906, "logps/rejected": -105.85920715332031, "loss": 118793.0375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0033023287542164326, "rewards/margins": -0.0002225384523626417, "rewards/rejected": -0.0030797901563346386, "step": 1820 }, { "epoch": 0.18516644743498936, "grad_norm": 258582.59813015428, "learning_rate": 4.5272093546210925e-07, "logits/chosen": -3.7572360038757324, "logits/rejected": -3.602447032928467, "logps/chosen": -63.567108154296875, "logps/rejected": -90.2384033203125, "loss": 106564.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004181220196187496, "rewards/margins": 0.000998824485577643, "rewards/rejected": -0.005180044565349817, "step": 1830 }, { "epoch": 0.1861782859455631, "grad_norm": 305812.67043652793, "learning_rate": 4.521587587137396e-07, "logits/chosen": -4.025088310241699, "logits/rejected": -3.827329158782959, "logps/chosen": -71.41111755371094, "logps/rejected": -112.443603515625, "loss": 136212.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004553740378469229, "rewards/margins": 0.0005904590943828225, "rewards/rejected": -0.005144199822098017, "step": 1840 }, { "epoch": 0.1871901244561368, "grad_norm": 101177.58413943832, "learning_rate": 4.515965819653699e-07, "logits/chosen": -4.255812168121338, "logits/rejected": -4.244455814361572, "logps/chosen": -116.70538330078125, "logps/rejected": -114.45893859863281, "loss": 122537.2, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002781398594379425, "rewards/margins": 0.0009059592848643661, "rewards/rejected": -0.003687357995659113, "step": 1850 }, { "epoch": 0.18820196296671052, "grad_norm": 255209.3077536103, "learning_rate": 4.510344052170002e-07, "logits/chosen": -3.9718494415283203, "logits/rejected": -3.797884464263916, "logps/chosen": -78.7676010131836, "logps/rejected": -81.39573669433594, "loss": 116897.3875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0044184112921357155, "rewards/margins": 0.000351707567460835, "rewards/rejected": -0.004770118743181229, "step": 1860 }, { "epoch": 0.18921380147728423, "grad_norm": 250935.98623572217, "learning_rate": 4.504722284686305e-07, "logits/chosen": -4.274395942687988, "logits/rejected": -4.348827362060547, "logps/chosen": -99.34766387939453, "logps/rejected": -98.19015502929688, "loss": 119858.1375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003814449068158865, "rewards/margins": 3.638419002527371e-06, "rewards/rejected": -0.0038180872797966003, "step": 1870 }, { "epoch": 0.19022563998785794, "grad_norm": 361655.5509153546, "learning_rate": 4.499100517202608e-07, "logits/chosen": -4.039985179901123, "logits/rejected": -4.0378499031066895, "logps/chosen": -75.35425567626953, "logps/rejected": -95.51576232910156, "loss": 112037.0, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003652235958725214, "rewards/margins": 0.0006578944157809019, "rewards/rejected": -0.00431013060733676, "step": 1880 }, { "epoch": 0.19123747849843165, "grad_norm": 185322.67687128112, "learning_rate": 4.4934787497189116e-07, "logits/chosen": -4.648663520812988, "logits/rejected": -4.529079437255859, "logps/chosen": -55.469566345214844, "logps/rejected": -71.42753601074219, "loss": 102226.8687, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.003563587088137865, "rewards/margins": 0.0004916454199701548, "rewards/rejected": -0.004055232275277376, "step": 1890 }, { "epoch": 0.19224931700900535, "grad_norm": 216634.1405472203, "learning_rate": 4.4878569822352146e-07, "logits/chosen": -4.1222944259643555, "logits/rejected": -4.0320234298706055, "logps/chosen": -87.47676086425781, "logps/rejected": -125.8713150024414, "loss": 120904.05, "rewards/accuracies": 0.5, "rewards/chosen": -0.003524667350575328, "rewards/margins": 0.0007985819247551262, "rewards/rejected": -0.004323249217122793, "step": 1900 }, { "epoch": 0.19326115551957906, "grad_norm": 679638.3591668615, "learning_rate": 4.4822352147515175e-07, "logits/chosen": -4.219275951385498, "logits/rejected": -4.226140975952148, "logps/chosen": -91.05177307128906, "logps/rejected": -78.14591979980469, "loss": 117222.35, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004045435693114996, "rewards/margins": -6.907393981236964e-05, "rewards/rejected": -0.003976361360400915, "step": 1910 }, { "epoch": 0.1942729940301528, "grad_norm": 312940.810039785, "learning_rate": 4.476613447267821e-07, "logits/chosen": -3.848017454147339, "logits/rejected": -3.7952423095703125, "logps/chosen": -97.63762664794922, "logps/rejected": -144.7411346435547, "loss": 131292.4875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003409839468076825, "rewards/margins": 0.0007996425265446305, "rewards/rejected": -0.0042094821110367775, "step": 1920 }, { "epoch": 0.1952848325407265, "grad_norm": 199901.10847603722, "learning_rate": 4.470991679784124e-07, "logits/chosen": -4.133519172668457, "logits/rejected": -4.062095642089844, "logps/chosen": -92.30223083496094, "logps/rejected": -126.0958023071289, "loss": 122853.775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0026543443091213703, "rewards/margins": 0.0015392573550343513, "rewards/rejected": -0.004193601198494434, "step": 1930 }, { "epoch": 0.19629667105130022, "grad_norm": 124091.82185423943, "learning_rate": 4.465369912300427e-07, "logits/chosen": -4.4327850341796875, "logits/rejected": -4.443601608276367, "logps/chosen": -93.95780181884766, "logps/rejected": -92.31599426269531, "loss": 118199.075, "rewards/accuracies": 0.5, "rewards/chosen": -0.0039774104952812195, "rewards/margins": 0.0004179617390036583, "rewards/rejected": -0.004395372234284878, "step": 1940 }, { "epoch": 0.19730850956187393, "grad_norm": 219080.72115723544, "learning_rate": 4.45974814481673e-07, "logits/chosen": -4.413301944732666, "logits/rejected": -4.296863555908203, "logps/chosen": -61.06609344482422, "logps/rejected": -88.39147186279297, "loss": 119966.8375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00368743808940053, "rewards/margins": 0.000739839393645525, "rewards/rejected": -0.004427277483046055, "step": 1950 }, { "epoch": 0.19832034807244764, "grad_norm": 384160.14508403087, "learning_rate": 4.4541263773330337e-07, "logits/chosen": -4.148560523986816, "logits/rejected": -3.978665590286255, "logps/chosen": -81.22920227050781, "logps/rejected": -79.18531799316406, "loss": 129017.875, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0050753019750118256, "rewards/margins": -0.0007293333183042705, "rewards/rejected": -0.0043459683656692505, "step": 1960 }, { "epoch": 0.19933218658302135, "grad_norm": 304070.5992002273, "learning_rate": 4.448504609849336e-07, "logits/chosen": -3.9476075172424316, "logits/rejected": -4.047904014587402, "logps/chosen": -160.3112030029297, "logps/rejected": -138.8908233642578, "loss": 134163.625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.005255360156297684, "rewards/margins": -7.728241325821728e-05, "rewards/rejected": -0.005178078077733517, "step": 1970 }, { "epoch": 0.20034402509359506, "grad_norm": 346983.09327507456, "learning_rate": 4.4428828423656395e-07, "logits/chosen": -4.272963523864746, "logits/rejected": -4.150063514709473, "logps/chosen": -77.32780456542969, "logps/rejected": -114.16392517089844, "loss": 133727.725, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0037946898955851793, "rewards/margins": 0.0012455100659281015, "rewards/rejected": -0.005040199961513281, "step": 1980 }, { "epoch": 0.20135586360416877, "grad_norm": 79245.72580884933, "learning_rate": 4.437261074881943e-07, "logits/chosen": -4.103560447692871, "logits/rejected": -3.9143099784851074, "logps/chosen": -66.20712280273438, "logps/rejected": -96.43231964111328, "loss": 105774.375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0032118745148181915, "rewards/margins": 0.0013805328635498881, "rewards/rejected": -0.004592407029122114, "step": 1990 }, { "epoch": 0.20236770211474248, "grad_norm": 211611.46333824677, "learning_rate": 4.4316393073982454e-07, "logits/chosen": -3.849123477935791, "logits/rejected": -3.702561140060425, "logps/chosen": -311.74078369140625, "logps/rejected": -321.86138916015625, "loss": 129922.95, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0031457100994884968, "rewards/margins": 0.0005301374476402998, "rewards/rejected": -0.0036758475471287966, "step": 2000 }, { "epoch": 0.2033795406253162, "grad_norm": 247957.3642478691, "learning_rate": 4.426017539914549e-07, "logits/chosen": -3.8594932556152344, "logits/rejected": -3.906545639038086, "logps/chosen": -128.96380615234375, "logps/rejected": -146.83409118652344, "loss": 123847.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004845390096306801, "rewards/margins": 0.000764570024330169, "rewards/rejected": -0.0056099602952599525, "step": 2010 }, { "epoch": 0.20439137913588992, "grad_norm": 312550.6439000465, "learning_rate": 4.4203957724308523e-07, "logits/chosen": -4.225411415100098, "logits/rejected": -4.1688761711120605, "logps/chosen": -124.68400573730469, "logps/rejected": -189.3055419921875, "loss": 131527.925, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004552851896733046, "rewards/margins": 4.434562288224697e-05, "rewards/rejected": -0.004597197286784649, "step": 2020 }, { "epoch": 0.20540321764646363, "grad_norm": 233804.8915827967, "learning_rate": 4.414774004947155e-07, "logits/chosen": -3.79594087600708, "logits/rejected": -3.9326088428497314, "logps/chosen": -77.58548736572266, "logps/rejected": -83.85520935058594, "loss": 94684.2625, "rewards/accuracies": 0.5, "rewards/chosen": -0.003335801884531975, "rewards/margins": 0.0011467259610071778, "rewards/rejected": -0.004482527729123831, "step": 2030 }, { "epoch": 0.20641505615703734, "grad_norm": 145049.50360814293, "learning_rate": 4.409152237463458e-07, "logits/chosen": -4.0499267578125, "logits/rejected": -4.019339084625244, "logps/chosen": -75.54109191894531, "logps/rejected": -91.1786880493164, "loss": 128316.825, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0037224148400127888, "rewards/margins": 0.0011719355825334787, "rewards/rejected": -0.004894350189715624, "step": 2040 }, { "epoch": 0.20742689466761105, "grad_norm": 228971.52224760654, "learning_rate": 4.4035304699797616e-07, "logits/chosen": -3.8797969818115234, "logits/rejected": -3.666975498199463, "logps/chosen": -54.26128387451172, "logps/rejected": -108.7796859741211, "loss": 108345.0875, "rewards/accuracies": 0.75, "rewards/chosen": -0.0028869197703897953, "rewards/margins": 0.0014332833234220743, "rewards/rejected": -0.004320203326642513, "step": 2050 }, { "epoch": 0.20843873317818476, "grad_norm": 208927.30154327134, "learning_rate": 4.397908702496065e-07, "logits/chosen": -4.288293361663818, "logits/rejected": -4.1937408447265625, "logps/chosen": -49.233734130859375, "logps/rejected": -61.98224639892578, "loss": 106400.25, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.003528166562318802, "rewards/margins": 0.0005813142051920295, "rewards/rejected": -0.004109479952603579, "step": 2060 }, { "epoch": 0.20945057168875847, "grad_norm": 364690.4556354069, "learning_rate": 4.3922869350123674e-07, "logits/chosen": -3.9217472076416016, "logits/rejected": -4.083714008331299, "logps/chosen": -98.38418579101562, "logps/rejected": -125.1998519897461, "loss": 115754.1125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0037692685145884752, "rewards/margins": 0.001440725987777114, "rewards/rejected": -0.005209994502365589, "step": 2070 }, { "epoch": 0.21046241019933218, "grad_norm": 220476.13449774298, "learning_rate": 4.386665167528671e-07, "logits/chosen": -3.737628221511841, "logits/rejected": -3.694453477859497, "logps/chosen": -66.05598449707031, "logps/rejected": -89.79369354248047, "loss": 123937.25, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0035935533232986927, "rewards/margins": 0.0020208400674164295, "rewards/rejected": -0.005614393390715122, "step": 2080 }, { "epoch": 0.2114742487099059, "grad_norm": 265838.33321902703, "learning_rate": 4.3810434000449743e-07, "logits/chosen": -3.9230716228485107, "logits/rejected": -4.058587551116943, "logps/chosen": -94.68692779541016, "logps/rejected": -103.76722717285156, "loss": 118981.225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003789677517488599, "rewards/margins": 0.0010761318262666464, "rewards/rejected": -0.004865809343755245, "step": 2090 }, { "epoch": 0.21248608722047962, "grad_norm": 230517.36711136842, "learning_rate": 4.3754216325612767e-07, "logits/chosen": -4.437249660491943, "logits/rejected": -4.283448219299316, "logps/chosen": -85.22029113769531, "logps/rejected": -101.94863891601562, "loss": 118424.2125, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.005315971095114946, "rewards/margins": 0.00020142775611020625, "rewards/rejected": -0.005517398007214069, "step": 2100 }, { "epoch": 0.21349792573105333, "grad_norm": 251345.91881093642, "learning_rate": 4.36979986507758e-07, "logits/chosen": -4.235365867614746, "logits/rejected": -3.9305214881896973, "logps/chosen": -79.26345825195312, "logps/rejected": -85.10787200927734, "loss": 125467.6, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004840161185711622, "rewards/margins": 0.000868795090354979, "rewards/rejected": -0.005708956625312567, "step": 2110 }, { "epoch": 0.21450976424162704, "grad_norm": 262867.0591696035, "learning_rate": 4.3641780975938836e-07, "logits/chosen": -4.00656270980835, "logits/rejected": -3.9954562187194824, "logps/chosen": -99.58901977539062, "logps/rejected": -111.31529235839844, "loss": 115315.7625, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035789161920547485, "rewards/margins": 0.0006672415183857083, "rewards/rejected": -0.004246157594025135, "step": 2120 }, { "epoch": 0.21552160275220075, "grad_norm": 357626.17202431586, "learning_rate": 4.3585563301101866e-07, "logits/chosen": -4.5203728675842285, "logits/rejected": -4.500479698181152, "logps/chosen": -99.31680297851562, "logps/rejected": -77.39408111572266, "loss": 123851.375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00443462235853076, "rewards/margins": 0.0006322901463136077, "rewards/rejected": -0.005066913086920977, "step": 2130 }, { "epoch": 0.21653344126277446, "grad_norm": 193481.78414073476, "learning_rate": 4.3529345626264895e-07, "logits/chosen": -4.567302703857422, "logits/rejected": -4.553609848022461, "logps/chosen": -92.27445220947266, "logps/rejected": -114.75386047363281, "loss": 121411.8625, "rewards/accuracies": 0.5, "rewards/chosen": -0.004847072064876556, "rewards/margins": 0.0007584112463518977, "rewards/rejected": -0.005605483427643776, "step": 2140 }, { "epoch": 0.21754527977334817, "grad_norm": 218838.41219377978, "learning_rate": 4.347312795142793e-07, "logits/chosen": -3.845783233642578, "logits/rejected": -3.92968487739563, "logps/chosen": -62.03050994873047, "logps/rejected": -78.83201599121094, "loss": 121598.7875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005309219937771559, "rewards/margins": 0.0005388358258642256, "rewards/rejected": -0.005848056171089411, "step": 2150 }, { "epoch": 0.21855711828392188, "grad_norm": 295452.5542291759, "learning_rate": 4.341691027659096e-07, "logits/chosen": -4.565284252166748, "logits/rejected": -4.527190208435059, "logps/chosen": -111.7362060546875, "logps/rejected": -99.16593170166016, "loss": 117515.725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004125756211578846, "rewards/margins": 0.0007318585412576795, "rewards/rejected": -0.004857615102082491, "step": 2160 }, { "epoch": 0.2195689567944956, "grad_norm": 160617.7532688143, "learning_rate": 4.336069260175399e-07, "logits/chosen": -3.9262804985046387, "logits/rejected": -3.736189603805542, "logps/chosen": -113.66102600097656, "logps/rejected": -126.5133056640625, "loss": 115710.1375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0051412214525043964, "rewards/margins": 0.0010332257952541113, "rewards/rejected": -0.006174446549266577, "step": 2170 }, { "epoch": 0.2205807953050693, "grad_norm": 271217.7399536494, "learning_rate": 4.330447492691702e-07, "logits/chosen": -3.8329110145568848, "logits/rejected": -3.886263608932495, "logps/chosen": -87.99798583984375, "logps/rejected": -93.63317108154297, "loss": 119885.5, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.005205837544053793, "rewards/margins": 6.804079748690128e-05, "rewards/rejected": -0.005273878574371338, "step": 2180 }, { "epoch": 0.22159263381564304, "grad_norm": 210597.85039481558, "learning_rate": 4.324825725208005e-07, "logits/chosen": -3.8782620429992676, "logits/rejected": -3.8563759326934814, "logps/chosen": -53.061622619628906, "logps/rejected": -42.57799530029297, "loss": 95873.6062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0034923977218568325, "rewards/margins": 0.0005313722649589181, "rewards/rejected": -0.004023769870400429, "step": 2190 }, { "epoch": 0.22260447232621675, "grad_norm": 149252.95260806332, "learning_rate": 4.3192039577243086e-07, "logits/chosen": -3.9426283836364746, "logits/rejected": -3.8820881843566895, "logps/chosen": -94.60540008544922, "logps/rejected": -99.02853393554688, "loss": 129587.4625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005380167625844479, "rewards/margins": 0.0010467895772308111, "rewards/rejected": -0.006426957435905933, "step": 2200 }, { "epoch": 0.22361631083679046, "grad_norm": 252183.7419307713, "learning_rate": 4.3135821902406115e-07, "logits/chosen": -3.888875961303711, "logits/rejected": -3.685209274291992, "logps/chosen": -91.58513641357422, "logps/rejected": -86.85105895996094, "loss": 128561.35, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004170949570834637, "rewards/margins": 0.0033491221256554127, "rewards/rejected": -0.007520071230828762, "step": 2210 }, { "epoch": 0.22462814934736416, "grad_norm": 140635.1654088792, "learning_rate": 4.3079604227569145e-07, "logits/chosen": -4.007391929626465, "logits/rejected": -4.006250858306885, "logps/chosen": -87.653564453125, "logps/rejected": -104.83817291259766, "loss": 128723.8375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005467183887958527, "rewards/margins": 0.0009568767854943871, "rewards/rejected": -0.006424060557037592, "step": 2220 }, { "epoch": 0.22563998785793787, "grad_norm": 252490.00452484944, "learning_rate": 4.302338655273218e-07, "logits/chosen": -4.515185356140137, "logits/rejected": -4.402767181396484, "logps/chosen": -57.08858108520508, "logps/rejected": -75.56786346435547, "loss": 122040.1125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005372638814151287, "rewards/margins": 0.00021568030933849514, "rewards/rejected": -0.005588319152593613, "step": 2230 }, { "epoch": 0.22665182636851158, "grad_norm": 191336.61828315927, "learning_rate": 4.296716887789521e-07, "logits/chosen": -4.307282447814941, "logits/rejected": -4.321542263031006, "logps/chosen": -109.3209457397461, "logps/rejected": -112.74066162109375, "loss": 120663.7375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005548016168177128, "rewards/margins": 0.0010318085551261902, "rewards/rejected": -0.006579824723303318, "step": 2240 }, { "epoch": 0.2276636648790853, "grad_norm": 189673.168989855, "learning_rate": 4.291095120305824e-07, "logits/chosen": -4.29086971282959, "logits/rejected": -4.28035306930542, "logps/chosen": -143.39938354492188, "logps/rejected": -125.5017318725586, "loss": 122833.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0048203920014202595, "rewards/margins": 0.0005580293945968151, "rewards/rejected": -0.005378420930355787, "step": 2250 }, { "epoch": 0.228675503389659, "grad_norm": 487812.02131885063, "learning_rate": 4.285473352822127e-07, "logits/chosen": -3.903803586959839, "logits/rejected": -4.002103805541992, "logps/chosen": -101.29747772216797, "logps/rejected": -97.85029602050781, "loss": 115725.725, "rewards/accuracies": 0.75, "rewards/chosen": -0.004180229734629393, "rewards/margins": 0.0014212516834959388, "rewards/rejected": -0.005601480603218079, "step": 2260 }, { "epoch": 0.2296873419002327, "grad_norm": 280920.60303208284, "learning_rate": 4.27985158533843e-07, "logits/chosen": -4.18912410736084, "logits/rejected": -4.229294776916504, "logps/chosen": -121.73690032958984, "logps/rejected": -127.05914306640625, "loss": 125950.9125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.005621919874101877, "rewards/margins": -0.0005368964048102498, "rewards/rejected": -0.005085023120045662, "step": 2270 }, { "epoch": 0.23069918041080645, "grad_norm": 195761.41317251124, "learning_rate": 4.274229817854733e-07, "logits/chosen": -4.34868860244751, "logits/rejected": -4.269271373748779, "logps/chosen": -91.50044250488281, "logps/rejected": -93.80191802978516, "loss": 125958.6375, "rewards/accuracies": 0.5, "rewards/chosen": -0.004822305403649807, "rewards/margins": -6.04618908255361e-05, "rewards/rejected": -0.0047618430107831955, "step": 2280 }, { "epoch": 0.23171101892138016, "grad_norm": 125741.40982499786, "learning_rate": 4.2686080503710365e-07, "logits/chosen": -3.6477770805358887, "logits/rejected": -3.6584060192108154, "logps/chosen": -101.27046966552734, "logps/rejected": -96.59112548828125, "loss": 109036.25, "rewards/accuracies": 0.5, "rewards/chosen": -0.003919287584722042, "rewards/margins": 0.0004901940701529384, "rewards/rejected": -0.004409481771290302, "step": 2290 }, { "epoch": 0.23272285743195387, "grad_norm": 248539.09249414952, "learning_rate": 4.26298628288734e-07, "logits/chosen": -4.322298526763916, "logits/rejected": -4.302892684936523, "logps/chosen": -71.04796600341797, "logps/rejected": -106.3181381225586, "loss": 116029.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004150839522480965, "rewards/margins": 0.0007650136249139905, "rewards/rejected": -0.004915853496640921, "step": 2300 }, { "epoch": 0.23373469594252758, "grad_norm": 269869.04879164265, "learning_rate": 4.2573645154036423e-07, "logits/chosen": -4.011407375335693, "logits/rejected": -3.8971850872039795, "logps/chosen": -103.6158218383789, "logps/rejected": -148.23753356933594, "loss": 115188.7125, "rewards/accuracies": 0.75, "rewards/chosen": -0.004681793507188559, "rewards/margins": 0.0016157161444425583, "rewards/rejected": -0.006297510117292404, "step": 2310 }, { "epoch": 0.23474653445310129, "grad_norm": 227935.72331138715, "learning_rate": 4.251742747919946e-07, "logits/chosen": -4.269082546234131, "logits/rejected": -4.400497913360596, "logps/chosen": -48.705013275146484, "logps/rejected": -69.49369049072266, "loss": 114954.3625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003710730466991663, "rewards/margins": 0.00105691805947572, "rewards/rejected": -0.004767647944390774, "step": 2320 }, { "epoch": 0.235758372963675, "grad_norm": 249001.71059297142, "learning_rate": 4.246120980436249e-07, "logits/chosen": -3.8870880603790283, "logits/rejected": -3.960411787033081, "logps/chosen": -58.8763542175293, "logps/rejected": -111.39410400390625, "loss": 116615.0125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004866303410381079, "rewards/margins": 0.0020912562031298876, "rewards/rejected": -0.00695755984634161, "step": 2330 }, { "epoch": 0.2367702114742487, "grad_norm": 535206.7182252966, "learning_rate": 4.2404992129525516e-07, "logits/chosen": -4.1434783935546875, "logits/rejected": -4.162901878356934, "logps/chosen": -149.44241333007812, "logps/rejected": -167.46621704101562, "loss": 118083.75, "rewards/accuracies": 0.5, "rewards/chosen": -0.005842931102961302, "rewards/margins": 0.0008505303412675858, "rewards/rejected": -0.0066934614442288876, "step": 2340 }, { "epoch": 0.2377820499848224, "grad_norm": 256254.00770154162, "learning_rate": 4.234877445468855e-07, "logits/chosen": -4.321585178375244, "logits/rejected": -4.505631923675537, "logps/chosen": -93.20542907714844, "logps/rejected": -105.25077056884766, "loss": 118626.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004169521853327751, "rewards/margins": 0.0021848788019269705, "rewards/rejected": -0.006354401353746653, "step": 2350 }, { "epoch": 0.23879388849539612, "grad_norm": 281672.8371100282, "learning_rate": 4.2292556779851586e-07, "logits/chosen": -3.384584903717041, "logits/rejected": -3.430936098098755, "logps/chosen": -78.37525939941406, "logps/rejected": -126.27363586425781, "loss": 108431.8625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004434486851096153, "rewards/margins": 0.0012269311118870974, "rewards/rejected": -0.005661417730152607, "step": 2360 }, { "epoch": 0.23980572700596986, "grad_norm": 454930.53446776595, "learning_rate": 4.2236339105014615e-07, "logits/chosen": -3.5525002479553223, "logits/rejected": -3.4954819679260254, "logps/chosen": -330.58856201171875, "logps/rejected": -359.14508056640625, "loss": 126155.7625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004921243991702795, "rewards/margins": 0.001221759826876223, "rewards/rejected": -0.006143004167824984, "step": 2370 }, { "epoch": 0.24081756551654357, "grad_norm": 319427.4990017536, "learning_rate": 4.2180121430177644e-07, "logits/chosen": -3.9042885303497314, "logits/rejected": -3.8394227027893066, "logps/chosen": -109.74391174316406, "logps/rejected": -129.3977508544922, "loss": 117720.8875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004484993871301413, "rewards/margins": 0.0006752386689186096, "rewards/rejected": -0.005160232074558735, "step": 2380 }, { "epoch": 0.24182940402711728, "grad_norm": 242551.89721278875, "learning_rate": 4.212390375534068e-07, "logits/chosen": -4.149147987365723, "logits/rejected": -4.195471286773682, "logps/chosen": -77.7817611694336, "logps/rejected": -87.5733413696289, "loss": 125097.0875, "rewards/accuracies": 0.5, "rewards/chosen": -0.0041512493044137955, "rewards/margins": 0.0009974588174372911, "rewards/rejected": -0.005148708820343018, "step": 2390 }, { "epoch": 0.242841242537691, "grad_norm": 239915.02603792993, "learning_rate": 4.206768608050371e-07, "logits/chosen": -4.307778835296631, "logits/rejected": -4.3541717529296875, "logps/chosen": -71.07827758789062, "logps/rejected": -105.52192687988281, "loss": 134715.0375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005167326424270868, "rewards/margins": 0.001587193226441741, "rewards/rejected": -0.006754518952220678, "step": 2400 }, { "epoch": 0.2438530810482647, "grad_norm": 312461.7540227967, "learning_rate": 4.2011468405666737e-07, "logits/chosen": -4.049666404724121, "logits/rejected": -4.072889804840088, "logps/chosen": -60.415931701660156, "logps/rejected": -97.91028594970703, "loss": 111354.325, "rewards/accuracies": 0.75, "rewards/chosen": -0.0036996272392570972, "rewards/margins": 0.002598391380161047, "rewards/rejected": -0.0062980190850794315, "step": 2410 }, { "epoch": 0.2448649195588384, "grad_norm": 213208.30195597582, "learning_rate": 4.195525073082977e-07, "logits/chosen": -3.961641311645508, "logits/rejected": -3.7028307914733887, "logps/chosen": -55.05030059814453, "logps/rejected": -234.68783569335938, "loss": 99358.275, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005233920179307461, "rewards/margins": 0.0004369236121419817, "rewards/rejected": -0.00567084364593029, "step": 2420 }, { "epoch": 0.24587675806941212, "grad_norm": 329962.9740005669, "learning_rate": 4.18990330559928e-07, "logits/chosen": -3.7324554920196533, "logits/rejected": -3.701519727706909, "logps/chosen": -126.43896484375, "logps/rejected": -108.0107421875, "loss": 122848.4125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005350460298359394, "rewards/margins": 0.0005220606690272689, "rewards/rejected": -0.005872521083801985, "step": 2430 }, { "epoch": 0.24688859657998583, "grad_norm": 239869.83246210642, "learning_rate": 4.1842815381155835e-07, "logits/chosen": -4.0263848304748535, "logits/rejected": -4.242001056671143, "logps/chosen": -45.12281036376953, "logps/rejected": -110.39222717285156, "loss": 99570.3562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004296102561056614, "rewards/margins": 0.0020214994437992573, "rewards/rejected": -0.006317600607872009, "step": 2440 }, { "epoch": 0.24790043509055953, "grad_norm": 277557.5641697848, "learning_rate": 4.1786597706318864e-07, "logits/chosen": -4.250219821929932, "logits/rejected": -4.339365005493164, "logps/chosen": -106.50250244140625, "logps/rejected": -111.90193176269531, "loss": 132541.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005058504641056061, "rewards/margins": 0.0005139577551744878, "rewards/rejected": -0.0055724624544382095, "step": 2450 }, { "epoch": 0.24891227360113327, "grad_norm": 330590.32153841644, "learning_rate": 4.17303800314819e-07, "logits/chosen": -4.173534393310547, "logits/rejected": -3.8950576782226562, "logps/chosen": -80.00740051269531, "logps/rejected": -103.9118881225586, "loss": 124371.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005024563521146774, "rewards/margins": 0.0018501922022551298, "rewards/rejected": -0.006874756421893835, "step": 2460 }, { "epoch": 0.24992411211170698, "grad_norm": 124619.78071025328, "learning_rate": 4.167416235664493e-07, "logits/chosen": -3.645500659942627, "logits/rejected": -3.567645311355591, "logps/chosen": -76.82859802246094, "logps/rejected": -88.75694274902344, "loss": 103557.6125, "rewards/accuracies": 0.5, "rewards/chosen": -0.005730198230594397, "rewards/margins": 0.00042809624574147165, "rewards/rejected": -0.006158295087516308, "step": 2470 }, { "epoch": 0.2509359506222807, "grad_norm": 424955.1809826657, "learning_rate": 4.161794468180796e-07, "logits/chosen": -4.403204917907715, "logits/rejected": -4.276141166687012, "logps/chosen": -82.9153823852539, "logps/rejected": -116.17752838134766, "loss": 116607.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005769700743257999, "rewards/margins": 0.0020983137656003237, "rewards/rejected": -0.00786801427602768, "step": 2480 }, { "epoch": 0.25194778913285437, "grad_norm": 417014.2094698295, "learning_rate": 4.156172700697099e-07, "logits/chosen": -4.440662860870361, "logits/rejected": -4.252708435058594, "logps/chosen": -103.6407470703125, "logps/rejected": -112.54434967041016, "loss": 118614.4125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0070287748239934444, "rewards/margins": 0.0008007950964383781, "rewards/rejected": -0.007829570211470127, "step": 2490 }, { "epoch": 0.2529596276434281, "grad_norm": 329014.0401181709, "learning_rate": 4.150550933213402e-07, "logits/chosen": -4.290779113769531, "logits/rejected": -4.215601444244385, "logps/chosen": -93.05828857421875, "logps/rejected": -102.99113464355469, "loss": 135991.15, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004868585616350174, "rewards/margins": 0.001451080315746367, "rewards/rejected": -0.006319665815681219, "step": 2500 }, { "epoch": 0.25397146615400185, "grad_norm": 228988.5038533061, "learning_rate": 4.144929165729705e-07, "logits/chosen": -4.488986015319824, "logits/rejected": -4.633746147155762, "logps/chosen": -90.9441909790039, "logps/rejected": -93.44641876220703, "loss": 103125.4375, "rewards/accuracies": 0.5, "rewards/chosen": -0.004904269240796566, "rewards/margins": 0.0010938772466033697, "rewards/rejected": -0.005998146254569292, "step": 2510 }, { "epoch": 0.2549833046645755, "grad_norm": 265834.7936854152, "learning_rate": 4.1393073982460085e-07, "logits/chosen": -4.1303558349609375, "logits/rejected": -4.018924236297607, "logps/chosen": -91.94831085205078, "logps/rejected": -94.92243194580078, "loss": 121826.7125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005976690445095301, "rewards/margins": 0.0006163891521282494, "rewards/rejected": -0.006593079771846533, "step": 2520 }, { "epoch": 0.25599514317514926, "grad_norm": 355542.17305812467, "learning_rate": 4.1336856307623114e-07, "logits/chosen": -4.202084541320801, "logits/rejected": -4.285675525665283, "logps/chosen": -87.40703582763672, "logps/rejected": -102.72293853759766, "loss": 125755.85, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005657069385051727, "rewards/margins": 0.0019889110699295998, "rewards/rejected": -0.0076459795236587524, "step": 2530 }, { "epoch": 0.25700698168572295, "grad_norm": 402019.58372907987, "learning_rate": 4.128063863278615e-07, "logits/chosen": -4.250589370727539, "logits/rejected": -4.240958213806152, "logps/chosen": -112.7724380493164, "logps/rejected": -159.60247802734375, "loss": 135039.075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005499436054378748, "rewards/margins": 0.0015118387527763844, "rewards/rejected": -0.007011274807155132, "step": 2540 }, { "epoch": 0.2580188201962967, "grad_norm": 157904.43208188124, "learning_rate": 4.122442095794918e-07, "logits/chosen": -4.389643669128418, "logits/rejected": -4.423759937286377, "logps/chosen": -69.56539916992188, "logps/rejected": -94.60087585449219, "loss": 112742.7875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006060443818569183, "rewards/margins": 0.0011582353617995977, "rewards/rejected": -0.00721867848187685, "step": 2550 }, { "epoch": 0.25903065870687036, "grad_norm": 1506565.6719397434, "learning_rate": 4.1168203283112207e-07, "logits/chosen": -3.978386402130127, "logits/rejected": -4.117263317108154, "logps/chosen": -283.19061279296875, "logps/rejected": -315.6737060546875, "loss": 91609.4625, "rewards/accuracies": 0.5, "rewards/chosen": -0.005706795025616884, "rewards/margins": 0.0011518045794218779, "rewards/rejected": -0.0068585993722081184, "step": 2560 }, { "epoch": 0.2600424972174441, "grad_norm": 219680.99198232073, "learning_rate": 4.111198560827524e-07, "logits/chosen": -4.149661064147949, "logits/rejected": -4.18801212310791, "logps/chosen": -104.87120056152344, "logps/rejected": -88.0521011352539, "loss": 124892.9625, "rewards/accuracies": 0.5, "rewards/chosen": -0.007192754652351141, "rewards/margins": -0.00028708874015137553, "rewards/rejected": -0.006905666086822748, "step": 2570 }, { "epoch": 0.2610543357280178, "grad_norm": 177297.34365176596, "learning_rate": 4.105576793343827e-07, "logits/chosen": -4.387120723724365, "logits/rejected": -4.521063327789307, "logps/chosen": -89.09586334228516, "logps/rejected": -98.24824523925781, "loss": 114270.925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005158782936632633, "rewards/margins": 0.0031965405214577913, "rewards/rejected": -0.008355323225259781, "step": 2580 }, { "epoch": 0.2620661742385915, "grad_norm": 369258.7548032879, "learning_rate": 4.09995502586013e-07, "logits/chosen": -4.418820381164551, "logits/rejected": -4.497103214263916, "logps/chosen": -105.48272705078125, "logps/rejected": -101.08057403564453, "loss": 131912.4125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006161731667816639, "rewards/margins": 0.0004941770457662642, "rewards/rejected": -0.006655907724052668, "step": 2590 }, { "epoch": 0.26307801274916526, "grad_norm": 299702.6177984591, "learning_rate": 4.0943332583764335e-07, "logits/chosen": -4.136218070983887, "logits/rejected": -4.09130859375, "logps/chosen": -77.59257507324219, "logps/rejected": -92.10211944580078, "loss": 127533.325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0057595050893723965, "rewards/margins": 0.0007476548198610544, "rewards/rejected": -0.006507159676402807, "step": 2600 }, { "epoch": 0.26408985125973894, "grad_norm": 200545.77724885163, "learning_rate": 4.088711490892737e-07, "logits/chosen": -4.2305908203125, "logits/rejected": -4.10825252532959, "logps/chosen": -73.11328125, "logps/rejected": -87.08164978027344, "loss": 104928.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005477492697536945, "rewards/margins": 0.0025096230674535036, "rewards/rejected": -0.007987115532159805, "step": 2610 }, { "epoch": 0.2651016897703127, "grad_norm": 285079.4060311805, "learning_rate": 4.0830897234090393e-07, "logits/chosen": -4.225539207458496, "logits/rejected": -4.228917121887207, "logps/chosen": -94.39434814453125, "logps/rejected": -125.0582275390625, "loss": 108321.95, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005365004297345877, "rewards/margins": 0.0023889862932264805, "rewards/rejected": -0.007753990590572357, "step": 2620 }, { "epoch": 0.26611352828088636, "grad_norm": 266313.4559497035, "learning_rate": 4.077467955925343e-07, "logits/chosen": -4.279709339141846, "logits/rejected": -4.352081298828125, "logps/chosen": -50.2455940246582, "logps/rejected": -72.1024169921875, "loss": 116533.7375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0045697507448494434, "rewards/margins": 0.0015771437902003527, "rewards/rejected": -0.006146893836557865, "step": 2630 }, { "epoch": 0.2671253667914601, "grad_norm": 183000.2854422589, "learning_rate": 4.071846188441646e-07, "logits/chosen": -4.022402763366699, "logits/rejected": -4.0236616134643555, "logps/chosen": -88.97816467285156, "logps/rejected": -107.15156555175781, "loss": 119240.1, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0066880895756185055, "rewards/margins": -0.00019836782303173095, "rewards/rejected": -0.006489722523838282, "step": 2640 }, { "epoch": 0.2681372053020338, "grad_norm": 256018.76607036305, "learning_rate": 4.0662244209579486e-07, "logits/chosen": -4.092931270599365, "logits/rejected": -4.108860969543457, "logps/chosen": -91.12608337402344, "logps/rejected": -119.0372085571289, "loss": 115296.5875, "rewards/accuracies": 0.5, "rewards/chosen": -0.006537883076816797, "rewards/margins": 0.0013573484029620886, "rewards/rejected": -0.007895232178270817, "step": 2650 }, { "epoch": 0.2691490438126075, "grad_norm": 272241.8777105344, "learning_rate": 4.060602653474252e-07, "logits/chosen": -4.248331546783447, "logits/rejected": -4.27681827545166, "logps/chosen": -71.33721923828125, "logps/rejected": -84.22361755371094, "loss": 117060.975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005711617413908243, "rewards/margins": 0.0013602583203464746, "rewards/rejected": -0.0070718759670853615, "step": 2660 }, { "epoch": 0.2701608823231812, "grad_norm": 340480.5133695906, "learning_rate": 4.0549808859905555e-07, "logits/chosen": -4.294464588165283, "logits/rejected": -4.083519458770752, "logps/chosen": -86.90748596191406, "logps/rejected": -156.06874084472656, "loss": 116383.4125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005400433670729399, "rewards/margins": 0.0026751630939543247, "rewards/rejected": -0.008075596764683723, "step": 2670 }, { "epoch": 0.27117272083375493, "grad_norm": 343715.2453241215, "learning_rate": 4.0493591185068584e-07, "logits/chosen": -3.9231045246124268, "logits/rejected": -4.189379692077637, "logps/chosen": -113.33433532714844, "logps/rejected": -100.22477722167969, "loss": 120787.1125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004624345805495977, "rewards/margins": 0.0023917369544506073, "rewards/rejected": -0.007016082759946585, "step": 2680 }, { "epoch": 0.27218455934432867, "grad_norm": 316988.9057945129, "learning_rate": 4.0437373510231614e-07, "logits/chosen": -4.220592021942139, "logits/rejected": -4.138136386871338, "logps/chosen": -43.56897735595703, "logps/rejected": -74.30413818359375, "loss": 95539.3938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0048784250393509865, "rewards/margins": 0.0016222369158640504, "rewards/rejected": -0.006500661373138428, "step": 2690 }, { "epoch": 0.27319639785490235, "grad_norm": 317886.92117900547, "learning_rate": 4.038115583539465e-07, "logits/chosen": -4.21859884262085, "logits/rejected": -4.159200191497803, "logps/chosen": -90.20403289794922, "logps/rejected": -92.93489837646484, "loss": 119642.7875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004574848338961601, "rewards/margins": 0.0018031062791123986, "rewards/rejected": -0.006377955432981253, "step": 2700 }, { "epoch": 0.2742082363654761, "grad_norm": 397537.6565004374, "learning_rate": 4.032493816055768e-07, "logits/chosen": -4.497698783874512, "logits/rejected": -4.558823585510254, "logps/chosen": -91.39258575439453, "logps/rejected": -81.53842163085938, "loss": 109661.8, "rewards/accuracies": 0.5, "rewards/chosen": -0.005308427847921848, "rewards/margins": 0.00015488786448258907, "rewards/rejected": -0.005463315173983574, "step": 2710 }, { "epoch": 0.27522007487604977, "grad_norm": 130974.90977496028, "learning_rate": 4.0268720485720707e-07, "logits/chosen": -3.9454383850097656, "logits/rejected": -3.9639859199523926, "logps/chosen": -87.45436096191406, "logps/rejected": -87.42437744140625, "loss": 104441.6187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005743239540606737, "rewards/margins": 0.00040683423867449164, "rewards/rejected": -0.006150074303150177, "step": 2720 }, { "epoch": 0.2762319133866235, "grad_norm": 271989.8592627656, "learning_rate": 4.021250281088374e-07, "logits/chosen": -3.929703950881958, "logits/rejected": -3.8780341148376465, "logps/chosen": -84.32243347167969, "logps/rejected": -104.66678619384766, "loss": 106522.9875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006251279264688492, "rewards/margins": 0.001928055426105857, "rewards/rejected": -0.008179333992302418, "step": 2730 }, { "epoch": 0.2772437518971972, "grad_norm": 176404.20652093747, "learning_rate": 4.015628513604677e-07, "logits/chosen": -4.06400203704834, "logits/rejected": -4.109758377075195, "logps/chosen": -62.60185623168945, "logps/rejected": -75.8342056274414, "loss": 120739.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005690396763384342, "rewards/margins": 0.0019234681967645884, "rewards/rejected": -0.007613864727318287, "step": 2740 }, { "epoch": 0.2782555904077709, "grad_norm": 8675.94372188568, "learning_rate": 4.01000674612098e-07, "logits/chosen": -3.9262070655822754, "logits/rejected": -3.764385223388672, "logps/chosen": -125.70048522949219, "logps/rejected": -124.67744445800781, "loss": 114262.7875, "rewards/accuracies": 0.5, "rewards/chosen": -0.007175167091190815, "rewards/margins": 0.0003525557112880051, "rewards/rejected": -0.0075277225114405155, "step": 2750 }, { "epoch": 0.2792674289183446, "grad_norm": 173096.8655853289, "learning_rate": 4.0043849786372834e-07, "logits/chosen": -4.092230319976807, "logits/rejected": -4.1397576332092285, "logps/chosen": -72.81490325927734, "logps/rejected": -88.89212799072266, "loss": 130682.5125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0064132981933653355, "rewards/margins": -0.00034898793091997504, "rewards/rejected": -0.006064309738576412, "step": 2760 }, { "epoch": 0.28027926742891834, "grad_norm": 190120.5093426987, "learning_rate": 3.9987632111535863e-07, "logits/chosen": -4.308685779571533, "logits/rejected": -4.225769519805908, "logps/chosen": -90.99336242675781, "logps/rejected": -117.38420104980469, "loss": 124941.125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007797076366841793, "rewards/margins": 0.0018932241946458817, "rewards/rejected": -0.00969030149281025, "step": 2770 }, { "epoch": 0.2812911059394921, "grad_norm": 294965.61603752646, "learning_rate": 3.99314144366989e-07, "logits/chosen": -4.312330722808838, "logits/rejected": -4.248936653137207, "logps/chosen": -88.9434585571289, "logps/rejected": -106.7408218383789, "loss": 111875.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00642926013097167, "rewards/margins": 0.001118561252951622, "rewards/rejected": -0.007547820918262005, "step": 2780 }, { "epoch": 0.28230294445006576, "grad_norm": 336219.59482749016, "learning_rate": 3.9875196761861927e-07, "logits/chosen": -4.062894344329834, "logits/rejected": -4.065415382385254, "logps/chosen": -58.777435302734375, "logps/rejected": -89.47084045410156, "loss": 110997.7375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0045766751281917095, "rewards/margins": 0.0020377091132104397, "rewards/rejected": -0.006614384241402149, "step": 2790 }, { "epoch": 0.2833147829606395, "grad_norm": 360538.6195312993, "learning_rate": 3.9818979087024956e-07, "logits/chosen": -3.981794834136963, "logits/rejected": -3.8554999828338623, "logps/chosen": -81.1152572631836, "logps/rejected": -113.17411804199219, "loss": 112528.475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036605396308004856, "rewards/margins": 0.0024841083213686943, "rewards/rejected": -0.006144647486507893, "step": 2800 }, { "epoch": 0.2843266214712132, "grad_norm": 180784.71484152527, "learning_rate": 3.976276141218799e-07, "logits/chosen": -4.259577751159668, "logits/rejected": -4.024374961853027, "logps/chosen": -53.4041862487793, "logps/rejected": -101.00712585449219, "loss": 110003.8375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00484355166554451, "rewards/margins": 0.002806438598781824, "rewards/rejected": -0.007649990729987621, "step": 2810 }, { "epoch": 0.2853384599817869, "grad_norm": 226980.9683663872, "learning_rate": 3.970654373735102e-07, "logits/chosen": -3.727785110473633, "logits/rejected": -3.71907114982605, "logps/chosen": -95.5955810546875, "logps/rejected": -113.94929504394531, "loss": 116001.1375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004467967431992292, "rewards/margins": 0.0021979615557938814, "rewards/rejected": -0.0066659292206168175, "step": 2820 }, { "epoch": 0.2863502984923606, "grad_norm": 247958.254812009, "learning_rate": 3.965032606251405e-07, "logits/chosen": -4.268080711364746, "logits/rejected": -4.249407768249512, "logps/chosen": -79.71212005615234, "logps/rejected": -82.0971908569336, "loss": 111222.325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005452226381748915, "rewards/margins": -0.00039922440191730857, "rewards/rejected": -0.005053001921623945, "step": 2830 }, { "epoch": 0.28736213700293434, "grad_norm": 395654.44958327303, "learning_rate": 3.9594108387677084e-07, "logits/chosen": -4.009488105773926, "logits/rejected": -3.9110500812530518, "logps/chosen": -74.55033111572266, "logps/rejected": -106.54325866699219, "loss": 112449.5375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005144452676177025, "rewards/margins": 0.001429051742888987, "rewards/rejected": -0.00657350430265069, "step": 2840 }, { "epoch": 0.288373975513508, "grad_norm": 293646.57573100814, "learning_rate": 3.953789071284012e-07, "logits/chosen": -4.283346176147461, "logits/rejected": -4.335648536682129, "logps/chosen": -111.4189453125, "logps/rejected": -126.66926574707031, "loss": 114910.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006310665514320135, "rewards/margins": 0.0009458738495595753, "rewards/rejected": -0.007256539072841406, "step": 2850 }, { "epoch": 0.28938581402408176, "grad_norm": 248527.55270247484, "learning_rate": 3.948167303800314e-07, "logits/chosen": -3.922356128692627, "logits/rejected": -3.9619593620300293, "logps/chosen": -66.01215362548828, "logps/rejected": -85.35952758789062, "loss": 110010.7125, "rewards/accuracies": 0.5, "rewards/chosen": -0.005364900454878807, "rewards/margins": 0.0008888916927389801, "rewards/rejected": -0.006253792438656092, "step": 2860 }, { "epoch": 0.2903976525346555, "grad_norm": 257290.3208084374, "learning_rate": 3.9425455363166177e-07, "logits/chosen": -4.2704758644104, "logits/rejected": -4.19860315322876, "logps/chosen": -115.43266296386719, "logps/rejected": -73.44981384277344, "loss": 105225.3125, "rewards/accuracies": 0.5, "rewards/chosen": -0.004963643848896027, "rewards/margins": 0.0019291483331471682, "rewards/rejected": -0.006892792880535126, "step": 2870 }, { "epoch": 0.2914094910452292, "grad_norm": 459587.15484447655, "learning_rate": 3.936923768832921e-07, "logits/chosen": -3.9880542755126953, "logits/rejected": -4.0737080574035645, "logps/chosen": -135.56005859375, "logps/rejected": -107.9188461303711, "loss": 121644.3, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007809540722519159, "rewards/margins": 0.00039221401675604284, "rewards/rejected": -0.00820175465196371, "step": 2880 }, { "epoch": 0.2924213295558029, "grad_norm": 402102.1419841082, "learning_rate": 3.931302001349224e-07, "logits/chosen": -4.105749607086182, "logits/rejected": -4.244586944580078, "logps/chosen": -77.04512786865234, "logps/rejected": -92.55188751220703, "loss": 114565.8375, "rewards/accuracies": 0.75, "rewards/chosen": -0.005024910904467106, "rewards/margins": 0.0009497810970060527, "rewards/rejected": -0.005974692292511463, "step": 2890 }, { "epoch": 0.2934331680663766, "grad_norm": 311484.86627027596, "learning_rate": 3.925680233865527e-07, "logits/chosen": -4.084427833557129, "logits/rejected": -4.26537561416626, "logps/chosen": -76.62614440917969, "logps/rejected": -90.86482238769531, "loss": 125713.375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.004594235215336084, "rewards/margins": 0.002588392933830619, "rewards/rejected": -0.007182628847658634, "step": 2900 }, { "epoch": 0.29444500657695033, "grad_norm": 307469.5266300359, "learning_rate": 3.9200584663818304e-07, "logits/chosen": -4.238560676574707, "logits/rejected": -4.172345161437988, "logps/chosen": -60.80973434448242, "logps/rejected": -121.15556335449219, "loss": 125522.9875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005793160758912563, "rewards/margins": 0.0018166534136980772, "rewards/rejected": -0.007609814405441284, "step": 2910 }, { "epoch": 0.295456845087524, "grad_norm": 262730.3481160407, "learning_rate": 3.914436698898134e-07, "logits/chosen": -4.1864118576049805, "logits/rejected": -3.918598175048828, "logps/chosen": -91.04273986816406, "logps/rejected": -104.7997055053711, "loss": 137500.55, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0059900893829762936, "rewards/margins": 0.0011536881793290377, "rewards/rejected": -0.007143777795135975, "step": 2920 }, { "epoch": 0.29646868359809775, "grad_norm": 129502.62233418788, "learning_rate": 3.9088149314144363e-07, "logits/chosen": -4.087029457092285, "logits/rejected": -3.8957839012145996, "logps/chosen": -94.52580261230469, "logps/rejected": -138.2090606689453, "loss": 109751.55, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0071629807353019714, "rewards/margins": 0.002236114116385579, "rewards/rejected": -0.009399095550179482, "step": 2930 }, { "epoch": 0.29748052210867143, "grad_norm": 183931.80847424938, "learning_rate": 3.90319316393074e-07, "logits/chosen": -4.153307914733887, "logits/rejected": -4.315186023712158, "logps/chosen": -114.782470703125, "logps/rejected": -127.20556640625, "loss": 109780.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00645861541852355, "rewards/margins": 0.0020314487628638744, "rewards/rejected": -0.008490064181387424, "step": 2940 }, { "epoch": 0.29849236061924517, "grad_norm": 194363.12748196317, "learning_rate": 3.897571396447043e-07, "logits/chosen": -4.007717609405518, "logits/rejected": -3.9106993675231934, "logps/chosen": -75.82987213134766, "logps/rejected": -101.40800476074219, "loss": 121299.55, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0067690410651266575, "rewards/margins": 0.0007348139770328999, "rewards/rejected": -0.007503855042159557, "step": 2950 }, { "epoch": 0.2995041991298189, "grad_norm": 222744.8785368009, "learning_rate": 3.8919496289633456e-07, "logits/chosen": -4.341574668884277, "logits/rejected": -4.385587692260742, "logps/chosen": -134.70396423339844, "logps/rejected": -145.31382751464844, "loss": 131295.125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0046021416783332825, "rewards/margins": 0.0021115713752806187, "rewards/rejected": -0.0067137121222913265, "step": 2960 }, { "epoch": 0.3005160376403926, "grad_norm": 350009.8070685225, "learning_rate": 3.886327861479649e-07, "logits/chosen": -4.438260078430176, "logits/rejected": -4.290699481964111, "logps/chosen": -106.2628402709961, "logps/rejected": -122.4967041015625, "loss": 127065.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.005041769240051508, "rewards/margins": 0.0030885532032698393, "rewards/rejected": -0.008130323141813278, "step": 2970 }, { "epoch": 0.3015278761509663, "grad_norm": 339429.29322951834, "learning_rate": 3.8807060939959525e-07, "logits/chosen": -4.365471839904785, "logits/rejected": -4.392399787902832, "logps/chosen": -89.18717956542969, "logps/rejected": -91.52629089355469, "loss": 127380.65, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0058120181784033775, "rewards/margins": 0.0009214317542500794, "rewards/rejected": -0.00673345010727644, "step": 2980 }, { "epoch": 0.30253971466154, "grad_norm": 356517.57545252336, "learning_rate": 3.875084326512255e-07, "logits/chosen": -4.283871650695801, "logits/rejected": -4.216443061828613, "logps/chosen": -83.66820526123047, "logps/rejected": -68.4471435546875, "loss": 118079.5, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004215974826365709, "rewards/margins": 0.0004512976738624275, "rewards/rejected": -0.004667272325605154, "step": 2990 }, { "epoch": 0.30355155317211374, "grad_norm": 157526.09162602856, "learning_rate": 3.8694625590285583e-07, "logits/chosen": -4.035764694213867, "logits/rejected": -4.036248683929443, "logps/chosen": -53.127342224121094, "logps/rejected": -109.90873718261719, "loss": 114574.0, "rewards/accuracies": 0.75, "rewards/chosen": -0.004577045328915119, "rewards/margins": 0.0024952206294983625, "rewards/rejected": -0.0070722671225667, "step": 3000 }, { "epoch": 0.3045633916826874, "grad_norm": 273311.6446838727, "learning_rate": 3.863840791544862e-07, "logits/chosen": -4.0425801277160645, "logits/rejected": -4.142579078674316, "logps/chosen": -72.18057250976562, "logps/rejected": -65.29150390625, "loss": 111094.55, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006506604608148336, "rewards/margins": 0.0004208019527141005, "rewards/rejected": -0.006927406881004572, "step": 3010 }, { "epoch": 0.30557523019326116, "grad_norm": 182971.61563211313, "learning_rate": 3.8582190240611647e-07, "logits/chosen": -4.273650169372559, "logits/rejected": -4.302116870880127, "logps/chosen": -82.27653503417969, "logps/rejected": -112.18190002441406, "loss": 120043.2, "rewards/accuracies": 0.75, "rewards/chosen": -0.006409953348338604, "rewards/margins": 0.00281868246383965, "rewards/rejected": -0.009228634648025036, "step": 3020 }, { "epoch": 0.30658706870383484, "grad_norm": 198955.43214668307, "learning_rate": 3.8525972565774676e-07, "logits/chosen": -4.416872978210449, "logits/rejected": -4.51355504989624, "logps/chosen": -126.01815032958984, "logps/rejected": -109.32572937011719, "loss": 111979.7375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007286591921001673, "rewards/margins": 0.001105672214180231, "rewards/rejected": -0.008392264135181904, "step": 3030 }, { "epoch": 0.3075989072144086, "grad_norm": 286286.4034419298, "learning_rate": 3.846975489093771e-07, "logits/chosen": -4.305948734283447, "logits/rejected": -4.315639495849609, "logps/chosen": -123.84049224853516, "logps/rejected": -126.29598236083984, "loss": 122365.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005814682226628065, "rewards/margins": 0.0006677469937130809, "rewards/rejected": -0.006482429802417755, "step": 3040 }, { "epoch": 0.3086107457249823, "grad_norm": 276286.926757737, "learning_rate": 3.841353721610074e-07, "logits/chosen": -4.397176265716553, "logits/rejected": -4.4268035888671875, "logps/chosen": -48.70510482788086, "logps/rejected": -69.10664367675781, "loss": 104066.2188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005240567959845066, "rewards/margins": 0.0016911148559302092, "rewards/rejected": -0.006931682582944632, "step": 3050 }, { "epoch": 0.309622584235556, "grad_norm": 383116.5413087396, "learning_rate": 3.835731954126377e-07, "logits/chosen": -4.02132511138916, "logits/rejected": -4.170271873474121, "logps/chosen": -84.01457214355469, "logps/rejected": -110.21614837646484, "loss": 89904.2437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004102353937923908, "rewards/margins": 0.002352704294025898, "rewards/rejected": -0.006455057766288519, "step": 3060 }, { "epoch": 0.31063442274612973, "grad_norm": 200095.07369524377, "learning_rate": 3.8301101866426804e-07, "logits/chosen": -3.9820525646209717, "logits/rejected": -3.9972774982452393, "logps/chosen": -260.48602294921875, "logps/rejected": -287.3631896972656, "loss": 99067.475, "rewards/accuracies": 0.5, "rewards/chosen": -0.007182400673627853, "rewards/margins": 0.0008468286250717938, "rewards/rejected": -0.008029229938983917, "step": 3070 }, { "epoch": 0.3116462612567034, "grad_norm": 251804.0296751292, "learning_rate": 3.8244884191589833e-07, "logits/chosen": -4.127312660217285, "logits/rejected": -4.191464424133301, "logps/chosen": -74.554931640625, "logps/rejected": -96.6747055053711, "loss": 112565.8375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006717856973409653, "rewards/margins": 0.0005706021329388022, "rewards/rejected": -0.0072884587571024895, "step": 3080 }, { "epoch": 0.31265809976727715, "grad_norm": 228851.90630766255, "learning_rate": 3.818866651675287e-07, "logits/chosen": -3.660602569580078, "logits/rejected": -3.830601215362549, "logps/chosen": -70.38997650146484, "logps/rejected": -89.13545227050781, "loss": 117350.775, "rewards/accuracies": 0.75, "rewards/chosen": -0.00583095196634531, "rewards/margins": 0.002813191618770361, "rewards/rejected": -0.008644143119454384, "step": 3090 }, { "epoch": 0.31366993827785083, "grad_norm": 220921.50798510556, "learning_rate": 3.8132448841915897e-07, "logits/chosen": -4.140199184417725, "logits/rejected": -4.035303115844727, "logps/chosen": -63.8117561340332, "logps/rejected": -89.61557006835938, "loss": 113655.65, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006749679334461689, "rewards/margins": 0.002202653791755438, "rewards/rejected": -0.00895233266055584, "step": 3100 }, { "epoch": 0.31468177678842457, "grad_norm": 298538.12815897714, "learning_rate": 3.8076231167078926e-07, "logits/chosen": -4.009060859680176, "logits/rejected": -4.023919105529785, "logps/chosen": -114.4379653930664, "logps/rejected": -113.3924789428711, "loss": 118457.0375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005896461196243763, "rewards/margins": 0.001482933177612722, "rewards/rejected": -0.007379394955933094, "step": 3110 }, { "epoch": 0.31569361529899825, "grad_norm": 419542.1805869224, "learning_rate": 3.802001349224196e-07, "logits/chosen": -4.130070686340332, "logits/rejected": -4.047537803649902, "logps/chosen": -89.31668853759766, "logps/rejected": -105.0811996459961, "loss": 113923.9, "rewards/accuracies": 0.5, "rewards/chosen": -0.007032717578113079, "rewards/margins": 0.0016100716311484575, "rewards/rejected": -0.008642788976430893, "step": 3120 }, { "epoch": 0.316705453809572, "grad_norm": 355441.21894517983, "learning_rate": 3.796379581740499e-07, "logits/chosen": -4.230761528015137, "logits/rejected": -4.185552597045898, "logps/chosen": -98.66509246826172, "logps/rejected": -120.5124282836914, "loss": 101833.5875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005945032928138971, "rewards/margins": 0.0022771377116441727, "rewards/rejected": -0.008222171105444431, "step": 3130 }, { "epoch": 0.3177172923201457, "grad_norm": 270675.77640620794, "learning_rate": 3.790757814256802e-07, "logits/chosen": -3.92470121383667, "logits/rejected": -3.928856372833252, "logps/chosen": -108.80257415771484, "logps/rejected": -135.78079223632812, "loss": 122699.7, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006593511905521154, "rewards/margins": 0.0031670243479311466, "rewards/rejected": -0.009760535322129726, "step": 3140 }, { "epoch": 0.3187291308307194, "grad_norm": 214919.32679636238, "learning_rate": 3.7851360467731054e-07, "logits/chosen": -4.10321569442749, "logits/rejected": -4.097891330718994, "logps/chosen": -53.287559509277344, "logps/rejected": -61.60056686401367, "loss": 95946.425, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.003402318339794874, "rewards/margins": 0.0010633741039782763, "rewards/rejected": -0.004465693142265081, "step": 3150 }, { "epoch": 0.31974096934129315, "grad_norm": 212284.29274049858, "learning_rate": 3.779514279289409e-07, "logits/chosen": -3.988734722137451, "logits/rejected": -3.9732985496520996, "logps/chosen": -87.38238525390625, "logps/rejected": -104.5042495727539, "loss": 124485.25, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006758561823517084, "rewards/margins": 0.0011026656720787287, "rewards/rejected": -0.007861226797103882, "step": 3160 }, { "epoch": 0.32075280785186683, "grad_norm": 324930.79076033615, "learning_rate": 3.773892511805711e-07, "logits/chosen": -4.026372909545898, "logits/rejected": -3.952272415161133, "logps/chosen": -115.97798156738281, "logps/rejected": -142.42062377929688, "loss": 139614.7125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0081253070384264, "rewards/margins": 0.00265224720351398, "rewards/rejected": -0.010777552612125874, "step": 3170 }, { "epoch": 0.32176464636244057, "grad_norm": 189252.89335039997, "learning_rate": 3.7682707443220147e-07, "logits/chosen": -4.073111534118652, "logits/rejected": -3.961087465286255, "logps/chosen": -98.89385223388672, "logps/rejected": -130.2420654296875, "loss": 103169.1875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006896146573126316, "rewards/margins": 0.0008600965375080705, "rewards/rejected": -0.007756242994219065, "step": 3180 }, { "epoch": 0.32277648487301425, "grad_norm": 289430.11624960956, "learning_rate": 3.762648976838318e-07, "logits/chosen": -4.106657981872559, "logits/rejected": -4.14911413192749, "logps/chosen": -110.18965148925781, "logps/rejected": -138.95375061035156, "loss": 106863.15, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005918685346841812, "rewards/margins": 0.0011271049734205008, "rewards/rejected": -0.007045789621770382, "step": 3190 }, { "epoch": 0.323788323383588, "grad_norm": 237044.03395860663, "learning_rate": 3.7570272093546205e-07, "logits/chosen": -3.9573440551757812, "logits/rejected": -3.9778456687927246, "logps/chosen": -72.37495422363281, "logps/rejected": -76.92210388183594, "loss": 80310.5125, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.005998507142066956, "rewards/margins": 0.001244590850546956, "rewards/rejected": -0.007243098225444555, "step": 3200 }, { "epoch": 0.32480016189416167, "grad_norm": 172953.29955995383, "learning_rate": 3.751405441870924e-07, "logits/chosen": -4.677410125732422, "logits/rejected": -4.725589752197266, "logps/chosen": -59.458351135253906, "logps/rejected": -104.36271667480469, "loss": 114756.4625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00592556968331337, "rewards/margins": 0.0015740093076601624, "rewards/rejected": -0.007499578408896923, "step": 3210 }, { "epoch": 0.3258120004047354, "grad_norm": 217547.16018117988, "learning_rate": 3.7457836743872274e-07, "logits/chosen": -3.5794105529785156, "logits/rejected": -3.771486759185791, "logps/chosen": -102.8990478515625, "logps/rejected": -93.38089752197266, "loss": 120235.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006883515510708094, "rewards/margins": 0.0010748576605692506, "rewards/rejected": -0.007958373986184597, "step": 3220 }, { "epoch": 0.32682383891530914, "grad_norm": 326234.30785549566, "learning_rate": 3.74016190690353e-07, "logits/chosen": -4.086706161499023, "logits/rejected": -4.093437671661377, "logps/chosen": -81.30166625976562, "logps/rejected": -69.65373229980469, "loss": 116031.4625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0073194364085793495, "rewards/margins": -0.00013055077579338104, "rewards/rejected": -0.007188886404037476, "step": 3230 }, { "epoch": 0.3278356774258828, "grad_norm": 251689.00805481474, "learning_rate": 3.7345401394198333e-07, "logits/chosen": -4.2093915939331055, "logits/rejected": -4.261962890625, "logps/chosen": -88.76929473876953, "logps/rejected": -94.9545669555664, "loss": 122034.05, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.006986069492995739, "rewards/margins": 0.000744761957321316, "rewards/rejected": -0.007730831392109394, "step": 3240 }, { "epoch": 0.32884751593645656, "grad_norm": 254578.50979576592, "learning_rate": 3.7289183719361367e-07, "logits/chosen": -4.478003025054932, "logits/rejected": -4.426497459411621, "logps/chosen": -77.70759582519531, "logps/rejected": -94.90160369873047, "loss": 115403.7125, "rewards/accuracies": 0.5, "rewards/chosen": -0.0064351120963692665, "rewards/margins": -8.846633363646106e-07, "rewards/rejected": -0.006434228271245956, "step": 3250 }, { "epoch": 0.32985935444703024, "grad_norm": 150074.7482209239, "learning_rate": 3.7232966044524396e-07, "logits/chosen": -4.311535358428955, "logits/rejected": -4.52819299697876, "logps/chosen": -95.95474243164062, "logps/rejected": -99.40364074707031, "loss": 121250.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.006602225359529257, "rewards/margins": 0.0007590622408315539, "rewards/rejected": -0.00736128818243742, "step": 3260 }, { "epoch": 0.330871192957604, "grad_norm": 227881.94970852608, "learning_rate": 3.7176748369687426e-07, "logits/chosen": -3.9379782676696777, "logits/rejected": -3.926548480987549, "logps/chosen": -106.3033676147461, "logps/rejected": -110.21146392822266, "loss": 118252.525, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005675940774381161, "rewards/margins": 0.0017482424154877663, "rewards/rejected": -0.007424182258546352, "step": 3270 }, { "epoch": 0.33188303146817766, "grad_norm": 342283.87690701697, "learning_rate": 3.712053069485046e-07, "logits/chosen": -3.558351516723633, "logits/rejected": -3.683872938156128, "logps/chosen": -76.26903533935547, "logps/rejected": -86.7305679321289, "loss": 100613.175, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005514861084520817, "rewards/margins": 0.002069278620183468, "rewards/rejected": -0.007584139704704285, "step": 3280 }, { "epoch": 0.3328948699787514, "grad_norm": 250813.67324160898, "learning_rate": 3.706431302001349e-07, "logits/chosen": -4.1661577224731445, "logits/rejected": -4.036975860595703, "logps/chosen": -80.90918731689453, "logps/rejected": -99.37959289550781, "loss": 121095.1125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005203405395150185, "rewards/margins": 0.003987113479524851, "rewards/rejected": -0.009190520271658897, "step": 3290 }, { "epoch": 0.3339067084893251, "grad_norm": 235941.6861695934, "learning_rate": 3.700809534517652e-07, "logits/chosen": -4.239041805267334, "logits/rejected": -4.299758434295654, "logps/chosen": -85.06680297851562, "logps/rejected": -111.45206451416016, "loss": 109791.1375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0072499909438192844, "rewards/margins": 0.0018023535376414657, "rewards/rejected": -0.009052345529198647, "step": 3300 }, { "epoch": 0.3349185469998988, "grad_norm": 123127.81317176559, "learning_rate": 3.6951877670339553e-07, "logits/chosen": -4.219114303588867, "logits/rejected": -4.240973472595215, "logps/chosen": -95.42967224121094, "logps/rejected": -155.89004516601562, "loss": 116379.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006561583373695612, "rewards/margins": 0.0057620154693722725, "rewards/rejected": -0.012323597446084023, "step": 3310 }, { "epoch": 0.33593038551047255, "grad_norm": 573701.0667943398, "learning_rate": 3.689565999550258e-07, "logits/chosen": -4.059613227844238, "logits/rejected": -4.117576599121094, "logps/chosen": -120.52131652832031, "logps/rejected": -82.07005310058594, "loss": 123421.775, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.006678902544081211, "rewards/margins": 0.0008506158483214676, "rewards/rejected": -0.0075295185670256615, "step": 3320 }, { "epoch": 0.33694222402104623, "grad_norm": 245481.19619146737, "learning_rate": 3.6839442320665617e-07, "logits/chosen": -3.965231418609619, "logits/rejected": -4.116226673126221, "logps/chosen": -73.38761901855469, "logps/rejected": -93.57749938964844, "loss": 131883.65, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00557626411318779, "rewards/margins": 0.0012302454560995102, "rewards/rejected": -0.006806509103626013, "step": 3330 }, { "epoch": 0.33795406253161997, "grad_norm": 290861.52730400785, "learning_rate": 3.6783224645828646e-07, "logits/chosen": -4.4826340675354, "logits/rejected": -4.590544700622559, "logps/chosen": -99.5339584350586, "logps/rejected": -93.59089660644531, "loss": 124682.125, "rewards/accuracies": 0.75, "rewards/chosen": -0.0073218620382249355, "rewards/margins": 0.0018259059870615602, "rewards/rejected": -0.0091477669775486, "step": 3340 }, { "epoch": 0.33896590104219365, "grad_norm": 726624.7226998652, "learning_rate": 3.672700697099168e-07, "logits/chosen": -4.237706184387207, "logits/rejected": -4.382741928100586, "logps/chosen": -95.07945251464844, "logps/rejected": -72.81485748291016, "loss": 105994.475, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006763645447790623, "rewards/margins": 0.0016684377333149314, "rewards/rejected": -0.008432083763182163, "step": 3350 }, { "epoch": 0.3399777395527674, "grad_norm": 278849.89319792564, "learning_rate": 3.667078929615471e-07, "logits/chosen": -4.096681118011475, "logits/rejected": -3.893906831741333, "logps/chosen": -61.588172912597656, "logps/rejected": -75.87727355957031, "loss": 113826.0375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006742383353412151, "rewards/margins": 0.0007323737954720855, "rewards/rejected": -0.007474757730960846, "step": 3360 }, { "epoch": 0.34098957806334107, "grad_norm": 260390.20995120515, "learning_rate": 3.661457162131774e-07, "logits/chosen": -4.117178440093994, "logits/rejected": -4.273658275604248, "logps/chosen": -65.06780242919922, "logps/rejected": -80.27439880371094, "loss": 124593.4875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006201864220201969, "rewards/margins": 0.0022074447479099035, "rewards/rejected": -0.008409308269619942, "step": 3370 }, { "epoch": 0.3420014165739148, "grad_norm": 362560.8056163196, "learning_rate": 3.6558353946480774e-07, "logits/chosen": -4.074151992797852, "logits/rejected": -3.9598050117492676, "logps/chosen": -95.54754638671875, "logps/rejected": -152.28286743164062, "loss": 133276.5875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005799169652163982, "rewards/margins": 0.003716459032148123, "rewards/rejected": -0.009515629149973392, "step": 3380 }, { "epoch": 0.3430132550844885, "grad_norm": 54252.661787626006, "learning_rate": 3.6502136271643803e-07, "logits/chosen": -4.184952735900879, "logits/rejected": -4.232810974121094, "logps/chosen": -98.69471740722656, "logps/rejected": -119.82315826416016, "loss": 116995.7375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.004558411426842213, "rewards/margins": 0.0038907169364392757, "rewards/rejected": -0.008449128828942776, "step": 3390 }, { "epoch": 0.3440250935950622, "grad_norm": 296209.17021605, "learning_rate": 3.644591859680684e-07, "logits/chosen": -3.6960926055908203, "logits/rejected": -3.5796058177948, "logps/chosen": -92.38786315917969, "logps/rejected": -98.1763916015625, "loss": 121056.9375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0071410322561860085, "rewards/margins": 0.0006526632932946086, "rewards/rejected": -0.007793694734573364, "step": 3400 }, { "epoch": 0.34503693210563596, "grad_norm": 176843.04443359308, "learning_rate": 3.6389700921969867e-07, "logits/chosen": -3.6869194507598877, "logits/rejected": -3.6967639923095703, "logps/chosen": -92.87973022460938, "logps/rejected": -86.55528259277344, "loss": 129440.1875, "rewards/accuracies": 0.5, "rewards/chosen": -0.007740862667560577, "rewards/margins": 5.501345731317997e-05, "rewards/rejected": -0.007795876357704401, "step": 3410 }, { "epoch": 0.34604877061620964, "grad_norm": 177214.73245866186, "learning_rate": 3.6333483247132896e-07, "logits/chosen": -3.9619193077087402, "logits/rejected": -3.9006659984588623, "logps/chosen": -87.2044677734375, "logps/rejected": -95.22728729248047, "loss": 107936.125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006757957395166159, "rewards/margins": 0.0027157829608768225, "rewards/rejected": -0.009473741054534912, "step": 3420 }, { "epoch": 0.3470606091267834, "grad_norm": 1013290.9442891872, "learning_rate": 3.627726557229593e-07, "logits/chosen": -3.778398036956787, "logits/rejected": -3.7823004722595215, "logps/chosen": -142.75343322753906, "logps/rejected": -167.52073669433594, "loss": 129218.175, "rewards/accuracies": 0.75, "rewards/chosen": -0.005919320043176413, "rewards/margins": 0.004371718969196081, "rewards/rejected": -0.010291039012372494, "step": 3430 }, { "epoch": 0.34807244763735706, "grad_norm": 195089.81704221878, "learning_rate": 3.622104789745896e-07, "logits/chosen": -4.177172660827637, "logits/rejected": -4.199426174163818, "logps/chosen": -97.33732604980469, "logps/rejected": -85.13102722167969, "loss": 115007.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012291816994547844, "rewards/margins": -0.0011912209447473288, "rewards/rejected": -0.01110059767961502, "step": 3440 }, { "epoch": 0.3490842861479308, "grad_norm": 329545.42651436443, "learning_rate": 3.616483022262199e-07, "logits/chosen": -4.164553642272949, "logits/rejected": -4.007670879364014, "logps/chosen": -76.74851989746094, "logps/rejected": -105.27854919433594, "loss": 114836.25, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006111525930464268, "rewards/margins": 0.0031354662496596575, "rewards/rejected": -0.009246991947293282, "step": 3450 }, { "epoch": 0.3500961246585045, "grad_norm": 144380.31072826975, "learning_rate": 3.6108612547785023e-07, "logits/chosen": -3.9767043590545654, "logits/rejected": -3.884098768234253, "logps/chosen": -109.85722351074219, "logps/rejected": -111.21147155761719, "loss": 117343.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007830615155398846, "rewards/margins": 0.0004065916291438043, "rewards/rejected": -0.008237207308411598, "step": 3460 }, { "epoch": 0.3511079631690782, "grad_norm": 140140.85447891845, "learning_rate": 3.605239487294806e-07, "logits/chosen": -3.716268539428711, "logits/rejected": -3.470130443572998, "logps/chosen": -97.66951751708984, "logps/rejected": -91.12448120117188, "loss": 115490.25, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008458217605948448, "rewards/margins": 0.001727836555801332, "rewards/rejected": -0.010186055675148964, "step": 3470 }, { "epoch": 0.3521198016796519, "grad_norm": 143524.75736165832, "learning_rate": 3.599617719811108e-07, "logits/chosen": -4.329289436340332, "logits/rejected": -4.25441837310791, "logps/chosen": -76.98054504394531, "logps/rejected": -97.34771728515625, "loss": 116201.5875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006221697200089693, "rewards/margins": 0.001576925627887249, "rewards/rejected": -0.007798622362315655, "step": 3480 }, { "epoch": 0.35313164019022564, "grad_norm": 251215.4656970751, "learning_rate": 3.5939959523274116e-07, "logits/chosen": -4.3011698722839355, "logits/rejected": -4.320305824279785, "logps/chosen": -67.1611557006836, "logps/rejected": -95.34910583496094, "loss": 128492.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006993353366851807, "rewards/margins": 0.003351453226059675, "rewards/rejected": -0.010344806127250195, "step": 3490 }, { "epoch": 0.3541434787007994, "grad_norm": 472657.71353801806, "learning_rate": 3.588374184843715e-07, "logits/chosen": -4.23145055770874, "logits/rejected": -4.105343818664551, "logps/chosen": -64.99823760986328, "logps/rejected": -95.62611389160156, "loss": 117500.25, "rewards/accuracies": 0.5, "rewards/chosen": -0.006934764329344034, "rewards/margins": 0.0024632266722619534, "rewards/rejected": -0.009397991001605988, "step": 3500 }, { "epoch": 0.35515531721137306, "grad_norm": 201095.2536006453, "learning_rate": 3.5827524173600175e-07, "logits/chosen": -4.1215667724609375, "logits/rejected": -4.119564056396484, "logps/chosen": -93.80213165283203, "logps/rejected": -109.47325134277344, "loss": 123717.475, "rewards/accuracies": 0.75, "rewards/chosen": -0.006431470159441233, "rewards/margins": 0.002972402609884739, "rewards/rejected": -0.009403872303664684, "step": 3510 }, { "epoch": 0.3561671557219468, "grad_norm": 276887.7224744044, "learning_rate": 3.577130649876321e-07, "logits/chosen": -3.9861087799072266, "logits/rejected": -4.024430274963379, "logps/chosen": -95.04161834716797, "logps/rejected": -91.86715698242188, "loss": 125918.825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005659877322614193, "rewards/margins": 0.0025657557416707277, "rewards/rejected": -0.008225632831454277, "step": 3520 }, { "epoch": 0.3571789942325205, "grad_norm": 268464.4252527047, "learning_rate": 3.5715088823926244e-07, "logits/chosen": -4.048907279968262, "logits/rejected": -4.192914009094238, "logps/chosen": -71.19224548339844, "logps/rejected": -120.05412292480469, "loss": 116219.825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007053283508867025, "rewards/margins": 0.002138136886060238, "rewards/rejected": -0.00919142086058855, "step": 3530 }, { "epoch": 0.3581908327430942, "grad_norm": 346133.6895414932, "learning_rate": 3.565887114908927e-07, "logits/chosen": -4.092459678649902, "logits/rejected": -3.9697470664978027, "logps/chosen": -94.09215545654297, "logps/rejected": -105.96022033691406, "loss": 123681.4125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.009004145860671997, "rewards/margins": 0.0008392800809815526, "rewards/rejected": -0.009843425825238228, "step": 3540 }, { "epoch": 0.3592026712536679, "grad_norm": 236181.4714796197, "learning_rate": 3.56026534742523e-07, "logits/chosen": -4.027027130126953, "logits/rejected": -4.024503707885742, "logps/chosen": -79.65704345703125, "logps/rejected": -91.39644622802734, "loss": 113812.5375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009067866019904613, "rewards/margins": 0.0007871139096096158, "rewards/rejected": -0.009854980744421482, "step": 3550 }, { "epoch": 0.36021450976424163, "grad_norm": 347587.4781166357, "learning_rate": 3.5546435799415337e-07, "logits/chosen": -4.3567891120910645, "logits/rejected": -4.265317440032959, "logps/chosen": -111.814208984375, "logps/rejected": -109.38250732421875, "loss": 107323.6625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00646785506978631, "rewards/margins": 0.0001851636916399002, "rewards/rejected": -0.006653018295764923, "step": 3560 }, { "epoch": 0.3612263482748153, "grad_norm": 325513.13604450267, "learning_rate": 3.5490218124578366e-07, "logits/chosen": -4.424186706542969, "logits/rejected": -4.430316925048828, "logps/chosen": -74.7242202758789, "logps/rejected": -89.3385009765625, "loss": 117082.3125, "rewards/accuracies": 0.75, "rewards/chosen": -0.006734832189977169, "rewards/margins": 0.00423161406069994, "rewards/rejected": -0.010966447181999683, "step": 3570 }, { "epoch": 0.36223818678538905, "grad_norm": 276644.09112306894, "learning_rate": 3.5434000449741395e-07, "logits/chosen": -4.015086650848389, "logits/rejected": -4.067831993103027, "logps/chosen": -103.91715240478516, "logps/rejected": -126.12034606933594, "loss": 109372.375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0072726355865597725, "rewards/margins": 0.001924995332956314, "rewards/rejected": -0.009197630919516087, "step": 3580 }, { "epoch": 0.3632500252959628, "grad_norm": 198572.20332068202, "learning_rate": 3.537778277490443e-07, "logits/chosen": -4.049374580383301, "logits/rejected": -4.020202159881592, "logps/chosen": -79.1127700805664, "logps/rejected": -88.90418243408203, "loss": 99446.3438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006976007018238306, "rewards/margins": 0.001771527691744268, "rewards/rejected": -0.008747533895075321, "step": 3590 }, { "epoch": 0.36426186380653647, "grad_norm": 263785.5971725862, "learning_rate": 3.532156510006746e-07, "logits/chosen": -4.219168186187744, "logits/rejected": -4.208211898803711, "logps/chosen": -80.49102020263672, "logps/rejected": -65.38459777832031, "loss": 114054.2625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0072162821888923645, "rewards/margins": 0.0020961447153240442, "rewards/rejected": -0.009312426671385765, "step": 3600 }, { "epoch": 0.3652737023171102, "grad_norm": 139481.12667161704, "learning_rate": 3.526534742523049e-07, "logits/chosen": -4.294540882110596, "logits/rejected": -4.453879356384277, "logps/chosen": -56.43248748779297, "logps/rejected": -65.12635803222656, "loss": 97128.85, "rewards/accuracies": 0.5, "rewards/chosen": -0.008677991107106209, "rewards/margins": 0.0005773053271695971, "rewards/rejected": -0.009255295619368553, "step": 3610 }, { "epoch": 0.3662855408276839, "grad_norm": 296077.79718231893, "learning_rate": 3.5209129750393523e-07, "logits/chosen": -4.101724147796631, "logits/rejected": -4.084511756896973, "logps/chosen": -68.80757141113281, "logps/rejected": -83.64677429199219, "loss": 109713.825, "rewards/accuracies": 0.5, "rewards/chosen": -0.006503046955913305, "rewards/margins": 0.0016920322086662054, "rewards/rejected": -0.008195079863071442, "step": 3620 }, { "epoch": 0.3672973793382576, "grad_norm": 271790.6401686673, "learning_rate": 3.515291207555655e-07, "logits/chosen": -4.371684551239014, "logits/rejected": -4.3188934326171875, "logps/chosen": -82.68694305419922, "logps/rejected": -94.17552185058594, "loss": 103197.9, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.006706953048706055, "rewards/margins": 0.001330381608568132, "rewards/rejected": -0.008037334308028221, "step": 3630 }, { "epoch": 0.3683092178488313, "grad_norm": 244962.27063375124, "learning_rate": 3.5096694400719587e-07, "logits/chosen": -4.443483352661133, "logits/rejected": -4.461305141448975, "logps/chosen": -66.1627197265625, "logps/rejected": -84.84431457519531, "loss": 119480.65, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009512074291706085, "rewards/margins": 0.0014460369711741805, "rewards/rejected": -0.0109581109136343, "step": 3640 }, { "epoch": 0.36932105635940504, "grad_norm": 359925.0997852248, "learning_rate": 3.5040476725882616e-07, "logits/chosen": -3.9362499713897705, "logits/rejected": -3.963703155517578, "logps/chosen": -51.715415954589844, "logps/rejected": -124.69209289550781, "loss": 117214.775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.004780356306582689, "rewards/margins": 0.005330855026841164, "rewards/rejected": -0.010111210867762566, "step": 3650 }, { "epoch": 0.3703328948699787, "grad_norm": 233941.86265845396, "learning_rate": 3.4984259051045645e-07, "logits/chosen": -3.9321701526641846, "logits/rejected": -3.89582896232605, "logps/chosen": -102.57832336425781, "logps/rejected": -95.53697204589844, "loss": 115835.6, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.009326664730906487, "rewards/margins": -0.0007924813544377685, "rewards/rejected": -0.008534183725714684, "step": 3660 }, { "epoch": 0.37134473338055246, "grad_norm": 253423.3176346109, "learning_rate": 3.492804137620868e-07, "logits/chosen": -3.9908695220947266, "logits/rejected": -3.988466739654541, "logps/chosen": -101.31373596191406, "logps/rejected": -121.0211181640625, "loss": 111666.25, "rewards/accuracies": 0.5, "rewards/chosen": -0.006384999956935644, "rewards/margins": 0.0024485262110829353, "rewards/rejected": -0.008833525702357292, "step": 3670 }, { "epoch": 0.3723565718911262, "grad_norm": 225941.01481584163, "learning_rate": 3.487182370137171e-07, "logits/chosen": -3.8721649646759033, "logits/rejected": -4.060537338256836, "logps/chosen": -45.547019958496094, "logps/rejected": -64.37242126464844, "loss": 121179.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006277153734117746, "rewards/margins": 0.0014001852832734585, "rewards/rejected": -0.00767733808606863, "step": 3680 }, { "epoch": 0.3733684104016999, "grad_norm": 303.66258432110635, "learning_rate": 3.481560602653474e-07, "logits/chosen": -4.109983921051025, "logits/rejected": -4.092258453369141, "logps/chosen": -115.52205657958984, "logps/rejected": -117.75129699707031, "loss": 122766.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006829110439866781, "rewards/margins": 0.002538603264838457, "rewards/rejected": -0.009367713704705238, "step": 3690 }, { "epoch": 0.3743802489122736, "grad_norm": 305910.6472027137, "learning_rate": 3.475938835169777e-07, "logits/chosen": -3.6513819694519043, "logits/rejected": -3.8522109985351562, "logps/chosen": -240.2388153076172, "logps/rejected": -118.8506851196289, "loss": 112733.0375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011473400518298149, "rewards/margins": -0.0018145863432437181, "rewards/rejected": -0.00965881533920765, "step": 3700 }, { "epoch": 0.3753920874228473, "grad_norm": 302896.23404984415, "learning_rate": 3.4703170676860807e-07, "logits/chosen": -3.67265248298645, "logits/rejected": -3.6807847023010254, "logps/chosen": -74.54335021972656, "logps/rejected": -256.98980712890625, "loss": 125698.45, "rewards/accuracies": 0.75, "rewards/chosen": -0.005745295435190201, "rewards/margins": 0.0027555529959499836, "rewards/rejected": -0.008500848896801472, "step": 3710 }, { "epoch": 0.37640392593342104, "grad_norm": 323619.190322611, "learning_rate": 3.464695300202383e-07, "logits/chosen": -4.350294589996338, "logits/rejected": -4.298826694488525, "logps/chosen": -66.64483642578125, "logps/rejected": -71.70125579833984, "loss": 125613.6, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009619785472750664, "rewards/margins": 0.0023855220060795546, "rewards/rejected": -0.012005307711660862, "step": 3720 }, { "epoch": 0.3774157644439947, "grad_norm": 328198.91232710594, "learning_rate": 3.4590735327186866e-07, "logits/chosen": -3.9943950176239014, "logits/rejected": -3.821664333343506, "logps/chosen": -84.58578491210938, "logps/rejected": -128.99313354492188, "loss": 119290.65, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006625042762607336, "rewards/margins": 0.0022373846732079983, "rewards/rejected": -0.008862428367137909, "step": 3730 }, { "epoch": 0.37842760295456845, "grad_norm": 245162.40106188555, "learning_rate": 3.45345176523499e-07, "logits/chosen": -3.760101795196533, "logits/rejected": -3.809206485748291, "logps/chosen": -52.21990203857422, "logps/rejected": -67.65739440917969, "loss": 111867.1875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0056518553756177425, "rewards/margins": 0.002232467057183385, "rewards/rejected": -0.007884321734309196, "step": 3740 }, { "epoch": 0.3794394414651422, "grad_norm": 402052.62551233475, "learning_rate": 3.4478299977512924e-07, "logits/chosen": -4.089580059051514, "logits/rejected": -4.106094837188721, "logps/chosen": -119.04795837402344, "logps/rejected": -115.00699615478516, "loss": 132348.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007895209826529026, "rewards/margins": 0.0018969799857586622, "rewards/rejected": -0.009792190976440907, "step": 3750 }, { "epoch": 0.3804512799757159, "grad_norm": 287319.4389431849, "learning_rate": 3.442208230267596e-07, "logits/chosen": -4.458505153656006, "logits/rejected": -4.327106475830078, "logps/chosen": -88.35840606689453, "logps/rejected": -98.85968017578125, "loss": 122813.8375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0065836356952786446, "rewards/margins": 0.0020614597015082836, "rewards/rejected": -0.008645095862448215, "step": 3760 }, { "epoch": 0.3814631184862896, "grad_norm": 242117.28028808013, "learning_rate": 3.4365864627838993e-07, "logits/chosen": -4.3425798416137695, "logits/rejected": -4.34243106842041, "logps/chosen": -107.26338958740234, "logps/rejected": -114.0202865600586, "loss": 121121.0125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008304733783006668, "rewards/margins": 0.0005437986692413688, "rewards/rejected": -0.008848532103002071, "step": 3770 }, { "epoch": 0.3824749569968633, "grad_norm": 375345.26908256224, "learning_rate": 3.430964695300202e-07, "logits/chosen": -3.9720253944396973, "logits/rejected": -3.9819769859313965, "logps/chosen": -134.2942657470703, "logps/rejected": -108.4737319946289, "loss": 122679.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0065002115443348885, "rewards/margins": 0.0016912262653931975, "rewards/rejected": -0.008191436529159546, "step": 3780 }, { "epoch": 0.38348679550743703, "grad_norm": 360839.50288619334, "learning_rate": 3.425342927816505e-07, "logits/chosen": -3.602762222290039, "logits/rejected": -3.8214173316955566, "logps/chosen": -104.24473571777344, "logps/rejected": -116.8635025024414, "loss": 105132.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006343044340610504, "rewards/margins": 0.0016217948868870735, "rewards/rejected": -0.007964840158820152, "step": 3790 }, { "epoch": 0.3844986340180107, "grad_norm": 219161.24743685895, "learning_rate": 3.4197211603328086e-07, "logits/chosen": -2.791044235229492, "logits/rejected": -2.76558256149292, "logps/chosen": -57.56660079956055, "logps/rejected": -85.68640899658203, "loss": 110867.3375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00692066689953208, "rewards/margins": 0.0008096469682641327, "rewards/rejected": -0.007730314042419195, "step": 3800 }, { "epoch": 0.38551047252858445, "grad_norm": 281337.4330747228, "learning_rate": 3.414099392849112e-07, "logits/chosen": -3.8400180339813232, "logits/rejected": -3.768824338912964, "logps/chosen": -124.50813293457031, "logps/rejected": -139.11550903320312, "loss": 133911.65, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009716509841382504, "rewards/margins": 0.00011221617751289159, "rewards/rejected": -0.009828725829720497, "step": 3810 }, { "epoch": 0.38652231103915813, "grad_norm": 372084.17107379576, "learning_rate": 3.4084776253654145e-07, "logits/chosen": -3.9396042823791504, "logits/rejected": -4.0106658935546875, "logps/chosen": -87.35057830810547, "logps/rejected": -113.87452697753906, "loss": 111217.7125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0072819748893380165, "rewards/margins": 0.0013125272234901786, "rewards/rejected": -0.00859450176358223, "step": 3820 }, { "epoch": 0.38753414954973187, "grad_norm": 282154.1259994547, "learning_rate": 3.402855857881718e-07, "logits/chosen": -3.957207441329956, "logits/rejected": -4.1004958152771, "logps/chosen": -111.48481750488281, "logps/rejected": -88.44050598144531, "loss": 118895.6625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006890945136547089, "rewards/margins": 0.0016592738684266806, "rewards/rejected": -0.008550219237804413, "step": 3830 }, { "epoch": 0.3885459880603056, "grad_norm": 357803.8290171524, "learning_rate": 3.3972340903980214e-07, "logits/chosen": -4.024009704589844, "logits/rejected": -3.9018733501434326, "logps/chosen": -95.08039093017578, "logps/rejected": -136.569091796875, "loss": 122379.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007695109583437443, "rewards/margins": 0.002241352340206504, "rewards/rejected": -0.00993646215647459, "step": 3840 }, { "epoch": 0.3895578265708793, "grad_norm": 268761.74060778687, "learning_rate": 3.391612322914324e-07, "logits/chosen": -3.655306339263916, "logits/rejected": -3.703704833984375, "logps/chosen": -70.96498107910156, "logps/rejected": -107.5546875, "loss": 95304.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005131972022354603, "rewards/margins": 0.002526102354750037, "rewards/rejected": -0.007658074609935284, "step": 3850 }, { "epoch": 0.390569665081453, "grad_norm": 16749.16200614984, "learning_rate": 3.385990555430627e-07, "logits/chosen": -3.9457221031188965, "logits/rejected": -3.766051769256592, "logps/chosen": -78.43218994140625, "logps/rejected": -105.94374084472656, "loss": 103840.5125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00545541662722826, "rewards/margins": 0.0017821801593527198, "rewards/rejected": -0.00723759550601244, "step": 3860 }, { "epoch": 0.3915815035920267, "grad_norm": 230449.27363269005, "learning_rate": 3.3803687879469307e-07, "logits/chosen": -4.329379081726074, "logits/rejected": -4.269717693328857, "logps/chosen": -103.51222229003906, "logps/rejected": -127.59880065917969, "loss": 124840.6625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.006176885217428207, "rewards/margins": 0.0036526124458760023, "rewards/rejected": -0.009829497896134853, "step": 3870 }, { "epoch": 0.39259334210260044, "grad_norm": 246790.5775744862, "learning_rate": 3.3747470204632336e-07, "logits/chosen": -4.106161594390869, "logits/rejected": -3.9861130714416504, "logps/chosen": -70.66079711914062, "logps/rejected": -112.08197021484375, "loss": 100357.275, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006275405175983906, "rewards/margins": 0.002706703497096896, "rewards/rejected": -0.008982108905911446, "step": 3880 }, { "epoch": 0.3936051806131741, "grad_norm": 427158.04162346054, "learning_rate": 3.3691252529795365e-07, "logits/chosen": -4.2584757804870605, "logits/rejected": -4.32611608505249, "logps/chosen": -89.9172134399414, "logps/rejected": -98.41548919677734, "loss": 117604.8875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005249693989753723, "rewards/margins": 0.002562598790973425, "rewards/rejected": -0.007812292780727148, "step": 3890 }, { "epoch": 0.39461701912374786, "grad_norm": 246158.93797920394, "learning_rate": 3.36350348549584e-07, "logits/chosen": -3.9601082801818848, "logits/rejected": -3.5357584953308105, "logps/chosen": -85.01329803466797, "logps/rejected": -308.1094970703125, "loss": 113796.85, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007995465770363808, "rewards/margins": 0.003162189619615674, "rewards/rejected": -0.011157655157148838, "step": 3900 }, { "epoch": 0.39562885763432154, "grad_norm": 286361.1590338024, "learning_rate": 3.357881718012143e-07, "logits/chosen": -3.5759708881378174, "logits/rejected": -3.6346850395202637, "logps/chosen": -70.12195587158203, "logps/rejected": -112.18000793457031, "loss": 133455.7, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006524114403873682, "rewards/margins": 0.0030264512170106173, "rewards/rejected": -0.009550565853714943, "step": 3910 }, { "epoch": 0.3966406961448953, "grad_norm": 197406.88140675885, "learning_rate": 3.352259950528446e-07, "logits/chosen": -4.215527534484863, "logits/rejected": -4.201728343963623, "logps/chosen": -77.72110748291016, "logps/rejected": -86.99763488769531, "loss": 117081.8, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006234960630536079, "rewards/margins": 0.0010704625165089965, "rewards/rejected": -0.007305422332137823, "step": 3920 }, { "epoch": 0.397652534655469, "grad_norm": 285780.03249671176, "learning_rate": 3.346638183044749e-07, "logits/chosen": -4.287126064300537, "logits/rejected": -4.326098442077637, "logps/chosen": -86.53886413574219, "logps/rejected": -108.3321533203125, "loss": 125836.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007802044041454792, "rewards/margins": 0.0020702339243143797, "rewards/rejected": -0.00987227726727724, "step": 3930 }, { "epoch": 0.3986643731660427, "grad_norm": 292245.66570645204, "learning_rate": 3.341016415561052e-07, "logits/chosen": -4.184764862060547, "logits/rejected": -4.195816993713379, "logps/chosen": -69.78826904296875, "logps/rejected": -92.53251647949219, "loss": 125614.5375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006090592592954636, "rewards/margins": 0.002425045007839799, "rewards/rejected": -0.008515638299286366, "step": 3940 }, { "epoch": 0.39967621167661643, "grad_norm": 447276.17450950784, "learning_rate": 3.3353946480773556e-07, "logits/chosen": -4.268529415130615, "logits/rejected": -4.078540802001953, "logps/chosen": -101.80354309082031, "logps/rejected": -153.2575225830078, "loss": 124448.7375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005604474805295467, "rewards/margins": 0.0038016284815967083, "rewards/rejected": -0.009406103752553463, "step": 3950 }, { "epoch": 0.4006880501871901, "grad_norm": 542297.3167081309, "learning_rate": 3.3297728805936586e-07, "logits/chosen": -4.464956283569336, "logits/rejected": -4.457804203033447, "logps/chosen": -71.83341217041016, "logps/rejected": -83.27533721923828, "loss": 116148.475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009067311882972717, "rewards/margins": -0.00017417427443433553, "rewards/rejected": -0.00889313779771328, "step": 3960 }, { "epoch": 0.40169988869776385, "grad_norm": 344581.82091651094, "learning_rate": 3.3241511131099615e-07, "logits/chosen": -4.141509056091309, "logits/rejected": -4.341341018676758, "logps/chosen": -143.5304718017578, "logps/rejected": -121.1530990600586, "loss": 134597.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007399562746286392, "rewards/margins": 0.002408779924735427, "rewards/rejected": -0.009808341972529888, "step": 3970 }, { "epoch": 0.40271172720833753, "grad_norm": 336356.34677472245, "learning_rate": 3.318529345626265e-07, "logits/chosen": -3.9947104454040527, "logits/rejected": -3.9922919273376465, "logps/chosen": -289.41668701171875, "logps/rejected": -288.45538330078125, "loss": 94281.6375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.009480509907007217, "rewards/margins": 0.0006876828847452998, "rewards/rejected": -0.010168193839490414, "step": 3980 }, { "epoch": 0.40372356571891127, "grad_norm": 194790.362642898, "learning_rate": 3.312907578142568e-07, "logits/chosen": -4.116818428039551, "logits/rejected": -4.153632640838623, "logps/chosen": -77.83417510986328, "logps/rejected": -111.94779205322266, "loss": 117729.0125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006771234329789877, "rewards/margins": 0.002338208258152008, "rewards/rejected": -0.009109443984925747, "step": 3990 }, { "epoch": 0.40473540422948495, "grad_norm": 312847.30004988506, "learning_rate": 3.307285810658871e-07, "logits/chosen": -4.276520729064941, "logits/rejected": -4.371793746948242, "logps/chosen": -74.97560119628906, "logps/rejected": -82.43907165527344, "loss": 107581.1125, "rewards/accuracies": 0.75, "rewards/chosen": -0.006959783844649792, "rewards/margins": 0.0018008567858487368, "rewards/rejected": -0.008760642260313034, "step": 4000 }, { "epoch": 0.4057472427400587, "grad_norm": 416271.4962025714, "learning_rate": 3.301664043175174e-07, "logits/chosen": -4.103898048400879, "logits/rejected": -4.250204563140869, "logps/chosen": -93.13345336914062, "logps/rejected": -137.8826904296875, "loss": 109069.875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006646337453275919, "rewards/margins": 0.0038683940656483173, "rewards/rejected": -0.010514730587601662, "step": 4010 }, { "epoch": 0.4067590812506324, "grad_norm": 64651.32879255481, "learning_rate": 3.296042275691477e-07, "logits/chosen": -4.274317264556885, "logits/rejected": -4.347115516662598, "logps/chosen": -87.3676986694336, "logps/rejected": -83.87211608886719, "loss": 108619.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006058321800082922, "rewards/margins": 0.0012026398908346891, "rewards/rejected": -0.007260961923748255, "step": 4020 }, { "epoch": 0.4077709197612061, "grad_norm": 299593.82725634926, "learning_rate": 3.29042050820778e-07, "logits/chosen": -4.1234588623046875, "logits/rejected": -4.270385265350342, "logps/chosen": -101.04532623291016, "logps/rejected": -135.51364135742188, "loss": 112814.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006281470414251089, "rewards/margins": 0.003656588029116392, "rewards/rejected": -0.009938059374690056, "step": 4030 }, { "epoch": 0.40878275827177984, "grad_norm": 212018.01064026603, "learning_rate": 3.2847987407240835e-07, "logits/chosen": -4.048471450805664, "logits/rejected": -4.1169610023498535, "logps/chosen": -61.3314094543457, "logps/rejected": -65.13157653808594, "loss": 109114.1625, "rewards/accuracies": 0.5, "rewards/chosen": -0.004699826240539551, "rewards/margins": 0.0009062163298949599, "rewards/rejected": -0.005606042221188545, "step": 4040 }, { "epoch": 0.4097945967823535, "grad_norm": 316606.23381503124, "learning_rate": 3.279176973240387e-07, "logits/chosen": -3.8856360912323, "logits/rejected": -3.625424861907959, "logps/chosen": -74.24372100830078, "logps/rejected": -104.74066162109375, "loss": 121124.175, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007571632508188486, "rewards/margins": 0.0035490295849740505, "rewards/rejected": -0.011120663024485111, "step": 4050 }, { "epoch": 0.41080643529292726, "grad_norm": 133461.89838020987, "learning_rate": 3.2735552057566894e-07, "logits/chosen": -4.427303791046143, "logits/rejected": -4.418052673339844, "logps/chosen": -71.75377655029297, "logps/rejected": -76.66871643066406, "loss": 96589.975, "rewards/accuracies": 0.5, "rewards/chosen": -0.006395731121301651, "rewards/margins": 0.0009717298671603203, "rewards/rejected": -0.007367461919784546, "step": 4060 }, { "epoch": 0.41181827380350094, "grad_norm": 319255.4138049929, "learning_rate": 3.267933438272993e-07, "logits/chosen": -3.342381715774536, "logits/rejected": -3.3021864891052246, "logps/chosen": -276.35992431640625, "logps/rejected": -302.95147705078125, "loss": 111918.7, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009543064050376415, "rewards/margins": 0.002649485133588314, "rewards/rejected": -0.012192550115287304, "step": 4070 }, { "epoch": 0.4128301123140747, "grad_norm": 200675.29366876892, "learning_rate": 3.2623116707892963e-07, "logits/chosen": -4.045485496520996, "logits/rejected": -3.904252529144287, "logps/chosen": -82.3955307006836, "logps/rejected": -99.18270874023438, "loss": 128301.5, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008090635761618614, "rewards/margins": 0.0021223139483481646, "rewards/rejected": -0.010212949477136135, "step": 4080 }, { "epoch": 0.41384195082464836, "grad_norm": 180368.30173516247, "learning_rate": 3.2566899033055987e-07, "logits/chosen": -4.016995429992676, "logits/rejected": -4.072287082672119, "logps/chosen": -81.3165054321289, "logps/rejected": -95.12773895263672, "loss": 123397.475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0072554186917841434, "rewards/margins": 0.0011929916217923164, "rewards/rejected": -0.008448409847915173, "step": 4090 }, { "epoch": 0.4148537893352221, "grad_norm": 348506.55240413494, "learning_rate": 3.251068135821902e-07, "logits/chosen": -4.328078269958496, "logits/rejected": -4.303115367889404, "logps/chosen": -99.4124755859375, "logps/rejected": -122.7930908203125, "loss": 112414.1375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007619939744472504, "rewards/margins": 0.0013993491884320974, "rewards/rejected": -0.00901928823441267, "step": 4100 }, { "epoch": 0.41586562784579584, "grad_norm": 197933.76788446642, "learning_rate": 3.2454463683382056e-07, "logits/chosen": -3.6213314533233643, "logits/rejected": -3.599217176437378, "logps/chosen": -59.08942413330078, "logps/rejected": -70.08352661132812, "loss": 121438.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.008881531655788422, "rewards/margins": 0.0006679976941086352, "rewards/rejected": -0.00954953022301197, "step": 4110 }, { "epoch": 0.4168774663563695, "grad_norm": 374219.43100493145, "learning_rate": 3.2398246008545085e-07, "logits/chosen": -4.026267051696777, "logits/rejected": -4.110379695892334, "logps/chosen": -64.68559265136719, "logps/rejected": -83.48173522949219, "loss": 112497.425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007959112524986267, "rewards/margins": 0.0020034373737871647, "rewards/rejected": -0.009962549433112144, "step": 4120 }, { "epoch": 0.41788930486694326, "grad_norm": 316091.2798285718, "learning_rate": 3.2342028333708114e-07, "logits/chosen": -4.246005058288574, "logits/rejected": -4.4103546142578125, "logps/chosen": -71.29180908203125, "logps/rejected": -98.5321273803711, "loss": 119687.525, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0073347813449800014, "rewards/margins": 0.001977310748770833, "rewards/rejected": -0.009312091395258904, "step": 4130 }, { "epoch": 0.41890114337751694, "grad_norm": 284898.19404486986, "learning_rate": 3.228581065887115e-07, "logits/chosen": -3.978484630584717, "logits/rejected": -4.007920742034912, "logps/chosen": -82.6202621459961, "logps/rejected": -98.33290100097656, "loss": 112432.2625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007266145199537277, "rewards/margins": 0.0013281163992360234, "rewards/rejected": -0.008594261482357979, "step": 4140 }, { "epoch": 0.4199129818880907, "grad_norm": 306467.0706053048, "learning_rate": 3.222959298403418e-07, "logits/chosen": -4.167191028594971, "logits/rejected": -4.222339630126953, "logps/chosen": -70.61737060546875, "logps/rejected": -82.63954162597656, "loss": 132223.9125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005718776490539312, "rewards/margins": 0.0014451174065470695, "rewards/rejected": -0.007163894362747669, "step": 4150 }, { "epoch": 0.42092482039866436, "grad_norm": 342080.0902281161, "learning_rate": 3.2173375309197207e-07, "logits/chosen": -4.183539867401123, "logits/rejected": -4.117712497711182, "logps/chosen": -96.29336547851562, "logps/rejected": -131.7071533203125, "loss": 123163.55, "rewards/accuracies": 0.75, "rewards/chosen": -0.006184933241456747, "rewards/margins": 0.004345088731497526, "rewards/rejected": -0.010530022904276848, "step": 4160 }, { "epoch": 0.4219366589092381, "grad_norm": 246903.784805285, "learning_rate": 3.211715763436024e-07, "logits/chosen": -3.75938081741333, "logits/rejected": -3.8537838459014893, "logps/chosen": -121.8742446899414, "logps/rejected": -122.38858795166016, "loss": 106908.65, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0050249844789505005, "rewards/margins": 0.0019521566573530436, "rewards/rejected": -0.006977141834795475, "step": 4170 }, { "epoch": 0.4229484974198118, "grad_norm": 371136.2778564554, "learning_rate": 3.206093995952327e-07, "logits/chosen": -3.9029037952423096, "logits/rejected": -3.7760682106018066, "logps/chosen": -130.58193969726562, "logps/rejected": -123.72541809082031, "loss": 132493.9, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007998726330697536, "rewards/margins": 0.003663391573354602, "rewards/rejected": -0.011662118136882782, "step": 4180 }, { "epoch": 0.4239603359303855, "grad_norm": 385179.88507782185, "learning_rate": 3.2004722284686306e-07, "logits/chosen": -4.409487247467041, "logits/rejected": -4.397967338562012, "logps/chosen": -105.91357421875, "logps/rejected": -110.3058853149414, "loss": 121581.175, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011815974488854408, "rewards/margins": 0.0018129704985767603, "rewards/rejected": -0.013628944754600525, "step": 4190 }, { "epoch": 0.42497217444095925, "grad_norm": 371937.78320535435, "learning_rate": 3.1948504609849335e-07, "logits/chosen": -3.909374952316284, "logits/rejected": -3.8799571990966797, "logps/chosen": -112.0710220336914, "logps/rejected": -136.38726806640625, "loss": 119981.9625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006718868855386972, "rewards/margins": 0.0039389971643686295, "rewards/rejected": -0.010657867416739464, "step": 4200 }, { "epoch": 0.42598401295153293, "grad_norm": 270441.7968334441, "learning_rate": 3.189228693501237e-07, "logits/chosen": -3.8411478996276855, "logits/rejected": -3.8075459003448486, "logps/chosen": -94.33307647705078, "logps/rejected": -99.87940979003906, "loss": 112279.325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01006108708679676, "rewards/margins": 0.0012670194264501333, "rewards/rejected": -0.011328105814754963, "step": 4210 }, { "epoch": 0.42699585146210667, "grad_norm": 219745.4286472752, "learning_rate": 3.18360692601754e-07, "logits/chosen": -3.6549999713897705, "logits/rejected": -3.7542216777801514, "logps/chosen": -61.48891067504883, "logps/rejected": -90.12362670898438, "loss": 103767.5938, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006651648785918951, "rewards/margins": 0.0018033437663689256, "rewards/rejected": -0.00845499150454998, "step": 4220 }, { "epoch": 0.42800768997268035, "grad_norm": 214010.4671547592, "learning_rate": 3.177985158533843e-07, "logits/chosen": -4.070377349853516, "logits/rejected": -3.933922290802002, "logps/chosen": -75.3974380493164, "logps/rejected": -126.4360122680664, "loss": 117280.975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006471523083746433, "rewards/margins": 0.0036176200956106186, "rewards/rejected": -0.010089143179357052, "step": 4230 }, { "epoch": 0.4290195284832541, "grad_norm": 277518.66967263824, "learning_rate": 3.172363391050146e-07, "logits/chosen": -4.164217948913574, "logits/rejected": -4.252887725830078, "logps/chosen": -78.19668579101562, "logps/rejected": -90.85090637207031, "loss": 114985.85, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008304467424750328, "rewards/margins": 0.002393819624558091, "rewards/rejected": -0.010698286816477776, "step": 4240 }, { "epoch": 0.43003136699382777, "grad_norm": 218468.89089715303, "learning_rate": 3.166741623566449e-07, "logits/chosen": -3.8396449089050293, "logits/rejected": -3.908435821533203, "logps/chosen": -100.24004364013672, "logps/rejected": -104.21611022949219, "loss": 131010.7375, "rewards/accuracies": 0.5, "rewards/chosen": -0.009684218093752861, "rewards/margins": -0.0005088447360321879, "rewards/rejected": -0.009175373241305351, "step": 4250 }, { "epoch": 0.4310432055044015, "grad_norm": 291858.0843030655, "learning_rate": 3.161119856082752e-07, "logits/chosen": -4.06841516494751, "logits/rejected": -4.122876167297363, "logps/chosen": -83.24134826660156, "logps/rejected": -95.44794464111328, "loss": 115057.6875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0046736388467252254, "rewards/margins": 0.0012554296990856528, "rewards/rejected": -0.0059290686622262, "step": 4260 }, { "epoch": 0.4320550440149752, "grad_norm": 451640.1121655247, "learning_rate": 3.1554980885990555e-07, "logits/chosen": -3.886817216873169, "logits/rejected": -3.891205310821533, "logps/chosen": -258.2669677734375, "logps/rejected": -279.7670593261719, "loss": 117886.9, "rewards/accuracies": 0.5, "rewards/chosen": -0.0070439898408949375, "rewards/margins": 0.0007526368135586381, "rewards/rejected": -0.007796626538038254, "step": 4270 }, { "epoch": 0.4330668825255489, "grad_norm": 291904.4119922955, "learning_rate": 3.1498763211153585e-07, "logits/chosen": -3.9817862510681152, "logits/rejected": -3.8576908111572266, "logps/chosen": -74.59080505371094, "logps/rejected": -104.5184555053711, "loss": 122055.925, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007154981605708599, "rewards/margins": 0.0028802803717553616, "rewards/rejected": -0.010035260580480099, "step": 4280 }, { "epoch": 0.43407872103612266, "grad_norm": 348900.23139931855, "learning_rate": 3.144254553631662e-07, "logits/chosen": -3.83543062210083, "logits/rejected": -3.8008155822753906, "logps/chosen": -118.2776107788086, "logps/rejected": -124.30860900878906, "loss": 130717.85, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007073329295963049, "rewards/margins": 0.002695066388696432, "rewards/rejected": -0.009768394753336906, "step": 4290 }, { "epoch": 0.43509055954669634, "grad_norm": 183316.1268946346, "learning_rate": 3.138632786147965e-07, "logits/chosen": -4.3133134841918945, "logits/rejected": -4.122079849243164, "logps/chosen": -54.59674072265625, "logps/rejected": -85.32565307617188, "loss": 114849.4875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005093783605843782, "rewards/margins": 0.005042814649641514, "rewards/rejected": -0.010136597789824009, "step": 4300 }, { "epoch": 0.4361023980572701, "grad_norm": 223709.13741718905, "learning_rate": 3.133011018664268e-07, "logits/chosen": -4.0062994956970215, "logits/rejected": -4.0388288497924805, "logps/chosen": -109.17181396484375, "logps/rejected": -140.04702758789062, "loss": 100697.0063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009313276037573814, "rewards/margins": 0.002470775041729212, "rewards/rejected": -0.011784051544964314, "step": 4310 }, { "epoch": 0.43711423656784376, "grad_norm": 225977.22227888822, "learning_rate": 3.127389251180571e-07, "logits/chosen": -4.333625316619873, "logits/rejected": -4.203886985778809, "logps/chosen": -68.83822631835938, "logps/rejected": -94.05038452148438, "loss": 112687.4875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00657357182353735, "rewards/margins": 0.0022649129386991262, "rewards/rejected": -0.00883848499506712, "step": 4320 }, { "epoch": 0.4381260750784175, "grad_norm": 175936.14879221882, "learning_rate": 3.121767483696874e-07, "logits/chosen": -3.6195015907287598, "logits/rejected": -3.845827579498291, "logps/chosen": -113.09635925292969, "logps/rejected": -75.36845397949219, "loss": 98701.65, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005572709254920483, "rewards/margins": 0.003241226775571704, "rewards/rejected": -0.00881393626332283, "step": 4330 }, { "epoch": 0.4391379135889912, "grad_norm": 314000.8193571469, "learning_rate": 3.116145716213177e-07, "logits/chosen": -3.9857451915740967, "logits/rejected": -3.972649335861206, "logps/chosen": -65.1346435546875, "logps/rejected": -82.74072265625, "loss": 112270.8875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006559199187904596, "rewards/margins": 0.001760269864462316, "rewards/rejected": -0.008319469168782234, "step": 4340 }, { "epoch": 0.4401497520995649, "grad_norm": 168253.52665423093, "learning_rate": 3.1105239487294805e-07, "logits/chosen": -4.1807098388671875, "logits/rejected": -4.203177452087402, "logps/chosen": -87.67253112792969, "logps/rejected": -155.42950439453125, "loss": 106213.7, "rewards/accuracies": 0.75, "rewards/chosen": -0.008203146047890186, "rewards/margins": 0.00572409899905324, "rewards/rejected": -0.013927245512604713, "step": 4350 }, { "epoch": 0.4411615906101386, "grad_norm": 160439.4695874451, "learning_rate": 3.104902181245784e-07, "logits/chosen": -4.228505611419678, "logits/rejected": -4.162585735321045, "logps/chosen": -41.221031188964844, "logps/rejected": -79.22096252441406, "loss": 98205.3125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006562314927577972, "rewards/margins": 0.0031650899909436703, "rewards/rejected": -0.009727404452860355, "step": 4360 }, { "epoch": 0.44217342912071234, "grad_norm": 342141.267277197, "learning_rate": 3.0992804137620864e-07, "logits/chosen": -4.523064613342285, "logits/rejected": -4.439040660858154, "logps/chosen": -55.38287353515625, "logps/rejected": -70.2179946899414, "loss": 114976.6375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005795771721750498, "rewards/margins": 0.002652816940099001, "rewards/rejected": -0.008448588661849499, "step": 4370 }, { "epoch": 0.4431852676312861, "grad_norm": 409760.7965022041, "learning_rate": 3.09365864627839e-07, "logits/chosen": -3.9832255840301514, "logits/rejected": -4.054749488830566, "logps/chosen": -79.79869842529297, "logps/rejected": -101.57779693603516, "loss": 131138.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008556637912988663, "rewards/margins": 0.0034522837959229946, "rewards/rejected": -0.01200892310589552, "step": 4380 }, { "epoch": 0.44419710614185975, "grad_norm": 355850.404221734, "learning_rate": 3.088036878794693e-07, "logits/chosen": -4.517472267150879, "logits/rejected": -4.5789875984191895, "logps/chosen": -54.16780471801758, "logps/rejected": -87.91300201416016, "loss": 121212.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008476457558572292, "rewards/margins": 0.002467718906700611, "rewards/rejected": -0.010944177396595478, "step": 4390 }, { "epoch": 0.4452089446524335, "grad_norm": 568606.4178798993, "learning_rate": 3.0824151113109957e-07, "logits/chosen": -4.436930179595947, "logits/rejected": -4.354730129241943, "logps/chosen": -110.36478424072266, "logps/rejected": -106.44166564941406, "loss": 104583.8562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008918320760130882, "rewards/margins": 0.001341326511465013, "rewards/rejected": -0.010259647853672504, "step": 4400 }, { "epoch": 0.4462207831630072, "grad_norm": 334567.9062309241, "learning_rate": 3.076793343827299e-07, "logits/chosen": -4.04805850982666, "logits/rejected": -4.0539631843566895, "logps/chosen": -102.19934844970703, "logps/rejected": -111.38216400146484, "loss": 118719.75, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006125022657215595, "rewards/margins": 0.0023334717843681574, "rewards/rejected": -0.008458495140075684, "step": 4410 }, { "epoch": 0.4472326216735809, "grad_norm": 165137.4854729715, "learning_rate": 3.0711715763436026e-07, "logits/chosen": -3.889127254486084, "logits/rejected": -3.8969759941101074, "logps/chosen": -107.5342025756836, "logps/rejected": -123.19576263427734, "loss": 127185.0625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00634004408493638, "rewards/margins": 0.002582069719210267, "rewards/rejected": -0.008922114036977291, "step": 4420 }, { "epoch": 0.4482444601841546, "grad_norm": 229992.03939261948, "learning_rate": 3.0655498088599055e-07, "logits/chosen": -4.376828193664551, "logits/rejected": -4.292999744415283, "logps/chosen": -74.5629653930664, "logps/rejected": -74.73521423339844, "loss": 128896.4, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007810831069946289, "rewards/margins": 0.001008972991257906, "rewards/rejected": -0.008819804526865482, "step": 4430 }, { "epoch": 0.44925629869472833, "grad_norm": 202700.57690185215, "learning_rate": 3.0599280413762084e-07, "logits/chosen": -3.8440353870391846, "logits/rejected": -3.812506914138794, "logps/chosen": -72.23409271240234, "logps/rejected": -93.93791961669922, "loss": 111112.8, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006619976367801428, "rewards/margins": 0.0025672712363302708, "rewards/rejected": -0.009187247604131699, "step": 4440 }, { "epoch": 0.450268137205302, "grad_norm": 660676.9481616413, "learning_rate": 3.054306273892512e-07, "logits/chosen": -4.229220390319824, "logits/rejected": -4.234338283538818, "logps/chosen": -147.49664306640625, "logps/rejected": -128.4888458251953, "loss": 111938.5875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006852059159427881, "rewards/margins": 0.0012742809485644102, "rewards/rejected": -0.008126339875161648, "step": 4450 }, { "epoch": 0.45127997571587575, "grad_norm": 249073.646649532, "learning_rate": 3.048684506408815e-07, "logits/chosen": -4.188080787658691, "logits/rejected": -4.09865140914917, "logps/chosen": -87.43470001220703, "logps/rejected": -106.41251373291016, "loss": 100968.5437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0065536187030375, "rewards/margins": 0.002834225306287408, "rewards/rejected": -0.009387844242155552, "step": 4460 }, { "epoch": 0.4522918142264495, "grad_norm": 281647.64341178705, "learning_rate": 3.0430627389251177e-07, "logits/chosen": -3.8466808795928955, "logits/rejected": -3.609438419342041, "logps/chosen": -158.09930419921875, "logps/rejected": -143.3411102294922, "loss": 124905.075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012597938068211079, "rewards/margins": 0.002083001658320427, "rewards/rejected": -0.014680939726531506, "step": 4470 }, { "epoch": 0.45330365273702317, "grad_norm": 226407.02034918676, "learning_rate": 3.037440971441421e-07, "logits/chosen": -4.4177727699279785, "logits/rejected": -4.40433931350708, "logps/chosen": -107.67476654052734, "logps/rejected": -84.63127136230469, "loss": 124120.225, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009396782144904137, "rewards/margins": -0.0007563012768514454, "rewards/rejected": -0.008640481159090996, "step": 4480 }, { "epoch": 0.4543154912475969, "grad_norm": 259391.19880992977, "learning_rate": 3.031819203957724e-07, "logits/chosen": -4.088736057281494, "logits/rejected": -4.023133277893066, "logps/chosen": -58.24127960205078, "logps/rejected": -89.87220001220703, "loss": 122911.4875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00711425393819809, "rewards/margins": 0.004382799379527569, "rewards/rejected": -0.011497052386403084, "step": 4490 }, { "epoch": 0.4553273297581706, "grad_norm": 189832.836979283, "learning_rate": 3.026197436474027e-07, "logits/chosen": -4.371730804443359, "logits/rejected": -4.429788112640381, "logps/chosen": -112.57914733886719, "logps/rejected": -122.52091217041016, "loss": 123943.875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0070646838285028934, "rewards/margins": 0.001285230740904808, "rewards/rejected": -0.008349914103746414, "step": 4500 }, { "epoch": 0.4563391682687443, "grad_norm": 323061.6328366884, "learning_rate": 3.0205756689903305e-07, "logits/chosen": -4.0376482009887695, "logits/rejected": -4.130014896392822, "logps/chosen": -80.23419952392578, "logps/rejected": -129.1287078857422, "loss": 127664.3, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.006077860947698355, "rewards/margins": 0.00415876554325223, "rewards/rejected": -0.010236626490950584, "step": 4510 }, { "epoch": 0.457351006779318, "grad_norm": 10.356131777536472, "learning_rate": 3.0149539015066334e-07, "logits/chosen": -4.026772499084473, "logits/rejected": -4.074612140655518, "logps/chosen": -123.14564514160156, "logps/rejected": -126.39421081542969, "loss": 115909.5125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007377465255558491, "rewards/margins": 0.0019614757038652897, "rewards/rejected": -0.009338939562439919, "step": 4520 }, { "epoch": 0.45836284528989174, "grad_norm": 289518.2061888617, "learning_rate": 3.009332134022937e-07, "logits/chosen": -4.28493595123291, "logits/rejected": -4.203320026397705, "logps/chosen": -108.36088562011719, "logps/rejected": -109.01567077636719, "loss": 127012.8375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007023698650300503, "rewards/margins": -0.00046151457354426384, "rewards/rejected": -0.006562183611094952, "step": 4530 }, { "epoch": 0.4593746838004654, "grad_norm": 208017.94161010446, "learning_rate": 3.00371036653924e-07, "logits/chosen": -4.246933937072754, "logits/rejected": -4.293414115905762, "logps/chosen": -56.99611282348633, "logps/rejected": -64.67790222167969, "loss": 105477.325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006522466894239187, "rewards/margins": 0.0008836188353598118, "rewards/rejected": -0.007406085729598999, "step": 4540 }, { "epoch": 0.46038652231103916, "grad_norm": 398162.37359944434, "learning_rate": 2.9980885990555427e-07, "logits/chosen": -4.440638065338135, "logits/rejected": -4.320939064025879, "logps/chosen": -63.739593505859375, "logps/rejected": -55.26765823364258, "loss": 113056.6, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008947632275521755, "rewards/margins": -0.0008104606531560421, "rewards/rejected": -0.008137170225381851, "step": 4550 }, { "epoch": 0.4613983608216129, "grad_norm": 395760.5107935311, "learning_rate": 2.992466831571846e-07, "logits/chosen": -4.092816352844238, "logits/rejected": -4.019253253936768, "logps/chosen": -63.1044807434082, "logps/rejected": -92.35182189941406, "loss": 111347.525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00477320933714509, "rewards/margins": 0.004848351236432791, "rewards/rejected": -0.009621561504900455, "step": 4560 }, { "epoch": 0.4624101993321866, "grad_norm": 538452.423510882, "learning_rate": 2.986845064088149e-07, "logits/chosen": -3.963716983795166, "logits/rejected": -3.819676160812378, "logps/chosen": -122.91984558105469, "logps/rejected": -137.16146850585938, "loss": 119115.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008103164844214916, "rewards/margins": 0.0021419054828584194, "rewards/rejected": -0.010245069861412048, "step": 4570 }, { "epoch": 0.4634220378427603, "grad_norm": 272320.61082287435, "learning_rate": 2.981223296604452e-07, "logits/chosen": -3.9476592540740967, "logits/rejected": -4.022828578948975, "logps/chosen": -90.43220520019531, "logps/rejected": -87.90393829345703, "loss": 118898.8, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005762272048741579, "rewards/margins": 0.0027490495704114437, "rewards/rejected": -0.008511321619153023, "step": 4580 }, { "epoch": 0.464433876353334, "grad_norm": 462256.9838289818, "learning_rate": 2.9756015291207554e-07, "logits/chosen": -4.375389099121094, "logits/rejected": -4.517555236816406, "logps/chosen": -117.5323257446289, "logps/rejected": -99.20133972167969, "loss": 101635.0125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0067653050646185875, "rewards/margins": 0.0015974983107298613, "rewards/rejected": -0.008362804539501667, "step": 4590 }, { "epoch": 0.46544571486390773, "grad_norm": 365097.99077841005, "learning_rate": 2.969979761637059e-07, "logits/chosen": -4.094086647033691, "logits/rejected": -4.1859588623046875, "logps/chosen": -90.3816909790039, "logps/rejected": -85.53392028808594, "loss": 100304.2125, "rewards/accuracies": 0.5, "rewards/chosen": -0.006791324820369482, "rewards/margins": 0.0014293140266090631, "rewards/rejected": -0.008220640011131763, "step": 4600 }, { "epoch": 0.4664575533744814, "grad_norm": 322851.9884268186, "learning_rate": 2.9643579941533613e-07, "logits/chosen": -4.373453617095947, "logits/rejected": -4.361850261688232, "logps/chosen": -95.89521789550781, "logps/rejected": -101.7354965209961, "loss": 121689.3625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007994468323886395, "rewards/margins": 0.002557187806814909, "rewards/rejected": -0.010551655665040016, "step": 4610 }, { "epoch": 0.46746939188505515, "grad_norm": 476775.3218709932, "learning_rate": 2.9587362266696647e-07, "logits/chosen": -3.1435317993164062, "logits/rejected": -2.9575304985046387, "logps/chosen": -95.82823181152344, "logps/rejected": -137.47731018066406, "loss": 126762.25, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007575193885713816, "rewards/margins": 0.0037387434858828783, "rewards/rejected": -0.011313937604427338, "step": 4620 }, { "epoch": 0.46848123039562883, "grad_norm": 262492.4245235237, "learning_rate": 2.953114459185968e-07, "logits/chosen": -4.059875965118408, "logits/rejected": -4.031538486480713, "logps/chosen": -99.83750915527344, "logps/rejected": -143.05789184570312, "loss": 108332.475, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0062044099904596806, "rewards/margins": 0.003249177010729909, "rewards/rejected": -0.009453586302697659, "step": 4630 }, { "epoch": 0.46949306890620257, "grad_norm": 299737.5068853539, "learning_rate": 2.947492691702271e-07, "logits/chosen": -4.08688497543335, "logits/rejected": -4.06353759765625, "logps/chosen": -81.92110443115234, "logps/rejected": -82.93964385986328, "loss": 112279.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.005865360610187054, "rewards/margins": 0.003154914826154709, "rewards/rejected": -0.009020274505019188, "step": 4640 }, { "epoch": 0.4705049074167763, "grad_norm": 349901.1468609192, "learning_rate": 2.941870924218574e-07, "logits/chosen": -4.168639183044434, "logits/rejected": -4.1908793449401855, "logps/chosen": -70.93537902832031, "logps/rejected": -75.72044372558594, "loss": 126420.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00608386006206274, "rewards/margins": 0.00207681767642498, "rewards/rejected": -0.00816067773848772, "step": 4650 }, { "epoch": 0.47151674592735, "grad_norm": 252889.2251477937, "learning_rate": 2.9362491567348775e-07, "logits/chosen": -4.1213836669921875, "logits/rejected": -4.056728363037109, "logps/chosen": -90.46831512451172, "logps/rejected": -138.2503662109375, "loss": 112844.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.007920948788523674, "rewards/margins": 0.0033383690752089024, "rewards/rejected": -0.011259317398071289, "step": 4660 }, { "epoch": 0.4725285844379237, "grad_norm": 341731.18120396725, "learning_rate": 2.930627389251181e-07, "logits/chosen": -3.5948119163513184, "logits/rejected": -3.746203660964966, "logps/chosen": -151.99037170410156, "logps/rejected": -146.81170654296875, "loss": 124996.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0072942860424518585, "rewards/margins": 0.00288116792216897, "rewards/rejected": -0.010175453498959541, "step": 4670 }, { "epoch": 0.4735404229484974, "grad_norm": 168561.27101880396, "learning_rate": 2.9250056217674833e-07, "logits/chosen": -3.6789939403533936, "logits/rejected": -3.6909356117248535, "logps/chosen": -115.28389739990234, "logps/rejected": -125.97364807128906, "loss": 113132.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005496277008205652, "rewards/margins": 0.0016345412004739046, "rewards/rejected": -0.007130817975848913, "step": 4680 }, { "epoch": 0.47455226145907115, "grad_norm": 179274.49810079279, "learning_rate": 2.919383854283787e-07, "logits/chosen": -3.649329423904419, "logits/rejected": -3.6006851196289062, "logps/chosen": -110.97066497802734, "logps/rejected": -87.88517761230469, "loss": 127380.4, "rewards/accuracies": 0.75, "rewards/chosen": -0.007828997448086739, "rewards/margins": 0.0021996782161295414, "rewards/rejected": -0.010028674267232418, "step": 4690 }, { "epoch": 0.4755640999696448, "grad_norm": 277705.7000931728, "learning_rate": 2.91376208680009e-07, "logits/chosen": -4.274277687072754, "logits/rejected": -4.393567085266113, "logps/chosen": -82.35774993896484, "logps/rejected": -94.74359130859375, "loss": 111201.8, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006880573928356171, "rewards/margins": 0.0024976099375635386, "rewards/rejected": -0.00937818456441164, "step": 4700 }, { "epoch": 0.47657593848021856, "grad_norm": 381890.2175021402, "learning_rate": 2.9081403193163926e-07, "logits/chosen": -4.228414535522461, "logits/rejected": -4.2030229568481445, "logps/chosen": -98.0104751586914, "logps/rejected": -117.64727783203125, "loss": 124661.1375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007265196181833744, "rewards/margins": 0.001540489960461855, "rewards/rejected": -0.008805685676634312, "step": 4710 }, { "epoch": 0.47758777699079225, "grad_norm": 258341.45643050072, "learning_rate": 2.902518551832696e-07, "logits/chosen": -4.174311637878418, "logits/rejected": -4.14969539642334, "logps/chosen": -75.92037963867188, "logps/rejected": -82.56571960449219, "loss": 104920.325, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008212308399379253, "rewards/margins": 0.0012242270167917013, "rewards/rejected": -0.009436534717679024, "step": 4720 }, { "epoch": 0.478599615501366, "grad_norm": 305138.2395253884, "learning_rate": 2.8968967843489995e-07, "logits/chosen": -4.017868995666504, "logits/rejected": -4.048256874084473, "logps/chosen": -269.4040222167969, "logps/rejected": -320.32135009765625, "loss": 107543.1375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007116069085896015, "rewards/margins": 0.002584780566394329, "rewards/rejected": -0.009700849652290344, "step": 4730 }, { "epoch": 0.4796114540119397, "grad_norm": 117099.19649210635, "learning_rate": 2.891275016865302e-07, "logits/chosen": -4.494725704193115, "logits/rejected": -4.454279899597168, "logps/chosen": -101.83387756347656, "logps/rejected": -117.09184265136719, "loss": 93500.3, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005579223390668631, "rewards/margins": 0.0017388993874192238, "rewards/rejected": -0.007318123243749142, "step": 4740 }, { "epoch": 0.4806232925225134, "grad_norm": 297417.8629004171, "learning_rate": 2.8856532493816054e-07, "logits/chosen": -4.0443267822265625, "logits/rejected": -4.053589344024658, "logps/chosen": -92.92659759521484, "logps/rejected": -154.657958984375, "loss": 113457.0875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010539586655795574, "rewards/margins": 0.0037010586820542812, "rewards/rejected": -0.014240646734833717, "step": 4750 }, { "epoch": 0.48163513103308714, "grad_norm": 194716.78979293417, "learning_rate": 2.880031481897909e-07, "logits/chosen": -4.411336898803711, "logits/rejected": -4.469693660736084, "logps/chosen": -93.66873168945312, "logps/rejected": -130.40444946289062, "loss": 113179.375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006476563401520252, "rewards/margins": 0.00458966288715601, "rewards/rejected": -0.011066226288676262, "step": 4760 }, { "epoch": 0.4826469695436608, "grad_norm": 337542.3227859468, "learning_rate": 2.874409714414212e-07, "logits/chosen": -4.5598063468933105, "logits/rejected": -4.652605056762695, "logps/chosen": -89.70829772949219, "logps/rejected": -106.36637115478516, "loss": 116799.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006857897154986858, "rewards/margins": 0.0017515195067971945, "rewards/rejected": -0.008609415963292122, "step": 4770 }, { "epoch": 0.48365880805423456, "grad_norm": 224488.1050759128, "learning_rate": 2.8687879469305147e-07, "logits/chosen": -4.004267692565918, "logits/rejected": -3.983720064163208, "logps/chosen": -90.36203002929688, "logps/rejected": -109.1877670288086, "loss": 120742.65, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00954481028020382, "rewards/margins": 0.002512627746909857, "rewards/rejected": -0.012057437561452389, "step": 4780 }, { "epoch": 0.48467064656480824, "grad_norm": 312897.96554958395, "learning_rate": 2.863166179446818e-07, "logits/chosen": -4.133662223815918, "logits/rejected": -4.106595516204834, "logps/chosen": -83.65428161621094, "logps/rejected": -139.8418426513672, "loss": 125744.7875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.005760936997830868, "rewards/margins": 0.0040904078632593155, "rewards/rejected": -0.009851345792412758, "step": 4790 }, { "epoch": 0.485682485075382, "grad_norm": 238656.32716654794, "learning_rate": 2.857544411963121e-07, "logits/chosen": -3.89274525642395, "logits/rejected": -4.05690336227417, "logps/chosen": -60.06257247924805, "logps/rejected": -80.16548156738281, "loss": 110789.2875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0068127973936498165, "rewards/margins": 0.0012426013126969337, "rewards/rejected": -0.008055400103330612, "step": 4800 }, { "epoch": 0.48669432358595566, "grad_norm": 132749.87632027746, "learning_rate": 2.851922644479424e-07, "logits/chosen": -4.460705280303955, "logits/rejected": -4.565571308135986, "logps/chosen": -76.18386840820312, "logps/rejected": -83.32876586914062, "loss": 110766.7875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006822848226875067, "rewards/margins": 0.0020886254496872425, "rewards/rejected": -0.00891147367656231, "step": 4810 }, { "epoch": 0.4877061620965294, "grad_norm": 297276.6233920937, "learning_rate": 2.8463008769957274e-07, "logits/chosen": -3.6808338165283203, "logits/rejected": -3.7337615489959717, "logps/chosen": -95.69789123535156, "logps/rejected": -151.53041076660156, "loss": 130258.45, "rewards/accuracies": 0.75, "rewards/chosen": -0.009467652067542076, "rewards/margins": 0.005353307817131281, "rewards/rejected": -0.01482095755636692, "step": 4820 }, { "epoch": 0.48871800060710313, "grad_norm": 394618.2100454029, "learning_rate": 2.8406791095120304e-07, "logits/chosen": -4.359339237213135, "logits/rejected": -4.297983169555664, "logps/chosen": -106.2806625366211, "logps/rejected": -95.74982452392578, "loss": 129132.0625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007890681736171246, "rewards/margins": 0.0008976779063232243, "rewards/rejected": -0.008788359351456165, "step": 4830 }, { "epoch": 0.4897298391176768, "grad_norm": 258901.0433501128, "learning_rate": 2.835057342028334e-07, "logits/chosen": -4.530887126922607, "logits/rejected": -4.4319329261779785, "logps/chosen": -63.360191345214844, "logps/rejected": -74.84185791015625, "loss": 123978.1125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004360563587397337, "rewards/margins": 0.005421533714979887, "rewards/rejected": -0.00978209637105465, "step": 4840 }, { "epoch": 0.49074167762825055, "grad_norm": 361670.9295330625, "learning_rate": 2.8294355745446367e-07, "logits/chosen": -4.190200328826904, "logits/rejected": -4.121565818786621, "logps/chosen": -94.03240966796875, "logps/rejected": -107.52728271484375, "loss": 132240.975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009877270087599754, "rewards/margins": 0.0036943063605576754, "rewards/rejected": -0.013571575284004211, "step": 4850 }, { "epoch": 0.49175351613882423, "grad_norm": 311660.283086222, "learning_rate": 2.8238138070609397e-07, "logits/chosen": -4.181434154510498, "logits/rejected": -4.110726356506348, "logps/chosen": -121.98297119140625, "logps/rejected": -136.4197998046875, "loss": 121412.425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.007023426704108715, "rewards/margins": 0.0041784849017858505, "rewards/rejected": -0.01120191253721714, "step": 4860 }, { "epoch": 0.49276535464939797, "grad_norm": 264350.0208090655, "learning_rate": 2.818192039577243e-07, "logits/chosen": -4.5661211013793945, "logits/rejected": -4.602374076843262, "logps/chosen": -61.58784866333008, "logps/rejected": -81.78787994384766, "loss": 125565.425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005583214107900858, "rewards/margins": 0.0037870672531425953, "rewards/rejected": -0.009370281361043453, "step": 4870 }, { "epoch": 0.49377719315997165, "grad_norm": 400734.01976913307, "learning_rate": 2.812570272093546e-07, "logits/chosen": -4.414447784423828, "logits/rejected": -4.420076847076416, "logps/chosen": -89.91688537597656, "logps/rejected": -119.71720886230469, "loss": 141109.275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00812186487019062, "rewards/margins": 0.0031143995001912117, "rewards/rejected": -0.011236264370381832, "step": 4880 }, { "epoch": 0.4947890316705454, "grad_norm": 278993.74860467983, "learning_rate": 2.806948504609849e-07, "logits/chosen": -3.998802661895752, "logits/rejected": -4.034219264984131, "logps/chosen": -91.57530975341797, "logps/rejected": -103.70735168457031, "loss": 119737.5625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.004821970127522945, "rewards/margins": 0.0033339853398501873, "rewards/rejected": -0.008155955001711845, "step": 4890 }, { "epoch": 0.49580087018111907, "grad_norm": 205737.58064084747, "learning_rate": 2.8013267371261524e-07, "logits/chosen": -4.2230916023254395, "logits/rejected": -4.242398738861084, "logps/chosen": -104.7779312133789, "logps/rejected": -120.63533782958984, "loss": 94790.675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007361465599387884, "rewards/margins": 0.0030456124804913998, "rewards/rejected": -0.010407078079879284, "step": 4900 }, { "epoch": 0.4968127086916928, "grad_norm": 174237.5160646385, "learning_rate": 2.795704969642456e-07, "logits/chosen": -3.75708270072937, "logits/rejected": -3.6641411781311035, "logps/chosen": -70.85429382324219, "logps/rejected": -87.84095764160156, "loss": 113032.7, "rewards/accuracies": 0.5, "rewards/chosen": -0.007130843587219715, "rewards/margins": 0.0024430896155536175, "rewards/rejected": -0.00957393180578947, "step": 4910 }, { "epoch": 0.49782454720226654, "grad_norm": 305738.7358099986, "learning_rate": 2.790083202158758e-07, "logits/chosen": -3.9310760498046875, "logits/rejected": -3.878685474395752, "logps/chosen": -274.2588195800781, "logps/rejected": -295.86004638671875, "loss": 127686.5125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.00838838703930378, "rewards/margins": 0.0056747146882116795, "rewards/rejected": -0.014063102193176746, "step": 4920 }, { "epoch": 0.4988363857128402, "grad_norm": 545502.6615286177, "learning_rate": 2.7844614346750617e-07, "logits/chosen": -4.026177883148193, "logits/rejected": -3.963606357574463, "logps/chosen": -104.69754791259766, "logps/rejected": -108.67735290527344, "loss": 122964.2125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007081848569214344, "rewards/margins": 0.004930655937641859, "rewards/rejected": -0.012012503109872341, "step": 4930 }, { "epoch": 0.49984822422341396, "grad_norm": 237266.41841483332, "learning_rate": 2.778839667191365e-07, "logits/chosen": -4.124194145202637, "logits/rejected": -4.194062232971191, "logps/chosen": -95.9461441040039, "logps/rejected": -153.78396606445312, "loss": 106179.4375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008314231410622597, "rewards/margins": 0.0019223600393161178, "rewards/rejected": -0.010236592032015324, "step": 4940 }, { "epoch": 0.5008600627339876, "grad_norm": 261192.95716094473, "learning_rate": 2.7732178997076675e-07, "logits/chosen": -4.375563621520996, "logits/rejected": -4.367779731750488, "logps/chosen": -61.20611572265625, "logps/rejected": -83.81687927246094, "loss": 108340.5125, "rewards/accuracies": 0.5, "rewards/chosen": -0.007492915727198124, "rewards/margins": 0.0017027573194354773, "rewards/rejected": -0.00919567234814167, "step": 4950 }, { "epoch": 0.5018719012445614, "grad_norm": 331226.39806391887, "learning_rate": 2.767596132223971e-07, "logits/chosen": -3.9510066509246826, "logits/rejected": -3.9064621925354004, "logps/chosen": -81.71531677246094, "logps/rejected": -125.4736557006836, "loss": 126388.0625, "rewards/accuracies": 0.5, "rewards/chosen": -0.006122888997197151, "rewards/margins": 0.0036517370026558638, "rewards/rejected": -0.009774626232683659, "step": 4960 }, { "epoch": 0.5028837397551351, "grad_norm": 484434.5730087583, "learning_rate": 2.7619743647402745e-07, "logits/chosen": -4.074257850646973, "logits/rejected": -4.10714054107666, "logps/chosen": -137.38412475585938, "logps/rejected": -126.72269439697266, "loss": 120503.475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009970604442059994, "rewards/margins": 0.001072772080078721, "rewards/rejected": -0.011043376289308071, "step": 4970 }, { "epoch": 0.5038955782657087, "grad_norm": 329107.7365956124, "learning_rate": 2.756352597256577e-07, "logits/chosen": -3.856200695037842, "logits/rejected": -3.9614853858947754, "logps/chosen": -80.73448181152344, "logps/rejected": -101.33137512207031, "loss": 100766.8625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008029035292565823, "rewards/margins": 0.002583557041361928, "rewards/rejected": -0.010612592101097107, "step": 4980 }, { "epoch": 0.5049074167762825, "grad_norm": 339490.26307711174, "learning_rate": 2.7507308297728803e-07, "logits/chosen": -4.27529239654541, "logits/rejected": -4.069708824157715, "logps/chosen": -104.0208511352539, "logps/rejected": -149.94200134277344, "loss": 128726.4, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008640700951218605, "rewards/margins": 0.003411628305912018, "rewards/rejected": -0.012052329257130623, "step": 4990 }, { "epoch": 0.5059192552868562, "grad_norm": 295171.0070865726, "learning_rate": 2.745109062289184e-07, "logits/chosen": -4.116396903991699, "logits/rejected": -3.9798121452331543, "logps/chosen": -50.290077209472656, "logps/rejected": -75.16651916503906, "loss": 112278.475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006583778653293848, "rewards/margins": 0.003961200825870037, "rewards/rejected": -0.010544979944825172, "step": 5000 }, { "epoch": 0.50693109379743, "grad_norm": 113484.7630268265, "learning_rate": 2.7394872948054867e-07, "logits/chosen": -4.074153900146484, "logits/rejected": -3.9715964794158936, "logps/chosen": -65.23469543457031, "logps/rejected": -84.65730285644531, "loss": 94088.0562, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006654699798673391, "rewards/margins": 0.0019290198106318712, "rewards/rejected": -0.008583719842135906, "step": 5010 }, { "epoch": 0.5079429323080037, "grad_norm": 169303.92803639147, "learning_rate": 2.7338655273217896e-07, "logits/chosen": -4.188525199890137, "logits/rejected": -4.176980018615723, "logps/chosen": -68.05644989013672, "logps/rejected": -101.90254211425781, "loss": 95139.0562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00497778132557869, "rewards/margins": 0.006547040306031704, "rewards/rejected": -0.011524821631610394, "step": 5020 }, { "epoch": 0.5089547708185773, "grad_norm": 242841.32319162186, "learning_rate": 2.728243759838093e-07, "logits/chosen": -4.441540718078613, "logits/rejected": -4.48829460144043, "logps/chosen": -108.1966781616211, "logps/rejected": -106.0685806274414, "loss": 127459.15, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006846254225820303, "rewards/margins": 0.0010536487679928541, "rewards/rejected": -0.007899902760982513, "step": 5030 }, { "epoch": 0.509966609329151, "grad_norm": 211612.80589043905, "learning_rate": 2.722621992354396e-07, "logits/chosen": -4.263078212738037, "logits/rejected": -4.141936302185059, "logps/chosen": -73.14064025878906, "logps/rejected": -95.51811218261719, "loss": 122213.725, "rewards/accuracies": 0.75, "rewards/chosen": -0.0074457041919231415, "rewards/margins": 0.003499868791550398, "rewards/rejected": -0.010945572517812252, "step": 5040 }, { "epoch": 0.5109784478397248, "grad_norm": 361980.8601642229, "learning_rate": 2.717000224870699e-07, "logits/chosen": -3.9980785846710205, "logits/rejected": -4.073044776916504, "logps/chosen": -118.4569091796875, "logps/rejected": -112.1273422241211, "loss": 130192.55, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007406730204820633, "rewards/margins": 0.002123942133039236, "rewards/rejected": -0.009530672803521156, "step": 5050 }, { "epoch": 0.5119902863502985, "grad_norm": 244749.58919184856, "learning_rate": 2.7113784573870024e-07, "logits/chosen": -4.040448188781738, "logits/rejected": -4.085093021392822, "logps/chosen": -74.1008071899414, "logps/rejected": -77.01417541503906, "loss": 112381.175, "rewards/accuracies": 0.5, "rewards/chosen": -0.0068960548378527164, "rewards/margins": 0.0017045937711372972, "rewards/rejected": -0.008600648492574692, "step": 5060 }, { "epoch": 0.5130021248608722, "grad_norm": 264564.61459631554, "learning_rate": 2.705756689903306e-07, "logits/chosen": -3.7567131519317627, "logits/rejected": -3.7703781127929688, "logps/chosen": -63.53290939331055, "logps/rejected": -65.05554962158203, "loss": 110284.475, "rewards/accuracies": 0.5, "rewards/chosen": -0.0036330472212284803, "rewards/margins": 0.001146214664913714, "rewards/rejected": -0.004779261536896229, "step": 5070 }, { "epoch": 0.5140139633714459, "grad_norm": 391861.49889519735, "learning_rate": 2.7001349224196087e-07, "logits/chosen": -3.8405232429504395, "logits/rejected": -3.9802756309509277, "logps/chosen": -109.93714904785156, "logps/rejected": -122.34474182128906, "loss": 115019.675, "rewards/accuracies": 0.75, "rewards/chosen": -0.006043580826371908, "rewards/margins": 0.003602838609367609, "rewards/rejected": -0.009646420367062092, "step": 5080 }, { "epoch": 0.5150258018820196, "grad_norm": 278005.54741099925, "learning_rate": 2.6945131549359116e-07, "logits/chosen": -4.184579372406006, "logits/rejected": -4.158846378326416, "logps/chosen": -89.86322784423828, "logps/rejected": -91.66035461425781, "loss": 103646.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007413449697196484, "rewards/margins": 0.001681817346252501, "rewards/rejected": -0.009095266461372375, "step": 5090 }, { "epoch": 0.5160376403925934, "grad_norm": 392426.89934988494, "learning_rate": 2.688891387452215e-07, "logits/chosen": -3.8563473224639893, "logits/rejected": -3.8541202545166016, "logps/chosen": -87.18865966796875, "logps/rejected": -117.5961685180664, "loss": 117943.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007904052734375, "rewards/margins": 0.002460057847201824, "rewards/rejected": -0.01036410965025425, "step": 5100 }, { "epoch": 0.5170494789031671, "grad_norm": 212037.5272049691, "learning_rate": 2.683269619968518e-07, "logits/chosen": -3.8904404640197754, "logits/rejected": -3.828655958175659, "logps/chosen": -97.83990478515625, "logps/rejected": -76.68882751464844, "loss": 119723.3, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0074574993923306465, "rewards/margins": 0.0011112098582088947, "rewards/rejected": -0.008568709716200829, "step": 5110 }, { "epoch": 0.5180613174137407, "grad_norm": 237559.51971919803, "learning_rate": 2.677647852484821e-07, "logits/chosen": -4.247109413146973, "logits/rejected": -4.089200019836426, "logps/chosen": -73.30498504638672, "logps/rejected": -117.19602966308594, "loss": 119285.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0034692517947405577, "rewards/margins": 0.004842083435505629, "rewards/rejected": -0.008311335928738117, "step": 5120 }, { "epoch": 0.5190731559243145, "grad_norm": 331046.7372870099, "learning_rate": 2.6720260850011244e-07, "logits/chosen": -4.180168151855469, "logits/rejected": -4.1768975257873535, "logps/chosen": -91.44224548339844, "logps/rejected": -92.85357666015625, "loss": 125965.7625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009614823386073112, "rewards/margins": 0.0019234981155022979, "rewards/rejected": -0.011538321152329445, "step": 5130 }, { "epoch": 0.5200849944348882, "grad_norm": 210157.7125037683, "learning_rate": 2.6664043175174273e-07, "logits/chosen": -3.8141703605651855, "logits/rejected": -3.7960948944091797, "logps/chosen": -119.96993255615234, "logps/rejected": -130.73106384277344, "loss": 108654.0625, "rewards/accuracies": 0.75, "rewards/chosen": -0.008954890072345734, "rewards/margins": 0.002788939280435443, "rewards/rejected": -0.011743830516934395, "step": 5140 }, { "epoch": 0.5210968329454619, "grad_norm": 58812.522333006265, "learning_rate": 2.660782550033731e-07, "logits/chosen": -4.1924848556518555, "logits/rejected": -4.3341898918151855, "logps/chosen": -75.97572326660156, "logps/rejected": -72.91433715820312, "loss": 105243.4625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0073233419097959995, "rewards/margins": 0.0023446395061910152, "rewards/rejected": -0.009667981415987015, "step": 5150 }, { "epoch": 0.5221086714560356, "grad_norm": 446780.6893030064, "learning_rate": 2.6551607825500337e-07, "logits/chosen": -4.191488742828369, "logits/rejected": -4.228306770324707, "logps/chosen": -79.53953552246094, "logps/rejected": -111.50419616699219, "loss": 111907.1375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007944758050143719, "rewards/margins": 0.002362077357247472, "rewards/rejected": -0.010306836105883121, "step": 5160 }, { "epoch": 0.5231205099666093, "grad_norm": 236866.72905854022, "learning_rate": 2.6495390150663366e-07, "logits/chosen": -3.982475996017456, "logits/rejected": -3.8858158588409424, "logps/chosen": -52.3195686340332, "logps/rejected": -100.17192077636719, "loss": 126035.7375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.004834076389670372, "rewards/margins": 0.004991631023585796, "rewards/rejected": -0.009825708344578743, "step": 5170 }, { "epoch": 0.524132348477183, "grad_norm": 155145.67179410713, "learning_rate": 2.64391724758264e-07, "logits/chosen": -4.110353946685791, "logits/rejected": -3.9287261962890625, "logps/chosen": -111.73831939697266, "logps/rejected": -132.41204833984375, "loss": 126327.1, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007474998943507671, "rewards/margins": 0.005334409419447184, "rewards/rejected": -0.012809407897293568, "step": 5180 }, { "epoch": 0.5251441869877568, "grad_norm": 398625.9263971245, "learning_rate": 2.638295480098943e-07, "logits/chosen": -3.7623813152313232, "logits/rejected": -3.71608304977417, "logps/chosen": -76.16471862792969, "logps/rejected": -104.79795837402344, "loss": 112462.9875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008175244554877281, "rewards/margins": 0.003809334011748433, "rewards/rejected": -0.011984577402472496, "step": 5190 }, { "epoch": 0.5261560254983305, "grad_norm": 364279.59660545614, "learning_rate": 2.632673712615246e-07, "logits/chosen": -4.298056602478027, "logits/rejected": -4.143904209136963, "logps/chosen": -94.27371215820312, "logps/rejected": -100.45915222167969, "loss": 133492.025, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007951593026518822, "rewards/margins": 0.0008294832659885287, "rewards/rejected": -0.008781076408922672, "step": 5200 }, { "epoch": 0.5271678640089041, "grad_norm": 352180.593873833, "learning_rate": 2.6270519451315494e-07, "logits/chosen": -4.100150108337402, "logits/rejected": -4.0712480545043945, "logps/chosen": -78.02449798583984, "logps/rejected": -85.26112365722656, "loss": 131008.325, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00985637865960598, "rewards/margins": 0.0020854040049016476, "rewards/rejected": -0.01194178406149149, "step": 5210 }, { "epoch": 0.5281797025194779, "grad_norm": 258828.31693164937, "learning_rate": 2.621430177647853e-07, "logits/chosen": -3.8315253257751465, "logits/rejected": -3.9296393394470215, "logps/chosen": -86.36878967285156, "logps/rejected": -109.4948501586914, "loss": 124394.875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00754228699952364, "rewards/margins": 0.003064835909754038, "rewards/rejected": -0.01060712244361639, "step": 5220 }, { "epoch": 0.5291915410300516, "grad_norm": 399980.99453476636, "learning_rate": 2.615808410164155e-07, "logits/chosen": -4.373072624206543, "logits/rejected": -4.290961265563965, "logps/chosen": -136.12979125976562, "logps/rejected": -125.2859115600586, "loss": 117905.2125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009183937683701515, "rewards/margins": 0.003464330453425646, "rewards/rejected": -0.012648266740143299, "step": 5230 }, { "epoch": 0.5302033795406254, "grad_norm": 245783.81961919987, "learning_rate": 2.6101866426804587e-07, "logits/chosen": -3.9085469245910645, "logits/rejected": -4.002504825592041, "logps/chosen": -54.28902053833008, "logps/rejected": -92.11211395263672, "loss": 120734.35, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006878932472318411, "rewards/margins": 0.006250493228435516, "rewards/rejected": -0.013129426166415215, "step": 5240 }, { "epoch": 0.531215218051199, "grad_norm": 328123.98744703975, "learning_rate": 2.604564875196762e-07, "logits/chosen": -4.3388285636901855, "logits/rejected": -4.558104991912842, "logps/chosen": -117.4281997680664, "logps/rejected": -131.99386596679688, "loss": 108575.2625, "rewards/accuracies": 0.75, "rewards/chosen": -0.010996361263096333, "rewards/margins": 0.0032045994885265827, "rewards/rejected": -0.014200960285961628, "step": 5250 }, { "epoch": 0.5322270565617727, "grad_norm": 222314.42875476604, "learning_rate": 2.5989431077130645e-07, "logits/chosen": -4.081136226654053, "logits/rejected": -4.202538967132568, "logps/chosen": -94.23622131347656, "logps/rejected": -125.14749908447266, "loss": 112610.6625, "rewards/accuracies": 0.5, "rewards/chosen": -0.008513567969202995, "rewards/margins": 0.003262485843151808, "rewards/rejected": -0.011776053346693516, "step": 5260 }, { "epoch": 0.5332388950723465, "grad_norm": 181561.2292998781, "learning_rate": 2.593321340229368e-07, "logits/chosen": -4.503609657287598, "logits/rejected": -4.426753997802734, "logps/chosen": -73.81087493896484, "logps/rejected": -96.4300308227539, "loss": 107749.675, "rewards/accuracies": 0.5, "rewards/chosen": -0.008094998076558113, "rewards/margins": 0.001958846813067794, "rewards/rejected": -0.01005384512245655, "step": 5270 }, { "epoch": 0.5342507335829202, "grad_norm": 620902.3997630735, "learning_rate": 2.5876995727456714e-07, "logits/chosen": -4.261660575866699, "logits/rejected": -4.132785320281982, "logps/chosen": -143.64715576171875, "logps/rejected": -142.40579223632812, "loss": 130164.1875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01273087877780199, "rewards/margins": 0.0033220991026610136, "rewards/rejected": -0.016052978113293648, "step": 5280 }, { "epoch": 0.5352625720934939, "grad_norm": 354998.7492287407, "learning_rate": 2.582077805261974e-07, "logits/chosen": -4.436038970947266, "logits/rejected": -4.344414710998535, "logps/chosen": -82.71414184570312, "logps/rejected": -122.99937438964844, "loss": 120320.45, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00982988253235817, "rewards/margins": 0.00276002730242908, "rewards/rejected": -0.012589909136295319, "step": 5290 }, { "epoch": 0.5362744106040676, "grad_norm": 364562.3371791606, "learning_rate": 2.5764560377782773e-07, "logits/chosen": -4.410015106201172, "logits/rejected": -4.465394496917725, "logps/chosen": -60.1081428527832, "logps/rejected": -104.0895004272461, "loss": 94272.2, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00811795238405466, "rewards/margins": 0.004220784176141024, "rewards/rejected": -0.012338736094534397, "step": 5300 }, { "epoch": 0.5372862491146413, "grad_norm": 321880.89190960646, "learning_rate": 2.5708342702945807e-07, "logits/chosen": -3.878108263015747, "logits/rejected": -3.8679862022399902, "logps/chosen": -107.7907485961914, "logps/rejected": -108.47222900390625, "loss": 131258.7, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009610423818230629, "rewards/margins": 0.002577531151473522, "rewards/rejected": -0.012187954969704151, "step": 5310 }, { "epoch": 0.538298087625215, "grad_norm": 281191.2972934178, "learning_rate": 2.5652125028108836e-07, "logits/chosen": -3.5568511486053467, "logits/rejected": -3.7452781200408936, "logps/chosen": -96.04780578613281, "logps/rejected": -139.54991149902344, "loss": 130520.8375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.008677994832396507, "rewards/margins": 0.004220165777951479, "rewards/rejected": -0.012898160144686699, "step": 5320 }, { "epoch": 0.5393099261357888, "grad_norm": 273940.2106886484, "learning_rate": 2.5595907353271866e-07, "logits/chosen": -3.884448528289795, "logits/rejected": -3.967571258544922, "logps/chosen": -59.58466720581055, "logps/rejected": -88.45026397705078, "loss": 114433.025, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010107407346367836, "rewards/margins": 0.003323100507259369, "rewards/rejected": -0.013430507853627205, "step": 5330 }, { "epoch": 0.5403217646463624, "grad_norm": 253894.9286220954, "learning_rate": 2.55396896784349e-07, "logits/chosen": -4.262056827545166, "logits/rejected": -4.177132606506348, "logps/chosen": -75.8594970703125, "logps/rejected": -99.4611587524414, "loss": 117965.4875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00911188218742609, "rewards/margins": 0.005331072490662336, "rewards/rejected": -0.014442955143749714, "step": 5340 }, { "epoch": 0.5413336031569361, "grad_norm": 267572.650149877, "learning_rate": 2.548347200359793e-07, "logits/chosen": -4.149306297302246, "logits/rejected": -4.2056989669799805, "logps/chosen": -66.65931701660156, "logps/rejected": -98.30459594726562, "loss": 88228.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006510412786155939, "rewards/margins": 0.0041292523965239525, "rewards/rejected": -0.010639665648341179, "step": 5350 }, { "epoch": 0.5423454416675099, "grad_norm": 249565.6730536703, "learning_rate": 2.542725432876096e-07, "logits/chosen": -3.938894748687744, "logits/rejected": -3.874540328979492, "logps/chosen": -86.12214660644531, "logps/rejected": -107.40665435791016, "loss": 124642.8, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011189760640263557, "rewards/margins": 0.0010990374721586704, "rewards/rejected": -0.012288798578083515, "step": 5360 }, { "epoch": 0.5433572801780836, "grad_norm": 297456.1681239155, "learning_rate": 2.5371036653923993e-07, "logits/chosen": -3.8256237506866455, "logits/rejected": -3.801128387451172, "logps/chosen": -87.6981201171875, "logps/rejected": -107.6256332397461, "loss": 121992.375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009207581169903278, "rewards/margins": 0.004198581911623478, "rewards/rejected": -0.013406163081526756, "step": 5370 }, { "epoch": 0.5443691186886573, "grad_norm": 273131.63823022245, "learning_rate": 2.531481897908702e-07, "logits/chosen": -4.7786173820495605, "logits/rejected": -4.653148651123047, "logps/chosen": -75.71565246582031, "logps/rejected": -133.7788543701172, "loss": 101391.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008491739630699158, "rewards/margins": 0.007337156683206558, "rewards/rejected": -0.015828896313905716, "step": 5380 }, { "epoch": 0.545380957199231, "grad_norm": 282569.91992967715, "learning_rate": 2.5258601304250057e-07, "logits/chosen": -4.4293084144592285, "logits/rejected": -4.356001853942871, "logps/chosen": -90.23554992675781, "logps/rejected": -108.9134750366211, "loss": 114607.45, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009942044503986835, "rewards/margins": 0.0001631057821214199, "rewards/rejected": -0.010105150751769543, "step": 5390 }, { "epoch": 0.5463927957098047, "grad_norm": 274878.16977920017, "learning_rate": 2.5202383629413086e-07, "logits/chosen": -4.092188835144043, "logits/rejected": -4.002153396606445, "logps/chosen": -100.65376281738281, "logps/rejected": -139.36962890625, "loss": 114460.075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010551448911428452, "rewards/margins": 0.0026123682036995888, "rewards/rejected": -0.01316381711512804, "step": 5400 }, { "epoch": 0.5474046342203784, "grad_norm": 386490.55946369603, "learning_rate": 2.5146165954576115e-07, "logits/chosen": -3.922823429107666, "logits/rejected": -3.848965883255005, "logps/chosen": -74.0289535522461, "logps/rejected": -96.87993621826172, "loss": 102835.25, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006094923242926598, "rewards/margins": 0.006763486657291651, "rewards/rejected": -0.012858408503234386, "step": 5410 }, { "epoch": 0.5484164727309522, "grad_norm": 320209.28709462646, "learning_rate": 2.508994827973915e-07, "logits/chosen": -4.133163928985596, "logits/rejected": -3.8821868896484375, "logps/chosen": -46.97970962524414, "logps/rejected": -65.877197265625, "loss": 113045.025, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006418194621801376, "rewards/margins": 0.0030305604450404644, "rewards/rejected": -0.009448755532503128, "step": 5420 }, { "epoch": 0.5494283112415258, "grad_norm": 350845.37050761445, "learning_rate": 2.503373060490218e-07, "logits/chosen": -4.130343437194824, "logits/rejected": -3.9273734092712402, "logps/chosen": -87.68330383300781, "logps/rejected": -225.2353973388672, "loss": 108236.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007707727607339621, "rewards/margins": 0.0020561886485666037, "rewards/rejected": -0.00976391602307558, "step": 5430 }, { "epoch": 0.5504401497520995, "grad_norm": 416924.27439170907, "learning_rate": 2.4977512930065214e-07, "logits/chosen": -3.6336617469787598, "logits/rejected": -3.6493523120880127, "logps/chosen": -134.40682983398438, "logps/rejected": -126.9375228881836, "loss": 93939.9312, "rewards/accuracies": 0.5, "rewards/chosen": -0.005761849693953991, "rewards/margins": 0.0020231115631759167, "rewards/rejected": -0.007784961257129908, "step": 5440 }, { "epoch": 0.5514519882626733, "grad_norm": 294832.576171811, "learning_rate": 2.4921295255228243e-07, "logits/chosen": -4.553071022033691, "logits/rejected": -4.395932197570801, "logps/chosen": -102.45066833496094, "logps/rejected": -91.63182067871094, "loss": 112865.0, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0118449991568923, "rewards/margins": 0.0012813499197363853, "rewards/rejected": -0.013126349076628685, "step": 5450 }, { "epoch": 0.552463826773247, "grad_norm": 395774.78770348465, "learning_rate": 2.486507758039128e-07, "logits/chosen": -4.121151924133301, "logits/rejected": -4.082960605621338, "logps/chosen": -95.35685729980469, "logps/rejected": -101.2879867553711, "loss": 115200.275, "rewards/accuracies": 0.5, "rewards/chosen": -0.007455932907760143, "rewards/margins": 0.0034088853280991316, "rewards/rejected": -0.010864818468689919, "step": 5460 }, { "epoch": 0.5534756652838208, "grad_norm": 258249.58198507628, "learning_rate": 2.4808859905554307e-07, "logits/chosen": -3.747859239578247, "logits/rejected": -3.7273247241973877, "logps/chosen": -144.87503051757812, "logps/rejected": -122.61796569824219, "loss": 123680.8375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010016398504376411, "rewards/margins": 0.003029522020369768, "rewards/rejected": -0.013045919127762318, "step": 5470 }, { "epoch": 0.5544875037943944, "grad_norm": 203224.72654115423, "learning_rate": 2.4752642230717336e-07, "logits/chosen": -4.016385078430176, "logits/rejected": -3.948450803756714, "logps/chosen": -132.71041870117188, "logps/rejected": -114.1968994140625, "loss": 131533.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.00840016733855009, "rewards/margins": 0.000582480279263109, "rewards/rejected": -0.008982648141682148, "step": 5480 }, { "epoch": 0.5554993423049681, "grad_norm": 229724.47603345948, "learning_rate": 2.469642455588037e-07, "logits/chosen": -3.9551353454589844, "logits/rejected": -3.9867606163024902, "logps/chosen": -95.61125946044922, "logps/rejected": -87.11729431152344, "loss": 114588.6375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008474880829453468, "rewards/margins": 0.0016644919523969293, "rewards/rejected": -0.010139373131096363, "step": 5490 }, { "epoch": 0.5565111808155419, "grad_norm": 217176.57361417348, "learning_rate": 2.46402068810434e-07, "logits/chosen": -3.9144225120544434, "logits/rejected": -4.092955589294434, "logps/chosen": -93.45954895019531, "logps/rejected": -61.95598220825195, "loss": 121151.6125, "rewards/accuracies": 0.5, "rewards/chosen": -0.005332403350621462, "rewards/margins": 0.000491451530251652, "rewards/rejected": -0.005823854357004166, "step": 5500 }, { "epoch": 0.5575230193261156, "grad_norm": 213709.33577749162, "learning_rate": 2.458398920620643e-07, "logits/chosen": -4.149624824523926, "logits/rejected": -4.074247360229492, "logps/chosen": -91.315673828125, "logps/rejected": -88.74171447753906, "loss": 106417.3625, "rewards/accuracies": 0.5, "rewards/chosen": -0.005969343241304159, "rewards/margins": 0.002569273579865694, "rewards/rejected": -0.008538616821169853, "step": 5510 }, { "epoch": 0.5585348578366892, "grad_norm": 544353.8663502363, "learning_rate": 2.4527771531369463e-07, "logits/chosen": -4.224281311035156, "logits/rejected": -4.279822826385498, "logps/chosen": -108.61106872558594, "logps/rejected": -101.37373352050781, "loss": 118057.7, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010534971952438354, "rewards/margins": -0.0003659687645267695, "rewards/rejected": -0.01016900222748518, "step": 5520 }, { "epoch": 0.559546696347263, "grad_norm": 294744.57209866605, "learning_rate": 2.4471553856532493e-07, "logits/chosen": -4.125882625579834, "logits/rejected": -4.112275123596191, "logps/chosen": -72.44903564453125, "logps/rejected": -104.79264068603516, "loss": 124456.625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0038285627961158752, "rewards/margins": 0.006633625831454992, "rewards/rejected": -0.010462189093232155, "step": 5530 }, { "epoch": 0.5605585348578367, "grad_norm": 252409.88269531398, "learning_rate": 2.441533618169552e-07, "logits/chosen": -4.044971942901611, "logits/rejected": -3.9772956371307373, "logps/chosen": -136.12274169921875, "logps/rejected": -310.58599853515625, "loss": 116814.0625, "rewards/accuracies": 0.5, "rewards/chosen": -0.00954408384859562, "rewards/margins": 0.0044512697495520115, "rewards/rejected": -0.013995354063808918, "step": 5540 }, { "epoch": 0.5615703733684104, "grad_norm": 183783.23054308898, "learning_rate": 2.4359118506858556e-07, "logits/chosen": -4.093567371368408, "logits/rejected": -4.11002779006958, "logps/chosen": -108.62078857421875, "logps/rejected": -109.40504455566406, "loss": 115941.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.005912060849368572, "rewards/margins": 0.0057022892870008945, "rewards/rejected": -0.011614350602030754, "step": 5550 }, { "epoch": 0.5625822118789842, "grad_norm": 302136.2688631389, "learning_rate": 2.4302900832021586e-07, "logits/chosen": -4.019949913024902, "logits/rejected": -3.8241138458251953, "logps/chosen": -97.03929138183594, "logps/rejected": -107.4571762084961, "loss": 104977.725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005357510410249233, "rewards/margins": 0.0005217852885834873, "rewards/rejected": -0.0058792969211936, "step": 5560 }, { "epoch": 0.5635940503895578, "grad_norm": 157224.7978110753, "learning_rate": 2.4246683157184615e-07, "logits/chosen": -4.053389072418213, "logits/rejected": -3.8803329467773438, "logps/chosen": -107.1116714477539, "logps/rejected": -110.8895034790039, "loss": 125430.55, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008979353122413158, "rewards/margins": 0.003154970705509186, "rewards/rejected": -0.012134323827922344, "step": 5570 }, { "epoch": 0.5646058889001315, "grad_norm": 345543.22999113007, "learning_rate": 2.419046548234765e-07, "logits/chosen": -4.136568546295166, "logits/rejected": -4.300292015075684, "logps/chosen": -87.42945861816406, "logps/rejected": -124.48689270019531, "loss": 125014.075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.007067511323839426, "rewards/margins": 0.00657079741358757, "rewards/rejected": -0.013638308271765709, "step": 5580 }, { "epoch": 0.5656177274107053, "grad_norm": 160065.4126662616, "learning_rate": 2.413424780751068e-07, "logits/chosen": -3.9591965675354004, "logits/rejected": -4.122430324554443, "logps/chosen": -109.79052734375, "logps/rejected": -119.658935546875, "loss": 130998.6125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011490700766444206, "rewards/margins": 0.00242353486828506, "rewards/rejected": -0.013914234936237335, "step": 5590 }, { "epoch": 0.566629565921279, "grad_norm": 361627.2858399711, "learning_rate": 2.4078030132673713e-07, "logits/chosen": -4.189683437347412, "logits/rejected": -4.281468868255615, "logps/chosen": -60.49809646606445, "logps/rejected": -67.14775085449219, "loss": 122922.9, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005816822871565819, "rewards/margins": 0.0022928037215024233, "rewards/rejected": -0.008109627291560173, "step": 5600 }, { "epoch": 0.5676414044318526, "grad_norm": 81920.64797398789, "learning_rate": 2.402181245783674e-07, "logits/chosen": -4.479868412017822, "logits/rejected": -4.617487907409668, "logps/chosen": -114.81038665771484, "logps/rejected": -105.3761215209961, "loss": 102455.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006381137762218714, "rewards/margins": 0.004608192481100559, "rewards/rejected": -0.010989329777657986, "step": 5610 }, { "epoch": 0.5686532429424264, "grad_norm": 350827.71806548064, "learning_rate": 2.396559478299977e-07, "logits/chosen": -4.185822010040283, "logits/rejected": -4.193787574768066, "logps/chosen": -79.85542297363281, "logps/rejected": -97.21898651123047, "loss": 124914.2125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009377049282193184, "rewards/margins": 0.002430147025734186, "rewards/rejected": -0.011807197704911232, "step": 5620 }, { "epoch": 0.5696650814530001, "grad_norm": 251282.7467794302, "learning_rate": 2.3909377108162806e-07, "logits/chosen": -4.047341823577881, "logits/rejected": -4.002614498138428, "logps/chosen": -67.80310821533203, "logps/rejected": -72.70999908447266, "loss": 121569.8625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009702622890472412, "rewards/margins": 0.0012007481418550014, "rewards/rejected": -0.010903370566666126, "step": 5630 }, { "epoch": 0.5706769199635738, "grad_norm": 278228.8961326667, "learning_rate": 2.3853159433325835e-07, "logits/chosen": -4.008553981781006, "logits/rejected": -3.9895052909851074, "logps/chosen": -51.908935546875, "logps/rejected": -88.51464080810547, "loss": 117073.7125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00520454254001379, "rewards/margins": 0.0035531967878341675, "rewards/rejected": -0.008757739327847958, "step": 5640 }, { "epoch": 0.5716887584741476, "grad_norm": 468585.228346289, "learning_rate": 2.3796941758488867e-07, "logits/chosen": -3.333169460296631, "logits/rejected": -3.3044369220733643, "logps/chosen": -118.9327621459961, "logps/rejected": -133.22177124023438, "loss": 109268.7375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007767961826175451, "rewards/margins": 0.002808037679642439, "rewards/rejected": -0.010576000437140465, "step": 5650 }, { "epoch": 0.5727005969847212, "grad_norm": 239489.24155644348, "learning_rate": 2.37407240836519e-07, "logits/chosen": -4.119927883148193, "logits/rejected": -4.10949182510376, "logps/chosen": -72.02600860595703, "logps/rejected": -90.81416320800781, "loss": 104136.9438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007135299034416676, "rewards/margins": 0.00318335322663188, "rewards/rejected": -0.010318652726709843, "step": 5660 }, { "epoch": 0.5737124354952949, "grad_norm": 312297.48975060915, "learning_rate": 2.368450640881493e-07, "logits/chosen": -4.096045017242432, "logits/rejected": -4.095069408416748, "logps/chosen": -97.95755767822266, "logps/rejected": -94.86279296875, "loss": 110553.0375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007351423613727093, "rewards/margins": 0.0008188943611457944, "rewards/rejected": -0.00817031878978014, "step": 5670 }, { "epoch": 0.5747242740058687, "grad_norm": 265711.8447436605, "learning_rate": 2.362828873397796e-07, "logits/chosen": -4.424670696258545, "logits/rejected": -4.434237480163574, "logps/chosen": -57.339332580566406, "logps/rejected": -74.572021484375, "loss": 119712.625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0061994316056370735, "rewards/margins": 0.0035680632572621107, "rewards/rejected": -0.009767494164407253, "step": 5680 }, { "epoch": 0.5757361125164424, "grad_norm": 316707.9936976243, "learning_rate": 2.3572071059140995e-07, "logits/chosen": -3.7532482147216797, "logits/rejected": -3.7162442207336426, "logps/chosen": -62.77406692504883, "logps/rejected": -85.26363372802734, "loss": 120492.825, "rewards/accuracies": 0.5, "rewards/chosen": -0.007885348051786423, "rewards/margins": 0.0025506089441478252, "rewards/rejected": -0.01043595653027296, "step": 5690 }, { "epoch": 0.576747951027016, "grad_norm": 426123.4617378135, "learning_rate": 2.3515853384304024e-07, "logits/chosen": -4.007862091064453, "logits/rejected": -4.010933876037598, "logps/chosen": -92.65364837646484, "logps/rejected": -91.99781799316406, "loss": 133307.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00486310850828886, "rewards/margins": 0.0037961334455758333, "rewards/rejected": -0.008659242652356625, "step": 5700 }, { "epoch": 0.5777597895375898, "grad_norm": 261836.5320811089, "learning_rate": 2.3459635709467053e-07, "logits/chosen": -4.276823997497559, "logits/rejected": -4.24368953704834, "logps/chosen": -77.1429443359375, "logps/rejected": -98.56842041015625, "loss": 121200.05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006231953855603933, "rewards/margins": 0.007847290486097336, "rewards/rejected": -0.01407924760133028, "step": 5710 }, { "epoch": 0.5787716280481635, "grad_norm": 245919.34983694163, "learning_rate": 2.3403418034630088e-07, "logits/chosen": -4.126413822174072, "logits/rejected": -4.068448066711426, "logps/chosen": -58.37605667114258, "logps/rejected": -81.24440002441406, "loss": 107316.8875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006581499241292477, "rewards/margins": 0.0020024501718580723, "rewards/rejected": -0.008583949878811836, "step": 5720 }, { "epoch": 0.5797834665587372, "grad_norm": 359010.09959526267, "learning_rate": 2.3347200359793117e-07, "logits/chosen": -4.274033546447754, "logits/rejected": -4.159280776977539, "logps/chosen": -54.44480514526367, "logps/rejected": -110.68973541259766, "loss": 114988.9875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005277964286506176, "rewards/margins": 0.005717786960303783, "rewards/rejected": -0.010995752178132534, "step": 5730 }, { "epoch": 0.580795305069311, "grad_norm": 241846.0193428425, "learning_rate": 2.329098268495615e-07, "logits/chosen": -4.359861850738525, "logits/rejected": -4.117361068725586, "logps/chosen": -71.9825439453125, "logps/rejected": -100.2998275756836, "loss": 104898.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006879757158458233, "rewards/margins": 0.003422768786549568, "rewards/rejected": -0.010302526876330376, "step": 5740 }, { "epoch": 0.5818071435798846, "grad_norm": 392291.59952182637, "learning_rate": 2.323476501011918e-07, "logits/chosen": -4.162008285522461, "logits/rejected": -4.097489356994629, "logps/chosen": -112.70103454589844, "logps/rejected": -165.8423309326172, "loss": 111051.375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01135916169732809, "rewards/margins": 0.0035247206687927246, "rewards/rejected": -0.01488388329744339, "step": 5750 }, { "epoch": 0.5828189820904583, "grad_norm": 196095.8566390804, "learning_rate": 2.3178547335282213e-07, "logits/chosen": -4.022917747497559, "logits/rejected": -3.9420619010925293, "logps/chosen": -77.5042953491211, "logps/rejected": -107.50724792480469, "loss": 106930.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004384762607514858, "rewards/margins": 0.005845712963491678, "rewards/rejected": -0.010230476036667824, "step": 5760 }, { "epoch": 0.5838308206010321, "grad_norm": 371569.2100584009, "learning_rate": 2.3122329660445242e-07, "logits/chosen": -4.47867488861084, "logits/rejected": -4.360047340393066, "logps/chosen": -110.4482421875, "logps/rejected": -99.08575439453125, "loss": 104362.4438, "rewards/accuracies": 0.5, "rewards/chosen": -0.008204294368624687, "rewards/margins": 0.000866724003572017, "rewards/rejected": -0.009071016684174538, "step": 5770 }, { "epoch": 0.5848426591116058, "grad_norm": 197432.48606160664, "learning_rate": 2.3066111985608274e-07, "logits/chosen": -4.227038383483887, "logits/rejected": -4.153075218200684, "logps/chosen": -103.4447250366211, "logps/rejected": -116.6081314086914, "loss": 111146.2, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008111286908388138, "rewards/margins": 0.003021760145202279, "rewards/rejected": -0.011133046820759773, "step": 5780 }, { "epoch": 0.5858544976221794, "grad_norm": 301847.1658420433, "learning_rate": 2.3009894310771306e-07, "logits/chosen": -4.322071075439453, "logits/rejected": -4.326940059661865, "logps/chosen": -121.31120300292969, "logps/rejected": -130.17477416992188, "loss": 131198.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01171043049544096, "rewards/margins": 0.0025794324465095997, "rewards/rejected": -0.014289863407611847, "step": 5790 }, { "epoch": 0.5868663361327532, "grad_norm": 419111.16064351157, "learning_rate": 2.2953676635934335e-07, "logits/chosen": -3.6278998851776123, "logits/rejected": -3.6124680042266846, "logps/chosen": -113.87451171875, "logps/rejected": -130.17176818847656, "loss": 115121.3875, "rewards/accuracies": 0.5, "rewards/chosen": -0.0058378796093165874, "rewards/margins": 0.0012153811985626817, "rewards/rejected": -0.007053261157125235, "step": 5800 }, { "epoch": 0.5878781746433269, "grad_norm": 202442.7030074059, "learning_rate": 2.289745896109737e-07, "logits/chosen": -4.414885520935059, "logits/rejected": -4.413935661315918, "logps/chosen": -64.65196228027344, "logps/rejected": -80.55256652832031, "loss": 112206.675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.006562368478626013, "rewards/margins": 0.006879452615976334, "rewards/rejected": -0.013441820628941059, "step": 5810 }, { "epoch": 0.5888900131539007, "grad_norm": 410683.64603402646, "learning_rate": 2.28412412862604e-07, "logits/chosen": -4.253517150878906, "logits/rejected": -4.21481990814209, "logps/chosen": -85.73411560058594, "logps/rejected": -157.13333129882812, "loss": 78340.375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006239605136215687, "rewards/margins": 0.006195499561727047, "rewards/rejected": -0.01243510376662016, "step": 5820 }, { "epoch": 0.5899018516644744, "grad_norm": 454435.6859646044, "learning_rate": 2.278502361142343e-07, "logits/chosen": -4.119645595550537, "logits/rejected": -4.1876301765441895, "logps/chosen": -99.94697570800781, "logps/rejected": -87.06485748291016, "loss": 105597.95, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005859859753400087, "rewards/margins": 0.002075278665870428, "rewards/rejected": -0.007935138419270515, "step": 5830 }, { "epoch": 0.590913690175048, "grad_norm": 355669.95285109, "learning_rate": 2.2728805936586462e-07, "logits/chosen": -4.352395534515381, "logits/rejected": -4.438633918762207, "logps/chosen": -65.95746612548828, "logps/rejected": -69.03788757324219, "loss": 110435.2125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008274046704173088, "rewards/margins": 0.0015868933405727148, "rewards/rejected": -0.009860940277576447, "step": 5840 }, { "epoch": 0.5919255286856218, "grad_norm": 386117.4482829386, "learning_rate": 2.2672588261749492e-07, "logits/chosen": -3.647312879562378, "logits/rejected": -3.714470624923706, "logps/chosen": -63.4927978515625, "logps/rejected": -86.93946838378906, "loss": 104707.4563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008397946134209633, "rewards/margins": 0.006224769167602062, "rewards/rejected": -0.01462271623313427, "step": 5850 }, { "epoch": 0.5929373671961955, "grad_norm": 329532.62815504806, "learning_rate": 2.2616370586912526e-07, "logits/chosen": -3.8236021995544434, "logits/rejected": -3.8321938514709473, "logps/chosen": -78.07383728027344, "logps/rejected": -93.39601135253906, "loss": 114722.5375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005157073959708214, "rewards/margins": 0.004500485025346279, "rewards/rejected": -0.009657558985054493, "step": 5860 }, { "epoch": 0.5939492057067692, "grad_norm": 138247.72328557566, "learning_rate": 2.2560152912075555e-07, "logits/chosen": -3.9401161670684814, "logits/rejected": -3.91984486579895, "logps/chosen": -112.2628173828125, "logps/rejected": -146.70700073242188, "loss": 127624.3875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.008038724772632122, "rewards/margins": 0.006523112300783396, "rewards/rejected": -0.01456183660775423, "step": 5870 }, { "epoch": 0.5949610442173429, "grad_norm": 24070.25182398797, "learning_rate": 2.2503935237238587e-07, "logits/chosen": -4.314845561981201, "logits/rejected": -4.181915283203125, "logps/chosen": -100.8605728149414, "logps/rejected": -96.36210632324219, "loss": 112678.525, "rewards/accuracies": 0.75, "rewards/chosen": -0.007757651153951883, "rewards/margins": 0.003821908263489604, "rewards/rejected": -0.011579559184610844, "step": 5880 }, { "epoch": 0.5959728827279166, "grad_norm": 258480.99371519824, "learning_rate": 2.244771756240162e-07, "logits/chosen": -4.349948883056641, "logits/rejected": -4.3244829177856445, "logps/chosen": -76.93922424316406, "logps/rejected": -139.9644012451172, "loss": 107511.875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011338595300912857, "rewards/margins": 0.005693518090993166, "rewards/rejected": -0.017032112926244736, "step": 5890 }, { "epoch": 0.5969847212384903, "grad_norm": 339464.2549530912, "learning_rate": 2.2391499887564648e-07, "logits/chosen": -4.483763217926025, "logits/rejected": -4.413929462432861, "logps/chosen": -92.27161407470703, "logps/rejected": -113.97747802734375, "loss": 101087.3687, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00890915933996439, "rewards/margins": 0.0014799393247812986, "rewards/rejected": -0.010389097966253757, "step": 5900 }, { "epoch": 0.5979965597490641, "grad_norm": 334534.05856335716, "learning_rate": 2.233528221272768e-07, "logits/chosen": -3.8783469200134277, "logits/rejected": -3.9239914417266846, "logps/chosen": -125.42411804199219, "logps/rejected": -151.3960723876953, "loss": 115838.7375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01413021981716156, "rewards/margins": 0.0037526176311075687, "rewards/rejected": -0.01788284070789814, "step": 5910 }, { "epoch": 0.5990083982596378, "grad_norm": 397432.6146186157, "learning_rate": 2.2279064537890712e-07, "logits/chosen": -3.971054792404175, "logits/rejected": -4.0194220542907715, "logps/chosen": -84.6874008178711, "logps/rejected": -90.33110046386719, "loss": 118647.1, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00610805070027709, "rewards/margins": 0.003337529953569174, "rewards/rejected": -0.009445580653846264, "step": 5920 }, { "epoch": 0.6000202367702114, "grad_norm": 341380.21480302815, "learning_rate": 2.2222846863053744e-07, "logits/chosen": -3.7719435691833496, "logits/rejected": -3.616020917892456, "logps/chosen": -67.26173400878906, "logps/rejected": -83.4201889038086, "loss": 101349.6375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007207865826785564, "rewards/margins": 0.00324685568921268, "rewards/rejected": -0.010454722680151463, "step": 5930 }, { "epoch": 0.6010320752807852, "grad_norm": 315022.8930641843, "learning_rate": 2.2166629188216773e-07, "logits/chosen": -4.120293617248535, "logits/rejected": -4.011865139007568, "logps/chosen": -103.4238052368164, "logps/rejected": -95.5591812133789, "loss": 119694.1375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013124537654221058, "rewards/margins": -0.00011153926607221365, "rewards/rejected": -0.013012999668717384, "step": 5940 }, { "epoch": 0.6020439137913589, "grad_norm": 286699.67152071255, "learning_rate": 2.2110411513379805e-07, "logits/chosen": -4.251965522766113, "logits/rejected": -4.269593715667725, "logps/chosen": -78.47334289550781, "logps/rejected": -67.1713638305664, "loss": 116166.1875, "rewards/accuracies": 0.5, "rewards/chosen": -0.010677671059966087, "rewards/margins": -0.00017050918540917337, "rewards/rejected": -0.010507160797715187, "step": 5950 }, { "epoch": 0.6030557523019326, "grad_norm": 281530.3558075206, "learning_rate": 2.2054193838542837e-07, "logits/chosen": -4.114254951477051, "logits/rejected": -4.171900272369385, "logps/chosen": -100.582763671875, "logps/rejected": -127.763671875, "loss": 129129.7125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012319345027208328, "rewards/margins": 0.0018658734625205398, "rewards/rejected": -0.014185218140482903, "step": 5960 }, { "epoch": 0.6040675908125063, "grad_norm": 205338.73422330176, "learning_rate": 2.1997976163705866e-07, "logits/chosen": -4.261553764343262, "logits/rejected": -4.200680732727051, "logps/chosen": -96.38507843017578, "logps/rejected": -111.83575439453125, "loss": 128545.3, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007810370530933142, "rewards/margins": 0.004640919156372547, "rewards/rejected": -0.012451289221644402, "step": 5970 }, { "epoch": 0.60507942932308, "grad_norm": 417439.32775972097, "learning_rate": 2.19417584888689e-07, "logits/chosen": -3.847234010696411, "logits/rejected": -3.8264098167419434, "logps/chosen": -73.75779724121094, "logps/rejected": -78.3750228881836, "loss": 113262.7375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00660474319010973, "rewards/margins": 0.004123731516301632, "rewards/rejected": -0.010728476569056511, "step": 5980 }, { "epoch": 0.6060912678336537, "grad_norm": 360720.57751328696, "learning_rate": 2.188554081403193e-07, "logits/chosen": -4.405751705169678, "logits/rejected": -4.270488262176514, "logps/chosen": -104.88375091552734, "logps/rejected": -112.7432632446289, "loss": 120743.175, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.008630535565316677, "rewards/margins": 0.00512698432430625, "rewards/rejected": -0.013757521286606789, "step": 5990 }, { "epoch": 0.6071031063442275, "grad_norm": 275523.35276426974, "learning_rate": 2.1829323139194962e-07, "logits/chosen": -4.452993869781494, "logits/rejected": -4.406987190246582, "logps/chosen": -80.84308624267578, "logps/rejected": -106.5774154663086, "loss": 122552.2375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008209132589399815, "rewards/margins": 0.0053443750366568565, "rewards/rejected": -0.013553505763411522, "step": 6000 }, { "epoch": 0.6081149448548012, "grad_norm": 391599.49119643, "learning_rate": 2.1773105464357994e-07, "logits/chosen": -4.507946968078613, "logits/rejected": -4.315684795379639, "logps/chosen": -37.582191467285156, "logps/rejected": -55.353248596191406, "loss": 109813.975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005356866866350174, "rewards/margins": 0.004316064994782209, "rewards/rejected": -0.00967293232679367, "step": 6010 }, { "epoch": 0.6091267833653748, "grad_norm": 368899.47098041134, "learning_rate": 2.1716887789521023e-07, "logits/chosen": -4.064260005950928, "logits/rejected": -4.135598659515381, "logps/chosen": -69.04777526855469, "logps/rejected": -102.8220443725586, "loss": 119837.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007471649907529354, "rewards/margins": 0.004689342342317104, "rewards/rejected": -0.012160991318523884, "step": 6020 }, { "epoch": 0.6101386218759486, "grad_norm": 315646.4585652768, "learning_rate": 2.1660670114684058e-07, "logits/chosen": -4.486457347869873, "logits/rejected": -4.5273590087890625, "logps/chosen": -62.63958740234375, "logps/rejected": -77.76780700683594, "loss": 129642.3375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.007308857049793005, "rewards/margins": 0.003250218229368329, "rewards/rejected": -0.010559075511991978, "step": 6030 }, { "epoch": 0.6111504603865223, "grad_norm": 177160.06775221872, "learning_rate": 2.1604452439847087e-07, "logits/chosen": -3.6933693885803223, "logits/rejected": -3.868438720703125, "logps/chosen": -86.05882263183594, "logps/rejected": -88.821533203125, "loss": 121267.575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006369117647409439, "rewards/margins": 0.003100831527262926, "rewards/rejected": -0.009469949640333652, "step": 6040 }, { "epoch": 0.6121622988970961, "grad_norm": 397781.2141722864, "learning_rate": 2.1548234765010119e-07, "logits/chosen": -3.7922234535217285, "logits/rejected": -4.0998029708862305, "logps/chosen": -114.0832748413086, "logps/rejected": -95.81932067871094, "loss": 116006.8, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01033351942896843, "rewards/margins": 0.0012506901985034347, "rewards/rejected": -0.011584209278225899, "step": 6050 }, { "epoch": 0.6131741374076697, "grad_norm": 289141.2008448067, "learning_rate": 2.149201709017315e-07, "logits/chosen": -4.233329772949219, "logits/rejected": -4.139989852905273, "logps/chosen": -87.96719360351562, "logps/rejected": -112.10001373291016, "loss": 105245.5125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008966144174337387, "rewards/margins": 0.0032400668133050203, "rewards/rejected": -0.01220620982348919, "step": 6060 }, { "epoch": 0.6141859759182434, "grad_norm": 411547.9473436682, "learning_rate": 2.143579941533618e-07, "logits/chosen": -4.428210735321045, "logits/rejected": -4.505699157714844, "logps/chosen": -82.75213623046875, "logps/rejected": -90.84590148925781, "loss": 121934.3625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.007226325571537018, "rewards/margins": 0.002971161622554064, "rewards/rejected": -0.010197486728429794, "step": 6070 }, { "epoch": 0.6151978144288172, "grad_norm": 273845.82982760563, "learning_rate": 2.1379581740499212e-07, "logits/chosen": -3.988194704055786, "logits/rejected": -4.0378899574279785, "logps/chosen": -128.0701446533203, "logps/rejected": -122.5140609741211, "loss": 120004.65, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0072878687642514706, "rewards/margins": 0.00371719291433692, "rewards/rejected": -0.01100506167858839, "step": 6080 }, { "epoch": 0.6162096529393909, "grad_norm": 194185.32110879067, "learning_rate": 2.1323364065662244e-07, "logits/chosen": -3.9721386432647705, "logits/rejected": -3.957576036453247, "logps/chosen": -80.4486083984375, "logps/rejected": -120.92649841308594, "loss": 115788.6875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007836690172553062, "rewards/margins": 0.0061447047628462315, "rewards/rejected": -0.013981396332383156, "step": 6090 }, { "epoch": 0.6172214914499646, "grad_norm": 129233.61462902659, "learning_rate": 2.1267146390825275e-07, "logits/chosen": -3.991481304168701, "logits/rejected": -4.007437705993652, "logps/chosen": -183.93028259277344, "logps/rejected": -203.1132354736328, "loss": 116677.775, "rewards/accuracies": 0.75, "rewards/chosen": -0.014025723561644554, "rewards/margins": 0.0060669737868011, "rewards/rejected": -0.02009269781410694, "step": 6100 }, { "epoch": 0.6182333299605383, "grad_norm": 277971.62440154824, "learning_rate": 2.1210928715988305e-07, "logits/chosen": -4.144837379455566, "logits/rejected": -4.030789375305176, "logps/chosen": -114.19850158691406, "logps/rejected": -118.67930603027344, "loss": 108615.2625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010188603773713112, "rewards/margins": 0.0032861537765711546, "rewards/rejected": -0.013474756851792336, "step": 6110 }, { "epoch": 0.619245168471112, "grad_norm": 466350.4937310387, "learning_rate": 2.115471104115134e-07, "logits/chosen": -4.404642581939697, "logits/rejected": -4.505222320556641, "logps/chosen": -95.26502227783203, "logps/rejected": -93.94677734375, "loss": 115919.4125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0077307880856096745, "rewards/margins": 0.0009559566387906671, "rewards/rejected": -0.00868674460798502, "step": 6120 }, { "epoch": 0.6202570069816857, "grad_norm": 541315.2052737275, "learning_rate": 2.1098493366314368e-07, "logits/chosen": -4.148293972015381, "logits/rejected": -3.99198842048645, "logps/chosen": -90.0133056640625, "logps/rejected": -165.4237060546875, "loss": 122007.1125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012127473950386047, "rewards/margins": 0.009383110329508781, "rewards/rejected": -0.02151058427989483, "step": 6130 }, { "epoch": 0.6212688454922595, "grad_norm": 293380.11832762376, "learning_rate": 2.1042275691477398e-07, "logits/chosen": -4.113030910491943, "logits/rejected": -3.991772413253784, "logps/chosen": -104.35147857666016, "logps/rejected": -105.547119140625, "loss": 126133.6, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010609106160700321, "rewards/margins": 0.000759853923227638, "rewards/rejected": -0.01136896014213562, "step": 6140 }, { "epoch": 0.6222806840028331, "grad_norm": 289632.62691140507, "learning_rate": 2.0986058016640432e-07, "logits/chosen": -4.213861465454102, "logits/rejected": -4.240361213684082, "logps/chosen": -74.21389770507812, "logps/rejected": -112.12995910644531, "loss": 92746.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009691601619124413, "rewards/margins": 0.004286601208150387, "rewards/rejected": -0.013978203758597374, "step": 6150 }, { "epoch": 0.6232925225134068, "grad_norm": 291142.209159203, "learning_rate": 2.0929840341803461e-07, "logits/chosen": -4.202354907989502, "logits/rejected": -4.292402744293213, "logps/chosen": -74.57303619384766, "logps/rejected": -91.3738784790039, "loss": 110744.0125, "rewards/accuracies": 0.5, "rewards/chosen": -0.012238125316798687, "rewards/margins": 0.002489125821739435, "rewards/rejected": -0.014727252535521984, "step": 6160 }, { "epoch": 0.6243043610239806, "grad_norm": 147904.26052705143, "learning_rate": 2.0873622666966493e-07, "logits/chosen": -4.173557281494141, "logits/rejected": -4.119800090789795, "logps/chosen": -120.87847900390625, "logps/rejected": -105.56319427490234, "loss": 100075.0813, "rewards/accuracies": 0.5, "rewards/chosen": -0.010569063015282154, "rewards/margins": -0.0013821999309584498, "rewards/rejected": -0.009186862036585808, "step": 6170 }, { "epoch": 0.6253161995345543, "grad_norm": 274820.01459187723, "learning_rate": 2.0817404992129525e-07, "logits/chosen": -4.009035587310791, "logits/rejected": -3.828958034515381, "logps/chosen": -103.15052795410156, "logps/rejected": -134.97457885742188, "loss": 112799.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00974858459085226, "rewards/margins": 0.00557335652410984, "rewards/rejected": -0.015321940183639526, "step": 6180 }, { "epoch": 0.626328038045128, "grad_norm": 415430.18079187116, "learning_rate": 2.0761187317292554e-07, "logits/chosen": -3.612377166748047, "logits/rejected": -3.6337475776672363, "logps/chosen": -57.60808181762695, "logps/rejected": -92.75422668457031, "loss": 99918.4563, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00621788389980793, "rewards/margins": 0.00529667641967535, "rewards/rejected": -0.01151456031948328, "step": 6190 }, { "epoch": 0.6273398765557017, "grad_norm": 445446.28859421494, "learning_rate": 2.0704969642455586e-07, "logits/chosen": -3.710300922393799, "logits/rejected": -3.793381452560425, "logps/chosen": -77.54451751708984, "logps/rejected": -141.87515258789062, "loss": 124385.8375, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0072606815956532955, "rewards/margins": 0.006706187967211008, "rewards/rejected": -0.013966868631541729, "step": 6200 }, { "epoch": 0.6283517150662754, "grad_norm": 374595.91761375515, "learning_rate": 2.0648751967618618e-07, "logits/chosen": -4.019393444061279, "logits/rejected": -4.036262512207031, "logps/chosen": -136.3063507080078, "logps/rejected": -132.25619506835938, "loss": 140586.2625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010141145437955856, "rewards/margins": 0.0022293711081147194, "rewards/rejected": -0.01237051747739315, "step": 6210 }, { "epoch": 0.6293635535768491, "grad_norm": 367.8848981427559, "learning_rate": 2.059253429278165e-07, "logits/chosen": -3.521132707595825, "logits/rejected": -3.5167980194091797, "logps/chosen": -103.0267562866211, "logps/rejected": -119.46856689453125, "loss": 110451.1875, "rewards/accuracies": 0.75, "rewards/chosen": -0.007312456611543894, "rewards/margins": 0.007401457522064447, "rewards/rejected": -0.014713913202285767, "step": 6220 }, { "epoch": 0.6303753920874229, "grad_norm": 178524.44456479858, "learning_rate": 2.053631661794468e-07, "logits/chosen": -4.415305137634277, "logits/rejected": -4.381497383117676, "logps/chosen": -99.74191284179688, "logps/rejected": -98.42131042480469, "loss": 88447.0875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008474888280034065, "rewards/margins": 0.002019264502450824, "rewards/rejected": -0.010494152083992958, "step": 6230 }, { "epoch": 0.6313872305979965, "grad_norm": 361841.1643855386, "learning_rate": 2.0480098943107714e-07, "logits/chosen": -4.022806644439697, "logits/rejected": -4.065142631530762, "logps/chosen": -85.74759674072266, "logps/rejected": -106.85379791259766, "loss": 129392.6375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005751702934503555, "rewards/margins": 0.004790020640939474, "rewards/rejected": -0.010541723109781742, "step": 6240 }, { "epoch": 0.6323990691085702, "grad_norm": 262464.8206542194, "learning_rate": 2.0423881268270743e-07, "logits/chosen": -4.172398090362549, "logits/rejected": -4.080264568328857, "logps/chosen": -70.19078063964844, "logps/rejected": -83.94783782958984, "loss": 109536.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010137634351849556, "rewards/margins": 0.002692105481401086, "rewards/rejected": -0.012829738669097424, "step": 6250 }, { "epoch": 0.633410907619144, "grad_norm": 317027.985368864, "learning_rate": 2.0367663593433775e-07, "logits/chosen": -4.128166675567627, "logits/rejected": -4.147816181182861, "logps/chosen": -68.04066467285156, "logps/rejected": -89.21851348876953, "loss": 115167.4625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00715451966971159, "rewards/margins": 0.002495033433660865, "rewards/rejected": -0.009649552404880524, "step": 6260 }, { "epoch": 0.6344227461297177, "grad_norm": 351015.1204149851, "learning_rate": 2.0311445918596807e-07, "logits/chosen": -4.504401206970215, "logits/rejected": -4.422949314117432, "logps/chosen": -48.92380142211914, "logps/rejected": -68.0481185913086, "loss": 117515.675, "rewards/accuracies": 0.75, "rewards/chosen": -0.006437239237129688, "rewards/margins": 0.0025760536082088947, "rewards/rejected": -0.00901329331099987, "step": 6270 }, { "epoch": 0.6354345846402915, "grad_norm": 286506.9120017988, "learning_rate": 2.0255228243759836e-07, "logits/chosen": -4.302903652191162, "logits/rejected": -4.308192253112793, "logps/chosen": -81.68921661376953, "logps/rejected": -87.50761413574219, "loss": 120821.4, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.008637899532914162, "rewards/margins": 0.001110336510464549, "rewards/rejected": -0.009748237207531929, "step": 6280 }, { "epoch": 0.6364464231508651, "grad_norm": 475972.21770094853, "learning_rate": 2.019901056892287e-07, "logits/chosen": -4.015780448913574, "logits/rejected": -3.8116047382354736, "logps/chosen": -114.80123138427734, "logps/rejected": -219.0806121826172, "loss": 117034.35, "rewards/accuracies": 0.75, "rewards/chosen": -0.008226113393902779, "rewards/margins": 0.003567999228835106, "rewards/rejected": -0.011794112622737885, "step": 6290 }, { "epoch": 0.6374582616614388, "grad_norm": 291155.0298447634, "learning_rate": 2.01427928940859e-07, "logits/chosen": -3.8353817462921143, "logits/rejected": -3.938526153564453, "logps/chosen": -40.459075927734375, "logps/rejected": -52.19451904296875, "loss": 95885.7063, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006834626197814941, "rewards/margins": 0.0012341259280219674, "rewards/rejected": -0.0080687515437603, "step": 6300 }, { "epoch": 0.6384701001720126, "grad_norm": 404657.5889680836, "learning_rate": 2.008657521924893e-07, "logits/chosen": -4.382012367248535, "logits/rejected": -4.345575332641602, "logps/chosen": -95.2391357421875, "logps/rejected": -105.92706298828125, "loss": 129477.3125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01333907712250948, "rewards/margins": 0.0032470934092998505, "rewards/rejected": -0.016586169600486755, "step": 6310 }, { "epoch": 0.6394819386825863, "grad_norm": 202499.07374377974, "learning_rate": 2.0030357544411964e-07, "logits/chosen": -4.030858993530273, "logits/rejected": -4.082146644592285, "logps/chosen": -79.58134460449219, "logps/rejected": -99.16566467285156, "loss": 118457.5125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008441333658993244, "rewards/margins": 0.005205988883972168, "rewards/rejected": -0.013647320680320263, "step": 6320 }, { "epoch": 0.6404937771931599, "grad_norm": 236942.41870023846, "learning_rate": 1.9974139869574993e-07, "logits/chosen": -3.7632946968078613, "logits/rejected": -3.7978146076202393, "logps/chosen": -121.74632263183594, "logps/rejected": -92.1229019165039, "loss": 117845.4125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00811687670648098, "rewards/margins": 0.0027729819994419813, "rewards/rejected": -0.01088985987007618, "step": 6330 }, { "epoch": 0.6415056157037337, "grad_norm": 380425.4086559553, "learning_rate": 1.9917922194738025e-07, "logits/chosen": -4.041003227233887, "logits/rejected": -3.779327869415283, "logps/chosen": -110.4465103149414, "logps/rejected": -135.90524291992188, "loss": 123917.3, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.011323734186589718, "rewards/margins": 0.005395103245973587, "rewards/rejected": -0.01671883836388588, "step": 6340 }, { "epoch": 0.6425174542143074, "grad_norm": 259656.78905797558, "learning_rate": 1.9861704519901057e-07, "logits/chosen": -3.9455199241638184, "logits/rejected": -4.019543170928955, "logps/chosen": -79.8201675415039, "logps/rejected": -76.47807312011719, "loss": 124936.25, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006268862634897232, "rewards/margins": 0.0027416343800723553, "rewards/rejected": -0.009010495617985725, "step": 6350 }, { "epoch": 0.6435292927248811, "grad_norm": 272360.83268650476, "learning_rate": 1.9805486845064088e-07, "logits/chosen": -4.179303169250488, "logits/rejected": -3.979006290435791, "logps/chosen": -101.29718017578125, "logps/rejected": -97.22586822509766, "loss": 117097.3, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01120795588940382, "rewards/margins": 0.0010217679664492607, "rewards/rejected": -0.012229722924530506, "step": 6360 }, { "epoch": 0.6445411312354549, "grad_norm": 288545.0703520188, "learning_rate": 1.9749269170227118e-07, "logits/chosen": -3.847257137298584, "logits/rejected": -3.8754189014434814, "logps/chosen": -105.5389175415039, "logps/rejected": -119.45654296875, "loss": 114186.45, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013900360092520714, "rewards/margins": 0.002715774578973651, "rewards/rejected": -0.016616135835647583, "step": 6370 }, { "epoch": 0.6455529697460285, "grad_norm": 166996.51343900233, "learning_rate": 1.969305149539015e-07, "logits/chosen": -3.907139301300049, "logits/rejected": -3.8826611042022705, "logps/chosen": -79.48760223388672, "logps/rejected": -108.32861328125, "loss": 122389.3125, "rewards/accuracies": 0.75, "rewards/chosen": -0.009224530309438705, "rewards/margins": 0.004500552546232939, "rewards/rejected": -0.013725082390010357, "step": 6380 }, { "epoch": 0.6465648082566022, "grad_norm": 528500.9477623409, "learning_rate": 1.9636833820553181e-07, "logits/chosen": -4.034038066864014, "logits/rejected": -3.9911704063415527, "logps/chosen": -105.90971374511719, "logps/rejected": -112.26484680175781, "loss": 137307.525, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010359941981732845, "rewards/margins": -8.663059998070821e-05, "rewards/rejected": -0.010273311287164688, "step": 6390 }, { "epoch": 0.647576646767176, "grad_norm": 271917.13487407187, "learning_rate": 1.958061614571621e-07, "logits/chosen": -3.7516770362854004, "logits/rejected": -3.6352057456970215, "logps/chosen": -74.47164154052734, "logps/rejected": -85.87071990966797, "loss": 111531.7125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007356105837970972, "rewards/margins": 0.002075077500194311, "rewards/rejected": -0.009431185200810432, "step": 6400 }, { "epoch": 0.6485884852777497, "grad_norm": 438503.1629270059, "learning_rate": 1.9524398470879245e-07, "logits/chosen": -4.0651445388793945, "logits/rejected": -4.217373847961426, "logps/chosen": -120.6236801147461, "logps/rejected": -123.8014144897461, "loss": 130453.8, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008476929739117622, "rewards/margins": 0.0017057812074199319, "rewards/rejected": -0.010182710364460945, "step": 6410 }, { "epoch": 0.6496003237883233, "grad_norm": 271284.9685361674, "learning_rate": 1.9468180796042274e-07, "logits/chosen": -4.099814414978027, "logits/rejected": -4.1988935470581055, "logps/chosen": -116.56180572509766, "logps/rejected": -143.22323608398438, "loss": 117733.325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009525131434202194, "rewards/margins": 0.003682434558868408, "rewards/rejected": -0.013207566924393177, "step": 6420 }, { "epoch": 0.6506121622988971, "grad_norm": 413950.79556992237, "learning_rate": 1.9411963121205304e-07, "logits/chosen": -4.186008453369141, "logits/rejected": -4.19289493560791, "logps/chosen": -65.26598358154297, "logps/rejected": -82.48644256591797, "loss": 111981.2625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.006238493137061596, "rewards/margins": 0.0019023508066311479, "rewards/rejected": -0.00814084429293871, "step": 6430 }, { "epoch": 0.6516240008094708, "grad_norm": 315193.8626300457, "learning_rate": 1.9355745446368338e-07, "logits/chosen": -4.3447794914245605, "logits/rejected": -4.308469772338867, "logps/chosen": -94.72651672363281, "logps/rejected": -96.54049682617188, "loss": 111923.25, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007402357645332813, "rewards/margins": 0.005322879180312157, "rewards/rejected": -0.01272523682564497, "step": 6440 }, { "epoch": 0.6526358393200445, "grad_norm": 221686.05514583184, "learning_rate": 1.9299527771531367e-07, "logits/chosen": -4.0267205238342285, "logits/rejected": -3.9296207427978516, "logps/chosen": -96.14549255371094, "logps/rejected": -135.9312286376953, "loss": 100801.7812, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01189177855849266, "rewards/margins": 0.007660713978111744, "rewards/rejected": -0.01955248974263668, "step": 6450 }, { "epoch": 0.6536476778306183, "grad_norm": 379865.79098208033, "learning_rate": 1.9243310096694402e-07, "logits/chosen": -4.197896480560303, "logits/rejected": -4.1178107261657715, "logps/chosen": -91.96208190917969, "logps/rejected": -99.28191375732422, "loss": 110748.2375, "rewards/accuracies": 0.5, "rewards/chosen": -0.01595609448850155, "rewards/margins": -0.0034022084437310696, "rewards/rejected": -0.012553887441754341, "step": 6460 }, { "epoch": 0.6546595163411919, "grad_norm": 312870.21485498454, "learning_rate": 1.918709242185743e-07, "logits/chosen": -4.2208147048950195, "logits/rejected": -4.158278942108154, "logps/chosen": -125.07845306396484, "logps/rejected": -100.28009033203125, "loss": 100592.7312, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.004744291305541992, "rewards/margins": 0.0013537025079131126, "rewards/rejected": -0.006097993813455105, "step": 6470 }, { "epoch": 0.6556713548517656, "grad_norm": 306632.1371421999, "learning_rate": 1.9130874747020463e-07, "logits/chosen": -4.248154640197754, "logits/rejected": -4.2348504066467285, "logps/chosen": -93.60775756835938, "logps/rejected": -79.15135192871094, "loss": 127507.5375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008186688646674156, "rewards/margins": 0.0012180439662188292, "rewards/rejected": -0.009404732845723629, "step": 6480 }, { "epoch": 0.6566831933623394, "grad_norm": 229480.6155539673, "learning_rate": 1.9074657072183495e-07, "logits/chosen": -4.130061626434326, "logits/rejected": -4.216886043548584, "logps/chosen": -108.9324951171875, "logps/rejected": -117.36012268066406, "loss": 107040.825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006652146577835083, "rewards/margins": 0.00379412854090333, "rewards/rejected": -0.010446273721754551, "step": 6490 }, { "epoch": 0.6576950318729131, "grad_norm": 508673.1947335884, "learning_rate": 1.9018439397346524e-07, "logits/chosen": -3.6989383697509766, "logits/rejected": -3.854252338409424, "logps/chosen": -61.9836311340332, "logps/rejected": -121.41017150878906, "loss": 127854.5375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.007794146426022053, "rewards/margins": 0.007739419583231211, "rewards/rejected": -0.0155335683375597, "step": 6500 }, { "epoch": 0.6587068703834867, "grad_norm": 205219.57881109137, "learning_rate": 1.8962221722509556e-07, "logits/chosen": -4.2602033615112305, "logits/rejected": -4.058106422424316, "logps/chosen": -89.57212829589844, "logps/rejected": -97.05436706542969, "loss": 114858.7125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007870439440011978, "rewards/margins": 0.0038945614360272884, "rewards/rejected": -0.011765001341700554, "step": 6510 }, { "epoch": 0.6597187088940605, "grad_norm": 284644.49163770984, "learning_rate": 1.8906004047672588e-07, "logits/chosen": -3.8584976196289062, "logits/rejected": -4.033276557922363, "logps/chosen": -131.52378845214844, "logps/rejected": -123.79156494140625, "loss": 104148.7937, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008237019181251526, "rewards/margins": 0.005691757891327143, "rewards/rejected": -0.013928776606917381, "step": 6520 }, { "epoch": 0.6607305474046342, "grad_norm": 301034.78787375946, "learning_rate": 1.884978637283562e-07, "logits/chosen": -3.7389721870422363, "logits/rejected": -3.7939364910125732, "logps/chosen": -163.0010986328125, "logps/rejected": -174.10800170898438, "loss": 117043.5625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012572321109473705, "rewards/margins": 0.005177865270525217, "rewards/rejected": -0.01775018498301506, "step": 6530 }, { "epoch": 0.661742385915208, "grad_norm": 262399.9947691974, "learning_rate": 1.879356869799865e-07, "logits/chosen": -4.2494916915893555, "logits/rejected": -4.16949462890625, "logps/chosen": -89.8538818359375, "logps/rejected": -106.51298522949219, "loss": 118041.875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007697573397308588, "rewards/margins": 0.006071886513382196, "rewards/rejected": -0.013769459910690784, "step": 6540 }, { "epoch": 0.6627542244257817, "grad_norm": 331793.1623808011, "learning_rate": 1.873735102316168e-07, "logits/chosen": -4.275174617767334, "logits/rejected": -4.327399253845215, "logps/chosen": -90.14825439453125, "logps/rejected": -104.62347412109375, "loss": 117239.7375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011965354904532433, "rewards/margins": 0.003383346600458026, "rewards/rejected": -0.015348700806498528, "step": 6550 }, { "epoch": 0.6637660629363553, "grad_norm": 211528.93680675805, "learning_rate": 1.8681133348324713e-07, "logits/chosen": -4.116864204406738, "logits/rejected": -4.179358005523682, "logps/chosen": -81.94615173339844, "logps/rejected": -102.05317687988281, "loss": 117548.5625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00838920846581459, "rewards/margins": 0.00299254572018981, "rewards/rejected": -0.011381754651665688, "step": 6560 }, { "epoch": 0.664777901446929, "grad_norm": 322911.2661216704, "learning_rate": 1.8624915673487742e-07, "logits/chosen": -4.149007797241211, "logits/rejected": -4.012667655944824, "logps/chosen": -50.555076599121094, "logps/rejected": -101.46513366699219, "loss": 104102.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0055433595553040504, "rewards/margins": 0.004927411675453186, "rewards/rejected": -0.010470771230757236, "step": 6570 }, { "epoch": 0.6657897399575028, "grad_norm": 381353.32237869554, "learning_rate": 1.8568697998650776e-07, "logits/chosen": -4.162862300872803, "logits/rejected": -4.2909255027771, "logps/chosen": -145.2236328125, "logps/rejected": -145.99659729003906, "loss": 123952.35, "rewards/accuracies": 0.75, "rewards/chosen": -0.009624254889786243, "rewards/margins": 0.006484621204435825, "rewards/rejected": -0.01610887423157692, "step": 6580 }, { "epoch": 0.6668015784680765, "grad_norm": 364213.47284321865, "learning_rate": 1.8512480323813806e-07, "logits/chosen": -3.9925923347473145, "logits/rejected": -3.95166015625, "logps/chosen": -89.13395690917969, "logps/rejected": -103.19862365722656, "loss": 115930.375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00575671112164855, "rewards/margins": 0.004014153964817524, "rewards/rejected": -0.009770864620804787, "step": 6590 }, { "epoch": 0.6678134169786502, "grad_norm": 258718.67777231114, "learning_rate": 1.8456262648976838e-07, "logits/chosen": -4.3917646408081055, "logits/rejected": -4.361847877502441, "logps/chosen": -74.68910217285156, "logps/rejected": -101.30796813964844, "loss": 115521.625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009728992357850075, "rewards/margins": 0.006113402545452118, "rewards/rejected": -0.015842396765947342, "step": 6600 }, { "epoch": 0.6688252554892239, "grad_norm": 338574.1519264322, "learning_rate": 1.840004497413987e-07, "logits/chosen": -3.964437961578369, "logits/rejected": -3.980739116668701, "logps/chosen": -58.29877853393555, "logps/rejected": -67.64945983886719, "loss": 104538.5375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007911115884780884, "rewards/margins": 0.003372797742486, "rewards/rejected": -0.011283913627266884, "step": 6610 }, { "epoch": 0.6698370939997976, "grad_norm": 422497.8031938126, "learning_rate": 1.83438272993029e-07, "logits/chosen": -3.9723598957061768, "logits/rejected": -3.9443976879119873, "logps/chosen": -80.35944366455078, "logps/rejected": -166.54611206054688, "loss": 111555.8125, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.008247876539826393, "rewards/margins": 0.009512849152088165, "rewards/rejected": -0.01776072382926941, "step": 6620 }, { "epoch": 0.6708489325103714, "grad_norm": 245038.91933052812, "learning_rate": 1.828760962446593e-07, "logits/chosen": -4.109666347503662, "logits/rejected": -3.9079749584198, "logps/chosen": -73.79692077636719, "logps/rejected": -93.49116516113281, "loss": 126381.85, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008714275434613228, "rewards/margins": 0.004357738420367241, "rewards/rejected": -0.013072013854980469, "step": 6630 }, { "epoch": 0.6718607710209451, "grad_norm": 373097.8409592206, "learning_rate": 1.8231391949628962e-07, "logits/chosen": -4.521551609039307, "logits/rejected": -4.61533260345459, "logps/chosen": -88.66931915283203, "logps/rejected": -94.53910827636719, "loss": 111216.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.006976371165364981, "rewards/margins": 0.0038310536183416843, "rewards/rejected": -0.01080742571502924, "step": 6640 }, { "epoch": 0.6728726095315187, "grad_norm": 373548.7739337056, "learning_rate": 1.8175174274791994e-07, "logits/chosen": -4.2318220138549805, "logits/rejected": -4.241496562957764, "logps/chosen": -95.77127838134766, "logps/rejected": -117.58194732666016, "loss": 118251.625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010243142023682594, "rewards/margins": 0.0015052380040287971, "rewards/rejected": -0.011748380027711391, "step": 6650 }, { "epoch": 0.6738844480420925, "grad_norm": 435524.48492101196, "learning_rate": 1.8118956599955024e-07, "logits/chosen": -4.477503299713135, "logits/rejected": -4.630113124847412, "logps/chosen": -116.12213134765625, "logps/rejected": -131.28848266601562, "loss": 128541.0125, "rewards/accuracies": 0.5, "rewards/chosen": -0.015237663872539997, "rewards/margins": 0.004862593486905098, "rewards/rejected": -0.02010026015341282, "step": 6660 }, { "epoch": 0.6748962865526662, "grad_norm": 89239.15225009764, "learning_rate": 1.8062738925118055e-07, "logits/chosen": -4.02420711517334, "logits/rejected": -4.180179595947266, "logps/chosen": -66.16912841796875, "logps/rejected": -100.1077880859375, "loss": 120772.575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0061073158867657185, "rewards/margins": 0.004442027769982815, "rewards/rejected": -0.010549342259764671, "step": 6670 }, { "epoch": 0.6759081250632399, "grad_norm": 360110.64034249645, "learning_rate": 1.8006521250281087e-07, "logits/chosen": -4.084765911102295, "logits/rejected": -4.045774459838867, "logps/chosen": -80.33618927001953, "logps/rejected": -102.09674072265625, "loss": 118372.175, "rewards/accuracies": 0.75, "rewards/chosen": -0.008981121703982353, "rewards/margins": 0.00570939015597105, "rewards/rejected": -0.014690510928630829, "step": 6680 }, { "epoch": 0.6769199635738136, "grad_norm": 200800.25520400555, "learning_rate": 1.795030357544412e-07, "logits/chosen": -4.222363471984863, "logits/rejected": -4.129399299621582, "logps/chosen": -85.50511932373047, "logps/rejected": -85.02246856689453, "loss": 114812.15, "rewards/accuracies": 0.75, "rewards/chosen": -0.008695880882441998, "rewards/margins": 0.002391321584582329, "rewards/rejected": -0.011087203398346901, "step": 6690 }, { "epoch": 0.6779318020843873, "grad_norm": 458496.0171610367, "learning_rate": 1.789408590060715e-07, "logits/chosen": -4.022574424743652, "logits/rejected": -3.979355573654175, "logps/chosen": -131.2284393310547, "logps/rejected": -156.2109375, "loss": 113476.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007959049195051193, "rewards/margins": 0.0034019325394183397, "rewards/rejected": -0.01136098150163889, "step": 6700 }, { "epoch": 0.678943640594961, "grad_norm": 178946.96390302663, "learning_rate": 1.783786822577018e-07, "logits/chosen": -4.242757797241211, "logits/rejected": -4.132214546203613, "logps/chosen": -100.657470703125, "logps/rejected": -102.77400970458984, "loss": 110732.775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006574706174433231, "rewards/margins": 0.003416580380871892, "rewards/rejected": -0.00999128632247448, "step": 6710 }, { "epoch": 0.6799554791055348, "grad_norm": 205068.0401217884, "learning_rate": 1.7781650550933215e-07, "logits/chosen": -3.419506072998047, "logits/rejected": -3.5543246269226074, "logps/chosen": -124.29248046875, "logps/rejected": -141.8451690673828, "loss": 111607.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011629531159996986, "rewards/margins": 0.0035535558126866817, "rewards/rejected": -0.01518308836966753, "step": 6720 }, { "epoch": 0.6809673176161085, "grad_norm": 401534.6011172749, "learning_rate": 1.7725432876096244e-07, "logits/chosen": -4.0172247886657715, "logits/rejected": -4.056666374206543, "logps/chosen": -110.16436767578125, "logps/rejected": -134.0843048095703, "loss": 124677.1375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0053099351935088634, "rewards/margins": 0.004519811365753412, "rewards/rejected": -0.009829746559262276, "step": 6730 }, { "epoch": 0.6819791561266821, "grad_norm": 475933.5239449672, "learning_rate": 1.7669215201259273e-07, "logits/chosen": -3.9362971782684326, "logits/rejected": -4.0795512199401855, "logps/chosen": -98.66606903076172, "logps/rejected": -110.65189361572266, "loss": 120091.4125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011285439133644104, "rewards/margins": 0.004298001993447542, "rewards/rejected": -0.015583442524075508, "step": 6740 }, { "epoch": 0.6829909946372559, "grad_norm": 269263.6442812749, "learning_rate": 1.7612997526422308e-07, "logits/chosen": -3.666454315185547, "logits/rejected": -3.686868190765381, "logps/chosen": -281.5959167480469, "logps/rejected": -282.31109619140625, "loss": 109727.5, "rewards/accuracies": 0.5, "rewards/chosen": -0.010401390492916107, "rewards/margins": 0.0019936382304877043, "rewards/rejected": -0.012395027093589306, "step": 6750 }, { "epoch": 0.6840028331478296, "grad_norm": 337142.8849470702, "learning_rate": 1.7556779851585337e-07, "logits/chosen": -4.371169567108154, "logits/rejected": -4.3662824630737305, "logps/chosen": -113.13484954833984, "logps/rejected": -145.02549743652344, "loss": 125835.4875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01295931451022625, "rewards/margins": 0.003453632351011038, "rewards/rejected": -0.016412947326898575, "step": 6760 }, { "epoch": 0.6850146716584034, "grad_norm": 493800.6948669211, "learning_rate": 1.750056217674837e-07, "logits/chosen": -3.951874256134033, "logits/rejected": -3.9759960174560547, "logps/chosen": -107.4618911743164, "logps/rejected": -108.2830810546875, "loss": 115966.8625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.012523439712822437, "rewards/margins": 0.0008755962480790913, "rewards/rejected": -0.013399036601185799, "step": 6770 }, { "epoch": 0.686026510168977, "grad_norm": 272024.34386788774, "learning_rate": 1.74443445019114e-07, "logits/chosen": -4.479785919189453, "logits/rejected": -4.47466516494751, "logps/chosen": -75.9669189453125, "logps/rejected": -110.630126953125, "loss": 107336.65, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006980572827160358, "rewards/margins": 0.008425334468483925, "rewards/rejected": -0.015405906364321709, "step": 6780 }, { "epoch": 0.6870383486795507, "grad_norm": 317086.0788239275, "learning_rate": 1.738812682707443e-07, "logits/chosen": -4.102072238922119, "logits/rejected": -4.119812965393066, "logps/chosen": -106.1201171875, "logps/rejected": -99.8458023071289, "loss": 121569.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.007731990423053503, "rewards/margins": 0.004920256324112415, "rewards/rejected": -0.012652246281504631, "step": 6790 }, { "epoch": 0.6880501871901245, "grad_norm": 380083.796550466, "learning_rate": 1.7331909152237462e-07, "logits/chosen": -3.884880781173706, "logits/rejected": -3.97540545463562, "logps/chosen": -132.17189025878906, "logps/rejected": -145.6526336669922, "loss": 111498.7, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015453708358108997, "rewards/margins": 0.005029621999710798, "rewards/rejected": -0.02048332989215851, "step": 6800 }, { "epoch": 0.6890620257006982, "grad_norm": 426487.34002240235, "learning_rate": 1.7275691477400494e-07, "logits/chosen": -4.013321876525879, "logits/rejected": -3.9090514183044434, "logps/chosen": -81.99415588378906, "logps/rejected": -93.94791412353516, "loss": 116251.85, "rewards/accuracies": 0.5, "rewards/chosen": -0.008784973993897438, "rewards/margins": -0.0003343412245158106, "rewards/rejected": -0.00845063291490078, "step": 6810 }, { "epoch": 0.6900738642112719, "grad_norm": 296667.76712428336, "learning_rate": 1.7219473802563526e-07, "logits/chosen": -4.039399147033691, "logits/rejected": -4.084476947784424, "logps/chosen": -65.41081237792969, "logps/rejected": -134.79129028320312, "loss": 114062.6875, "rewards/accuracies": 0.75, "rewards/chosen": -0.0075118765234947205, "rewards/margins": 0.010287766344845295, "rewards/rejected": -0.01779964193701744, "step": 6820 }, { "epoch": 0.6910857027218456, "grad_norm": 302422.4900680924, "learning_rate": 1.7163256127726555e-07, "logits/chosen": -3.9893558025360107, "logits/rejected": -4.104022979736328, "logps/chosen": -122.4087905883789, "logps/rejected": -149.83595275878906, "loss": 126680.6625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011718057096004486, "rewards/margins": 0.004177234135568142, "rewards/rejected": -0.015895290300250053, "step": 6830 }, { "epoch": 0.6920975412324193, "grad_norm": 317480.59531103366, "learning_rate": 1.710703845288959e-07, "logits/chosen": -3.376593828201294, "logits/rejected": -3.3916351795196533, "logps/chosen": -112.31951904296875, "logps/rejected": -117.19173431396484, "loss": 114216.425, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008969960734248161, "rewards/margins": 0.0036341254599392414, "rewards/rejected": -0.01260408479720354, "step": 6840 }, { "epoch": 0.693109379742993, "grad_norm": 349207.5345949685, "learning_rate": 1.705082077805262e-07, "logits/chosen": -3.890679121017456, "logits/rejected": -3.7773241996765137, "logps/chosen": -59.03314971923828, "logps/rejected": -89.21458435058594, "loss": 104623.3687, "rewards/accuracies": 0.5, "rewards/chosen": -0.007317684590816498, "rewards/margins": 0.004472630564123392, "rewards/rejected": -0.011790316551923752, "step": 6850 }, { "epoch": 0.6941212182535668, "grad_norm": 433756.12761242315, "learning_rate": 1.6994603103215648e-07, "logits/chosen": -4.023014545440674, "logits/rejected": -3.992276430130005, "logps/chosen": -269.34259033203125, "logps/rejected": -328.9161682128906, "loss": 126461.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010496168397367, "rewards/margins": 0.010937342420220375, "rewards/rejected": -0.0214335098862648, "step": 6860 }, { "epoch": 0.6951330567641404, "grad_norm": 83596.59096998497, "learning_rate": 1.6938385428378682e-07, "logits/chosen": -4.581016540527344, "logits/rejected": -4.487705707550049, "logps/chosen": -91.47392272949219, "logps/rejected": -93.411376953125, "loss": 112925.5375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010043027810752392, "rewards/margins": 0.0008835754124447703, "rewards/rejected": -0.010926604270935059, "step": 6870 }, { "epoch": 0.6961448952747141, "grad_norm": 225992.86177358587, "learning_rate": 1.6882167753541712e-07, "logits/chosen": -3.5176169872283936, "logits/rejected": -3.5760836601257324, "logps/chosen": -92.52481842041016, "logps/rejected": -145.43991088867188, "loss": 121184.05, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007699557580053806, "rewards/margins": 0.007969756610691547, "rewards/rejected": -0.015669312328100204, "step": 6880 }, { "epoch": 0.6971567337852879, "grad_norm": 336683.27690456086, "learning_rate": 1.6825950078704746e-07, "logits/chosen": -4.0941314697265625, "logits/rejected": -3.9264206886291504, "logps/chosen": -72.15359497070312, "logps/rejected": -86.63169860839844, "loss": 131025.05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010031615383923054, "rewards/margins": 0.0035311144310981035, "rewards/rejected": -0.013562729582190514, "step": 6890 }, { "epoch": 0.6981685722958616, "grad_norm": 354361.1963898716, "learning_rate": 1.6769732403867775e-07, "logits/chosen": -4.250523567199707, "logits/rejected": -4.171509742736816, "logps/chosen": -100.19121551513672, "logps/rejected": -117.02716064453125, "loss": 123780.075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007403278257697821, "rewards/margins": 0.0044370973482728004, "rewards/rejected": -0.01184037420898676, "step": 6900 }, { "epoch": 0.6991804108064353, "grad_norm": 322095.2322587243, "learning_rate": 1.6713514729030805e-07, "logits/chosen": -4.0813493728637695, "logits/rejected": -4.137513637542725, "logps/chosen": -104.30393981933594, "logps/rejected": -138.09881591796875, "loss": 104070.4438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.008060330525040627, "rewards/margins": 0.0048683760687708855, "rewards/rejected": -0.012928706593811512, "step": 6910 }, { "epoch": 0.700192249317009, "grad_norm": 358410.8576649803, "learning_rate": 1.665729705419384e-07, "logits/chosen": -3.806340456008911, "logits/rejected": -3.881279468536377, "logps/chosen": -74.3115234375, "logps/rejected": -118.5141830444336, "loss": 107903.175, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010429365560412407, "rewards/margins": 0.0076709045097231865, "rewards/rejected": -0.018100271001458168, "step": 6920 }, { "epoch": 0.7012040878275827, "grad_norm": 157962.3491661796, "learning_rate": 1.6601079379356868e-07, "logits/chosen": -4.387915134429932, "logits/rejected": -4.389068603515625, "logps/chosen": -80.39924621582031, "logps/rejected": -162.8675079345703, "loss": 97618.0125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007421628572046757, "rewards/margins": 0.009181911125779152, "rewards/rejected": -0.016603536903858185, "step": 6930 }, { "epoch": 0.7022159263381564, "grad_norm": 419960.5367312852, "learning_rate": 1.65448617045199e-07, "logits/chosen": -4.362674713134766, "logits/rejected": -4.162303447723389, "logps/chosen": -111.36199951171875, "logps/rejected": -142.84071350097656, "loss": 127611.8625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013419339433312416, "rewards/margins": 0.003522706450894475, "rewards/rejected": -0.016942044720053673, "step": 6940 }, { "epoch": 0.7032277648487302, "grad_norm": 314688.6986126117, "learning_rate": 1.6488644029682932e-07, "logits/chosen": -4.012229919433594, "logits/rejected": -3.9360435009002686, "logps/chosen": -67.36015319824219, "logps/rejected": -71.97065734863281, "loss": 108878.05, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009590307250618935, "rewards/margins": 0.002632357645779848, "rewards/rejected": -0.012222664430737495, "step": 6950 }, { "epoch": 0.7042396033593038, "grad_norm": 261235.18609997662, "learning_rate": 1.6432426354845964e-07, "logits/chosen": -4.189845085144043, "logits/rejected": -4.1768479347229, "logps/chosen": -68.76982116699219, "logps/rejected": -64.14081573486328, "loss": 97663.05, "rewards/accuracies": 0.75, "rewards/chosen": -0.007975892163813114, "rewards/margins": 0.0033580181188881397, "rewards/rejected": -0.011333908885717392, "step": 6960 }, { "epoch": 0.7052514418698775, "grad_norm": 229195.70223549393, "learning_rate": 1.6376208680008993e-07, "logits/chosen": -4.074643135070801, "logits/rejected": -4.110268592834473, "logps/chosen": -87.52381896972656, "logps/rejected": -104.61527252197266, "loss": 105632.0375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.013871654868125916, "rewards/margins": 0.0018637629691511393, "rewards/rejected": -0.01573541946709156, "step": 6970 }, { "epoch": 0.7062632803804513, "grad_norm": 501407.27224393387, "learning_rate": 1.6319991005172025e-07, "logits/chosen": -3.9869885444641113, "logits/rejected": -4.079958915710449, "logps/chosen": -81.16397857666016, "logps/rejected": -80.0054931640625, "loss": 109934.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009044541977345943, "rewards/margins": 0.0030967923812568188, "rewards/rejected": -0.012141333892941475, "step": 6980 }, { "epoch": 0.707275118891025, "grad_norm": 394326.9710139749, "learning_rate": 1.6263773330335057e-07, "logits/chosen": -4.163660049438477, "logits/rejected": -3.9983601570129395, "logps/chosen": -104.23709869384766, "logps/rejected": -132.5585174560547, "loss": 85535.1187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.010485155507922173, "rewards/margins": 0.0006753504276275635, "rewards/rejected": -0.01116050686687231, "step": 6990 }, { "epoch": 0.7082869574015987, "grad_norm": 584369.0926668016, "learning_rate": 1.6207555655498086e-07, "logits/chosen": -4.0389084815979, "logits/rejected": -4.037142753601074, "logps/chosen": -90.98493194580078, "logps/rejected": -118.60693359375, "loss": 118792.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00978943146765232, "rewards/margins": 0.003903087927028537, "rewards/rejected": -0.013692520558834076, "step": 7000 }, { "epoch": 0.7092987959121724, "grad_norm": 430951.660243552, "learning_rate": 1.615133798066112e-07, "logits/chosen": -4.243557453155518, "logits/rejected": -4.369147777557373, "logps/chosen": -87.63861083984375, "logps/rejected": -100.66313171386719, "loss": 118731.4625, "rewards/accuracies": 0.5, "rewards/chosen": -0.01241231244057417, "rewards/margins": 0.003810325637459755, "rewards/rejected": -0.0162226390093565, "step": 7010 }, { "epoch": 0.7103106344227461, "grad_norm": 294830.24551867385, "learning_rate": 1.609512030582415e-07, "logits/chosen": -4.454041481018066, "logits/rejected": -4.484493255615234, "logps/chosen": -88.67780303955078, "logps/rejected": -106.39588928222656, "loss": 119912.1375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012294736690819263, "rewards/margins": 0.001799101708456874, "rewards/rejected": -0.014093836769461632, "step": 7020 }, { "epoch": 0.7113224729333198, "grad_norm": 334819.58832271886, "learning_rate": 1.603890263098718e-07, "logits/chosen": -4.274775505065918, "logits/rejected": -4.187775611877441, "logps/chosen": -135.5701141357422, "logps/rejected": -127.0424575805664, "loss": 121380.4625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008140349760651588, "rewards/margins": 0.0046190847642719746, "rewards/rejected": -0.01275943499058485, "step": 7030 }, { "epoch": 0.7123343114438936, "grad_norm": 236197.85150401283, "learning_rate": 1.5982684956150214e-07, "logits/chosen": -3.7509098052978516, "logits/rejected": -3.6193904876708984, "logps/chosen": -70.86202239990234, "logps/rejected": -93.43525695800781, "loss": 118830.7, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.011354105547070503, "rewards/margins": 0.004091857001185417, "rewards/rejected": -0.015445960685610771, "step": 7040 }, { "epoch": 0.7133461499544672, "grad_norm": 276595.4139708429, "learning_rate": 1.5926467281313243e-07, "logits/chosen": -4.210007667541504, "logits/rejected": -4.054026126861572, "logps/chosen": -98.32853698730469, "logps/rejected": -98.53377532958984, "loss": 101945.9688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.011927408166229725, "rewards/margins": -0.0021297833882272243, "rewards/rejected": -0.009797626174986362, "step": 7050 }, { "epoch": 0.714357988465041, "grad_norm": 397738.76201380586, "learning_rate": 1.5870249606476275e-07, "logits/chosen": -4.123385429382324, "logits/rejected": -4.116199970245361, "logps/chosen": -94.85334777832031, "logps/rejected": -96.76173400878906, "loss": 117675.75, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008615790866315365, "rewards/margins": 0.0053347391076385975, "rewards/rejected": -0.013950529508292675, "step": 7060 }, { "epoch": 0.7153698269756147, "grad_norm": 190109.27305332647, "learning_rate": 1.5814031931639307e-07, "logits/chosen": -4.421329975128174, "logits/rejected": -4.2897233963012695, "logps/chosen": -65.0705337524414, "logps/rejected": -53.93207931518555, "loss": 110120.375, "rewards/accuracies": 0.5, "rewards/chosen": -0.006214595399796963, "rewards/margins": 0.0022331734653562307, "rewards/rejected": -0.008447768166661263, "step": 7070 }, { "epoch": 0.7163816654861884, "grad_norm": 272858.00408972165, "learning_rate": 1.575781425680234e-07, "logits/chosen": -3.815880298614502, "logits/rejected": -3.748692274093628, "logps/chosen": -103.01872253417969, "logps/rejected": -154.14706420898438, "loss": 118320.25, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009035248309373856, "rewards/margins": 0.005598374176770449, "rewards/rejected": -0.014633622951805592, "step": 7080 }, { "epoch": 0.7173935039967622, "grad_norm": 244465.61988952206, "learning_rate": 1.5701596581965368e-07, "logits/chosen": -4.313401699066162, "logits/rejected": -4.366552352905273, "logps/chosen": -128.8524169921875, "logps/rejected": -130.9639129638672, "loss": 116879.8375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.014025215990841389, "rewards/margins": -0.0006336258957162499, "rewards/rejected": -0.013391588814556599, "step": 7090 }, { "epoch": 0.7184053425073358, "grad_norm": 238118.4342926141, "learning_rate": 1.56453789071284e-07, "logits/chosen": -3.987185001373291, "logits/rejected": -3.875150203704834, "logps/chosen": -115.81591796875, "logps/rejected": -142.41561889648438, "loss": 121798.375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0102298054844141, "rewards/margins": 0.007145605981349945, "rewards/rejected": -0.017375409603118896, "step": 7100 }, { "epoch": 0.7194171810179095, "grad_norm": 294082.6350771569, "learning_rate": 1.5589161232291432e-07, "logits/chosen": -3.7528152465820312, "logits/rejected": -3.720703125, "logps/chosen": -124.98429107666016, "logps/rejected": -175.97930908203125, "loss": 124382.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009784473106265068, "rewards/margins": 0.0056801168248057365, "rewards/rejected": -0.01546459086239338, "step": 7110 }, { "epoch": 0.7204290195284833, "grad_norm": 46927.00609764682, "learning_rate": 1.5532943557454464e-07, "logits/chosen": -4.480721950531006, "logits/rejected": -4.46506404876709, "logps/chosen": -63.90790557861328, "logps/rejected": -85.09671020507812, "loss": 96236.875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008031981997191906, "rewards/margins": 0.004340398125350475, "rewards/rejected": -0.01237238198518753, "step": 7120 }, { "epoch": 0.721440858039057, "grad_norm": 325451.6867369447, "learning_rate": 1.5476725882617495e-07, "logits/chosen": -4.08707857131958, "logits/rejected": -4.162238121032715, "logps/chosen": -77.11101531982422, "logps/rejected": -106.4114761352539, "loss": 125263.0125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0074568139389157295, "rewards/margins": 0.005559433251619339, "rewards/rejected": -0.013016248121857643, "step": 7130 }, { "epoch": 0.7224526965496306, "grad_norm": 241452.60669142572, "learning_rate": 1.5420508207780525e-07, "logits/chosen": -4.414308071136475, "logits/rejected": -4.534913063049316, "logps/chosen": -90.73882293701172, "logps/rejected": -93.8055648803711, "loss": 115235.4875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009165911935269833, "rewards/margins": 0.003756066085770726, "rewards/rejected": -0.01292197685688734, "step": 7140 }, { "epoch": 0.7234645350602044, "grad_norm": 288855.35907905875, "learning_rate": 1.536429053294356e-07, "logits/chosen": -3.9969563484191895, "logits/rejected": -3.9907753467559814, "logps/chosen": -90.9063949584961, "logps/rejected": -95.20767974853516, "loss": 120497.05, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.009834717959165573, "rewards/margins": 0.003699876833707094, "rewards/rejected": -0.013534595258533955, "step": 7150 }, { "epoch": 0.7244763735707781, "grad_norm": 386360.2090021611, "learning_rate": 1.5308072858106588e-07, "logits/chosen": -4.2953386306762695, "logits/rejected": -4.272347450256348, "logps/chosen": -106.47550964355469, "logps/rejected": -106.3961410522461, "loss": 120030.4125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008389269933104515, "rewards/margins": 0.0027142025064677, "rewards/rejected": -0.011103471741080284, "step": 7160 }, { "epoch": 0.7254882120813518, "grad_norm": 357892.6225707842, "learning_rate": 1.5251855183269618e-07, "logits/chosen": -3.9957637786865234, "logits/rejected": -3.919475555419922, "logps/chosen": -109.62730407714844, "logps/rejected": -134.62106323242188, "loss": 115355.775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007121572736650705, "rewards/margins": 0.0008409257861785591, "rewards/rejected": -0.007962497882544994, "step": 7170 }, { "epoch": 0.7265000505919256, "grad_norm": 184177.87182339389, "learning_rate": 1.5195637508432652e-07, "logits/chosen": -4.169290065765381, "logits/rejected": -4.313884735107422, "logps/chosen": -76.89808654785156, "logps/rejected": -118.69490051269531, "loss": 115532.625, "rewards/accuracies": 0.75, "rewards/chosen": -0.00958473701030016, "rewards/margins": 0.006586442235857248, "rewards/rejected": -0.016171179711818695, "step": 7180 }, { "epoch": 0.7275118891024992, "grad_norm": 226035.76696527767, "learning_rate": 1.5139419833595681e-07, "logits/chosen": -3.790128231048584, "logits/rejected": -3.839703321456909, "logps/chosen": -62.906883239746094, "logps/rejected": -73.65010070800781, "loss": 120685.3125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00555334473028779, "rewards/margins": 0.0036475826054811478, "rewards/rejected": -0.00920092687010765, "step": 7190 }, { "epoch": 0.7285237276130729, "grad_norm": 41773.47995416967, "learning_rate": 1.5083202158758713e-07, "logits/chosen": -4.044469356536865, "logits/rejected": -4.062927722930908, "logps/chosen": -95.59349060058594, "logps/rejected": -103.96119689941406, "loss": 106839.5625, "rewards/accuracies": 0.75, "rewards/chosen": -0.008346621878445148, "rewards/margins": 0.005611003842204809, "rewards/rejected": -0.013957624323666096, "step": 7200 }, { "epoch": 0.7295355661236467, "grad_norm": 41727.270209015114, "learning_rate": 1.5026984483921745e-07, "logits/chosen": -4.4334893226623535, "logits/rejected": -4.356921672821045, "logps/chosen": -89.67621612548828, "logps/rejected": -143.0186767578125, "loss": 111358.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013073024339973927, "rewards/margins": 0.004260449204593897, "rewards/rejected": -0.01733347401022911, "step": 7210 }, { "epoch": 0.7305474046342204, "grad_norm": 416310.3284565352, "learning_rate": 1.4970766809084774e-07, "logits/chosen": -4.437172889709473, "logits/rejected": -4.460797309875488, "logps/chosen": -99.21720886230469, "logps/rejected": -104.7569808959961, "loss": 104548.8938, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008455978706479073, "rewards/margins": 0.0038631674833595753, "rewards/rejected": -0.01231914572417736, "step": 7220 }, { "epoch": 0.731559243144794, "grad_norm": 300243.06786072, "learning_rate": 1.4914549134247806e-07, "logits/chosen": -4.293426990509033, "logits/rejected": -4.228333473205566, "logps/chosen": -94.3297119140625, "logps/rejected": -146.62533569335938, "loss": 106877.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009558385238051414, "rewards/margins": 0.004694048315286636, "rewards/rejected": -0.01425243355333805, "step": 7230 }, { "epoch": 0.7325710816553678, "grad_norm": 408886.6065157505, "learning_rate": 1.4858331459410838e-07, "logits/chosen": -3.6943821907043457, "logits/rejected": -3.835599422454834, "logps/chosen": -80.80326843261719, "logps/rejected": -112.52668762207031, "loss": 131904.2, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.013246273621916771, "rewards/margins": 0.002659393474459648, "rewards/rejected": -0.01590566709637642, "step": 7240 }, { "epoch": 0.7335829201659415, "grad_norm": 439331.0531268012, "learning_rate": 1.480211378457387e-07, "logits/chosen": -4.299975872039795, "logits/rejected": -4.556235313415527, "logps/chosen": -79.0093994140625, "logps/rejected": -97.74931335449219, "loss": 124257.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007681438233703375, "rewards/margins": 0.006556396372616291, "rewards/rejected": -0.014237833209335804, "step": 7250 }, { "epoch": 0.7345947586765152, "grad_norm": 275802.7014081124, "learning_rate": 1.47458961097369e-07, "logits/chosen": -3.801198959350586, "logits/rejected": -3.694026470184326, "logps/chosen": -71.65491485595703, "logps/rejected": -114.5855712890625, "loss": 102972.8687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.004994886461645365, "rewards/margins": 0.00855229515582323, "rewards/rejected": -0.013547182083129883, "step": 7260 }, { "epoch": 0.735606597187089, "grad_norm": 165563.41000427597, "learning_rate": 1.4689678434899934e-07, "logits/chosen": -4.2289958000183105, "logits/rejected": -4.08605432510376, "logps/chosen": -73.55394744873047, "logps/rejected": -106.96797943115234, "loss": 114628.1375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00957750715315342, "rewards/margins": 0.0041480762884020805, "rewards/rejected": -0.0137255834415555, "step": 7270 }, { "epoch": 0.7366184356976626, "grad_norm": 162170.1342528291, "learning_rate": 1.4633460760062963e-07, "logits/chosen": -4.123441219329834, "logits/rejected": -4.325385093688965, "logps/chosen": -86.19679260253906, "logps/rejected": -90.06505584716797, "loss": 108087.9375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02007719688117504, "rewards/margins": -0.008041134104132652, "rewards/rejected": -0.01203606091439724, "step": 7280 }, { "epoch": 0.7376302742082363, "grad_norm": 281993.0868125141, "learning_rate": 1.4577243085225992e-07, "logits/chosen": -4.321486949920654, "logits/rejected": -4.090987205505371, "logps/chosen": -89.06423950195312, "logps/rejected": -91.05316162109375, "loss": 120787.6, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006711548659950495, "rewards/margins": 0.004348448943346739, "rewards/rejected": -0.011059997603297234, "step": 7290 }, { "epoch": 0.7386421127188101, "grad_norm": 102262.23013765599, "learning_rate": 1.4521025410389027e-07, "logits/chosen": -4.456615924835205, "logits/rejected": -4.385031700134277, "logps/chosen": -95.715576171875, "logps/rejected": -98.32435607910156, "loss": 110344.925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009815539233386517, "rewards/margins": 0.0002820616646204144, "rewards/rejected": -0.010097600519657135, "step": 7300 }, { "epoch": 0.7396539512293838, "grad_norm": 103964.49429503064, "learning_rate": 1.4464807735552056e-07, "logits/chosen": -3.7586841583251953, "logits/rejected": -3.7700679302215576, "logps/chosen": -84.85095977783203, "logps/rejected": -119.35734558105469, "loss": 124282.025, "rewards/accuracies": 0.75, "rewards/chosen": -0.009521178901195526, "rewards/margins": 0.0032964213751256466, "rewards/rejected": -0.012817601673305035, "step": 7310 }, { "epoch": 0.7406657897399574, "grad_norm": 414608.3565408174, "learning_rate": 1.440859006071509e-07, "logits/chosen": -4.072277545928955, "logits/rejected": -4.068235397338867, "logps/chosen": -69.67396545410156, "logps/rejected": -108.23486328125, "loss": 89666.8438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007799065206199884, "rewards/margins": 0.004289762582629919, "rewards/rejected": -0.012088827788829803, "step": 7320 }, { "epoch": 0.7416776282505312, "grad_norm": 403704.2217081305, "learning_rate": 1.435237238587812e-07, "logits/chosen": -3.516282320022583, "logits/rejected": -3.213463544845581, "logps/chosen": -88.47766876220703, "logps/rejected": -271.3393859863281, "loss": 113095.825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0038262251764535904, "rewards/margins": 0.010789717547595501, "rewards/rejected": -0.014615943655371666, "step": 7330 }, { "epoch": 0.7426894667611049, "grad_norm": 417859.947880862, "learning_rate": 1.429615471104115e-07, "logits/chosen": -4.189812660217285, "logits/rejected": -4.139196872711182, "logps/chosen": -117.53691101074219, "logps/rejected": -104.631103515625, "loss": 134955.0375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005742483772337437, "rewards/margins": 0.004376275464892387, "rewards/rejected": -0.01011875830590725, "step": 7340 }, { "epoch": 0.7437013052716787, "grad_norm": 380201.8864630029, "learning_rate": 1.4239937036204184e-07, "logits/chosen": -3.515697479248047, "logits/rejected": -3.8369858264923096, "logps/chosen": -87.45179748535156, "logps/rejected": -113.87628173828125, "loss": 126107.825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012919047847390175, "rewards/margins": 0.005374048836529255, "rewards/rejected": -0.018293095752596855, "step": 7350 }, { "epoch": 0.7447131437822524, "grad_norm": 372601.92493303685, "learning_rate": 1.4183719361367213e-07, "logits/chosen": -3.971500873565674, "logits/rejected": -3.8412277698516846, "logps/chosen": -284.87921142578125, "logps/rejected": -334.947265625, "loss": 108445.225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.012906436808407307, "rewards/margins": 0.0026922891847789288, "rewards/rejected": -0.015598724596202374, "step": 7360 }, { "epoch": 0.745724982292826, "grad_norm": 541378.5976137455, "learning_rate": 1.4127501686530245e-07, "logits/chosen": -4.160788536071777, "logits/rejected": -4.251266002655029, "logps/chosen": -90.21473693847656, "logps/rejected": -121.34077453613281, "loss": 115460.8625, "rewards/accuracies": 0.75, "rewards/chosen": -0.009146034717559814, "rewards/margins": 0.006720884703099728, "rewards/rejected": -0.015866922214627266, "step": 7370 }, { "epoch": 0.7467368208033998, "grad_norm": 426720.2302795958, "learning_rate": 1.4071284011693277e-07, "logits/chosen": -3.6069607734680176, "logits/rejected": -3.501587390899658, "logps/chosen": -170.2766876220703, "logps/rejected": -168.72482299804688, "loss": 111966.2125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.012548012658953667, "rewards/margins": 0.0019094214076176286, "rewards/rejected": -0.014457432553172112, "step": 7380 }, { "epoch": 0.7477486593139735, "grad_norm": 427855.02584886324, "learning_rate": 1.4015066336856308e-07, "logits/chosen": -4.107905864715576, "logits/rejected": -4.297100067138672, "logps/chosen": -80.00685119628906, "logps/rejected": -86.84269714355469, "loss": 124262.0375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008175224997103214, "rewards/margins": 0.004371381364762783, "rewards/rejected": -0.012546604499220848, "step": 7390 }, { "epoch": 0.7487604978245472, "grad_norm": 225166.69684644038, "learning_rate": 1.3958848662019338e-07, "logits/chosen": -4.270549774169922, "logits/rejected": -4.254513263702393, "logps/chosen": -76.67224884033203, "logps/rejected": -93.0635757446289, "loss": 114768.5625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.008238346315920353, "rewards/margins": 0.0038827192038297653, "rewards/rejected": -0.012121065519750118, "step": 7400 }, { "epoch": 0.7497723363351209, "grad_norm": 363011.5112571325, "learning_rate": 1.390263098718237e-07, "logits/chosen": -4.111577033996582, "logits/rejected": -4.113997459411621, "logps/chosen": -75.90838623046875, "logps/rejected": -108.337158203125, "loss": 134939.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010988804511725903, "rewards/margins": 0.005976541433483362, "rewards/rejected": -0.016965346410870552, "step": 7410 }, { "epoch": 0.7507841748456946, "grad_norm": 363029.75689755945, "learning_rate": 1.3846413312345401e-07, "logits/chosen": -3.9328532218933105, "logits/rejected": -3.8603920936584473, "logps/chosen": -70.97767639160156, "logps/rejected": -80.67808532714844, "loss": 105800.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008087101392447948, "rewards/margins": 0.0021250308491289616, "rewards/rejected": -0.010212132707238197, "step": 7420 }, { "epoch": 0.7517960133562683, "grad_norm": 223558.30407654404, "learning_rate": 1.379019563750843e-07, "logits/chosen": -3.716383457183838, "logits/rejected": -3.831249237060547, "logps/chosen": -97.39202880859375, "logps/rejected": -119.36893463134766, "loss": 130982.3375, "rewards/accuracies": 0.75, "rewards/chosen": -0.008575445041060448, "rewards/margins": 0.005743173882365227, "rewards/rejected": -0.0143186179921031, "step": 7430 }, { "epoch": 0.7528078518668421, "grad_norm": 262786.97804949485, "learning_rate": 1.3733977962671465e-07, "logits/chosen": -4.073144912719727, "logits/rejected": -3.960181474685669, "logps/chosen": -65.75709533691406, "logps/rejected": -95.49187469482422, "loss": 121452.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008622724562883377, "rewards/margins": 0.003767678514122963, "rewards/rejected": -0.01239040307700634, "step": 7440 }, { "epoch": 0.7538196903774158, "grad_norm": 274983.33965137793, "learning_rate": 1.3677760287834494e-07, "logits/chosen": -3.7628562450408936, "logits/rejected": -3.8087284564971924, "logps/chosen": -75.45030212402344, "logps/rejected": -84.47805786132812, "loss": 109951.6125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00880451500415802, "rewards/margins": 0.0045965248718857765, "rewards/rejected": -0.013401040807366371, "step": 7450 }, { "epoch": 0.7548315288879894, "grad_norm": 401810.9520848423, "learning_rate": 1.3621542612997524e-07, "logits/chosen": -4.3767476081848145, "logits/rejected": -4.399510860443115, "logps/chosen": -69.91743469238281, "logps/rejected": -70.08047485351562, "loss": 90574.9812, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005791595671325922, "rewards/margins": 0.0004405028303153813, "rewards/rejected": -0.006232098676264286, "step": 7460 }, { "epoch": 0.7558433673985632, "grad_norm": 211478.85770722895, "learning_rate": 1.3565324938160558e-07, "logits/chosen": -4.41450309753418, "logits/rejected": -4.483403205871582, "logps/chosen": -104.23101806640625, "logps/rejected": -106.65580749511719, "loss": 102016.35, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011751659214496613, "rewards/margins": 0.00282789277844131, "rewards/rejected": -0.014579552225768566, "step": 7470 }, { "epoch": 0.7568552059091369, "grad_norm": 271504.906562521, "learning_rate": 1.3509107263323587e-07, "logits/chosen": -4.1256513595581055, "logits/rejected": -4.1522650718688965, "logps/chosen": -100.23184204101562, "logps/rejected": -117.63809967041016, "loss": 91897.2875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006937532220035791, "rewards/margins": 0.002990133361890912, "rewards/rejected": -0.009927665814757347, "step": 7480 }, { "epoch": 0.7578670444197106, "grad_norm": 287601.80504415615, "learning_rate": 1.345288958848662e-07, "logits/chosen": -3.9424948692321777, "logits/rejected": -4.009277820587158, "logps/chosen": -321.3587951660156, "logps/rejected": -318.92547607421875, "loss": 118859.95, "rewards/accuracies": 0.5, "rewards/chosen": -0.020912988111376762, "rewards/margins": 0.001350566279143095, "rewards/rejected": -0.022263556718826294, "step": 7490 }, { "epoch": 0.7588788829302844, "grad_norm": 191039.29166572852, "learning_rate": 1.339667191364965e-07, "logits/chosen": -4.376297950744629, "logits/rejected": -4.423515319824219, "logps/chosen": -100.26751708984375, "logps/rejected": -134.56019592285156, "loss": 101922.3875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008967943489551544, "rewards/margins": 0.002472478896379471, "rewards/rejected": -0.01144042145460844, "step": 7500 }, { "epoch": 0.759890721440858, "grad_norm": 257471.4116718149, "learning_rate": 1.3340454238812683e-07, "logits/chosen": -3.9031805992126465, "logits/rejected": -4.05730676651001, "logps/chosen": -82.14889526367188, "logps/rejected": -89.98145294189453, "loss": 110105.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.003452691715210676, "rewards/margins": 0.008302392438054085, "rewards/rejected": -0.011755084618926048, "step": 7510 }, { "epoch": 0.7609025599514317, "grad_norm": 479226.3822111724, "learning_rate": 1.3284236563975712e-07, "logits/chosen": -4.025390148162842, "logits/rejected": -4.007852077484131, "logps/chosen": -88.89665222167969, "logps/rejected": -109.1961441040039, "loss": 117044.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010047683492302895, "rewards/margins": 0.001545612933114171, "rewards/rejected": -0.011593297123908997, "step": 7520 }, { "epoch": 0.7619143984620055, "grad_norm": 330493.77743500826, "learning_rate": 1.3228018889138744e-07, "logits/chosen": -3.672696590423584, "logits/rejected": -3.6588714122772217, "logps/chosen": -80.15201568603516, "logps/rejected": -87.24325561523438, "loss": 106454.8625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007744968868792057, "rewards/margins": 0.002278231782838702, "rewards/rejected": -0.010023200884461403, "step": 7530 }, { "epoch": 0.7629262369725792, "grad_norm": 353544.4292595527, "learning_rate": 1.3171801214301776e-07, "logits/chosen": -4.050291538238525, "logits/rejected": -4.249870300292969, "logps/chosen": -40.84009552001953, "logps/rejected": -58.21088409423828, "loss": 114000.075, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0069960253313183784, "rewards/margins": 0.0034184777177870274, "rewards/rejected": -0.010414501652121544, "step": 7540 }, { "epoch": 0.7639380754831528, "grad_norm": 360269.6522716054, "learning_rate": 1.3115583539464808e-07, "logits/chosen": -3.523317813873291, "logits/rejected": -3.4706008434295654, "logps/chosen": -129.5641632080078, "logps/rejected": -129.37631225585938, "loss": 121913.175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009832246229052544, "rewards/margins": 0.002690389519557357, "rewards/rejected": -0.01252263505011797, "step": 7550 }, { "epoch": 0.7649499139937266, "grad_norm": 317078.6102682281, "learning_rate": 1.305936586462784e-07, "logits/chosen": -4.219413757324219, "logits/rejected": -4.328913688659668, "logps/chosen": -74.47515106201172, "logps/rejected": -87.20521545410156, "loss": 106388.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006312399171292782, "rewards/margins": 0.003439192892983556, "rewards/rejected": -0.009751592762768269, "step": 7560 }, { "epoch": 0.7659617525043003, "grad_norm": 465360.6422876752, "learning_rate": 1.300314818979087e-07, "logits/chosen": -4.017872333526611, "logits/rejected": -4.120330333709717, "logps/chosen": -275.74700927734375, "logps/rejected": -292.63275146484375, "loss": 114333.2, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008085707202553749, "rewards/margins": 0.007359664887189865, "rewards/rejected": -0.015445372089743614, "step": 7570 }, { "epoch": 0.7669735910148741, "grad_norm": 287470.03410986863, "learning_rate": 1.29469305149539e-07, "logits/chosen": -4.1626739501953125, "logits/rejected": -4.135104656219482, "logps/chosen": -79.43620300292969, "logps/rejected": -93.35115051269531, "loss": 121522.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010508138686418533, "rewards/margins": 0.003030018648132682, "rewards/rejected": -0.013538157567381859, "step": 7580 }, { "epoch": 0.7679854295254478, "grad_norm": 367984.3251279752, "learning_rate": 1.2890712840116933e-07, "logits/chosen": -4.156266212463379, "logits/rejected": -4.143401145935059, "logps/chosen": -56.69196319580078, "logps/rejected": -85.85543060302734, "loss": 131857.55, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009608769789338112, "rewards/margins": 0.002728225663304329, "rewards/rejected": -0.012336996383965015, "step": 7590 }, { "epoch": 0.7689972680360214, "grad_norm": 199454.3630516425, "learning_rate": 1.2834495165279962e-07, "logits/chosen": -4.272221565246582, "logits/rejected": -4.434269905090332, "logps/chosen": -79.4330062866211, "logps/rejected": -134.49029541015625, "loss": 122943.9, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011502350680530071, "rewards/margins": 0.0089148860424757, "rewards/rejected": -0.020417233929038048, "step": 7600 }, { "epoch": 0.7700091065465952, "grad_norm": 220677.98751636886, "learning_rate": 1.2778277490442997e-07, "logits/chosen": -4.230715274810791, "logits/rejected": -4.352440357208252, "logps/chosen": -86.22007751464844, "logps/rejected": -118.42427062988281, "loss": 122929.0875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009930550120770931, "rewards/margins": 0.0071978019550442696, "rewards/rejected": -0.0171283520758152, "step": 7610 }, { "epoch": 0.7710209450571689, "grad_norm": 396323.36112648254, "learning_rate": 1.2722059815606026e-07, "logits/chosen": -3.7869484424591064, "logits/rejected": -3.754040479660034, "logps/chosen": -81.15708923339844, "logps/rejected": -114.85099792480469, "loss": 110036.15, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01007847674190998, "rewards/margins": 0.007466270122677088, "rewards/rejected": -0.01754474639892578, "step": 7620 }, { "epoch": 0.7720327835677426, "grad_norm": 244688.98299049883, "learning_rate": 1.2665842140769058e-07, "logits/chosen": -3.830481767654419, "logits/rejected": -3.675891876220703, "logps/chosen": -100.64971923828125, "logps/rejected": -126.15092468261719, "loss": 124378.875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00886483769863844, "rewards/margins": 0.003194875316694379, "rewards/rejected": -0.012059712782502174, "step": 7630 }, { "epoch": 0.7730446220783163, "grad_norm": 42279.55186993153, "learning_rate": 1.260962446593209e-07, "logits/chosen": -3.865473985671997, "logits/rejected": -3.9679951667785645, "logps/chosen": -116.35069274902344, "logps/rejected": -120.131103515625, "loss": 113388.45, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011626197025179863, "rewards/margins": 0.0007924248347990215, "rewards/rejected": -0.012418623082339764, "step": 7640 }, { "epoch": 0.77405646058889, "grad_norm": 282982.67975961464, "learning_rate": 1.255340679109512e-07, "logits/chosen": -3.9883294105529785, "logits/rejected": -3.8217086791992188, "logps/chosen": -100.94316101074219, "logps/rejected": -138.47598266601562, "loss": 132408.825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.014486072584986687, "rewards/margins": 0.004970858804881573, "rewards/rejected": -0.019456932321190834, "step": 7650 }, { "epoch": 0.7750682990994637, "grad_norm": 261051.77226199364, "learning_rate": 1.249718911625815e-07, "logits/chosen": -4.372532844543457, "logits/rejected": -4.348183631896973, "logps/chosen": -82.46533966064453, "logps/rejected": -109.20174407958984, "loss": 102204.5875, "rewards/accuracies": 0.5, "rewards/chosen": -0.010506520047783852, "rewards/margins": 0.0019932836294174194, "rewards/rejected": -0.012499802745878696, "step": 7660 }, { "epoch": 0.7760801376100375, "grad_norm": 195647.34378096735, "learning_rate": 1.2440971441421183e-07, "logits/chosen": -4.0061445236206055, "logits/rejected": -4.041876792907715, "logps/chosen": -109.42601013183594, "logps/rejected": -91.19308471679688, "loss": 102420.0813, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008119629696011543, "rewards/margins": 0.0017634708201512694, "rewards/rejected": -0.009883100166916847, "step": 7670 }, { "epoch": 0.7770919761206112, "grad_norm": 413595.53772352205, "learning_rate": 1.2384753766584214e-07, "logits/chosen": -4.181981563568115, "logits/rejected": -4.223334312438965, "logps/chosen": -117.95677185058594, "logps/rejected": -99.30319213867188, "loss": 115422.0625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010493604466319084, "rewards/margins": 0.004705050494521856, "rewards/rejected": -0.015198653563857079, "step": 7680 }, { "epoch": 0.7781038146311848, "grad_norm": 312180.9757344499, "learning_rate": 1.2328536091747246e-07, "logits/chosen": -4.372924327850342, "logits/rejected": -4.450977325439453, "logps/chosen": -81.51934814453125, "logps/rejected": -94.21646881103516, "loss": 127364.4875, "rewards/accuracies": 0.75, "rewards/chosen": -0.008761035278439522, "rewards/margins": 0.005868414416909218, "rewards/rejected": -0.014629448764026165, "step": 7690 }, { "epoch": 0.7791156531417586, "grad_norm": 406637.53252729645, "learning_rate": 1.2272318416910276e-07, "logits/chosen": -4.015949249267578, "logits/rejected": -4.098971366882324, "logps/chosen": -71.97998046875, "logps/rejected": -83.68016815185547, "loss": 122039.8, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.00652660196647048, "rewards/margins": 0.00547502376139164, "rewards/rejected": -0.012001624330878258, "step": 7700 }, { "epoch": 0.7801274916523323, "grad_norm": 315988.51338501053, "learning_rate": 1.2216100742073307e-07, "logits/chosen": -3.286313533782959, "logits/rejected": -3.4293651580810547, "logps/chosen": -102.16381072998047, "logps/rejected": -111.79399108886719, "loss": 118384.2375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013412846252322197, "rewards/margins": 0.0026832891162484884, "rewards/rejected": -0.01609613373875618, "step": 7710 }, { "epoch": 0.781139330162906, "grad_norm": 194103.42821299014, "learning_rate": 1.215988306723634e-07, "logits/chosen": -4.252665042877197, "logits/rejected": -4.051205635070801, "logps/chosen": -74.13288879394531, "logps/rejected": -54.647056579589844, "loss": 101701.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011082599870860577, "rewards/margins": -0.002832916798070073, "rewards/rejected": -0.008249682374298573, "step": 7720 }, { "epoch": 0.7821511686734797, "grad_norm": 395493.6542249166, "learning_rate": 1.210366539239937e-07, "logits/chosen": -4.32069730758667, "logits/rejected": -4.169463634490967, "logps/chosen": -87.7716293334961, "logps/rejected": -108.79273986816406, "loss": 108014.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009733812883496284, "rewards/margins": 0.003363256109878421, "rewards/rejected": -0.013097068294882774, "step": 7730 }, { "epoch": 0.7831630071840534, "grad_norm": 189943.24615426158, "learning_rate": 1.20474477175624e-07, "logits/chosen": -3.845959424972534, "logits/rejected": -3.7005763053894043, "logps/chosen": -70.75074768066406, "logps/rejected": -105.17501068115234, "loss": 125121.725, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004756935406476259, "rewards/margins": 0.004093333147466183, "rewards/rejected": -0.008850268088281155, "step": 7740 }, { "epoch": 0.7841748456946271, "grad_norm": 476904.2104263206, "learning_rate": 1.1991230042725432e-07, "logits/chosen": -3.8478267192840576, "logits/rejected": -4.118649005889893, "logps/chosen": -100.62992095947266, "logps/rejected": -157.14251708984375, "loss": 105462.225, "rewards/accuracies": 0.75, "rewards/chosen": -0.010639457032084465, "rewards/margins": 0.007496605161577463, "rewards/rejected": -0.01813606359064579, "step": 7750 }, { "epoch": 0.7851866842052009, "grad_norm": 307672.55350944644, "learning_rate": 1.1935012367888464e-07, "logits/chosen": -3.7925033569335938, "logits/rejected": -3.825329542160034, "logps/chosen": -80.07588195800781, "logps/rejected": -98.33236694335938, "loss": 116555.2125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008508100174367428, "rewards/margins": 0.0028628918807953596, "rewards/rejected": -0.011370992287993431, "step": 7760 }, { "epoch": 0.7861985227157746, "grad_norm": 397367.25919964927, "learning_rate": 1.1878794693051495e-07, "logits/chosen": -4.09080171585083, "logits/rejected": -3.9986839294433594, "logps/chosen": -76.71125793457031, "logps/rejected": -107.9671630859375, "loss": 112149.475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010640623979270458, "rewards/margins": 0.003631537314504385, "rewards/rejected": -0.014272162690758705, "step": 7770 }, { "epoch": 0.7872103612263482, "grad_norm": 208208.38785006444, "learning_rate": 1.1822577018214527e-07, "logits/chosen": -4.292060852050781, "logits/rejected": -4.353479862213135, "logps/chosen": -71.0816421508789, "logps/rejected": -102.03807067871094, "loss": 113148.875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008054058067500591, "rewards/margins": 0.003483009058982134, "rewards/rejected": -0.011537068523466587, "step": 7780 }, { "epoch": 0.788222199736922, "grad_norm": 310146.1197642975, "learning_rate": 1.1766359343377557e-07, "logits/chosen": -3.5743980407714844, "logits/rejected": -3.6891350746154785, "logps/chosen": -100.74185943603516, "logps/rejected": -124.36262512207031, "loss": 126162.5875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.014500441960990429, "rewards/margins": 0.0008492930792272091, "rewards/rejected": -0.015349733643233776, "step": 7790 }, { "epoch": 0.7892340382474957, "grad_norm": 406850.7624680672, "learning_rate": 1.1710141668540589e-07, "logits/chosen": -3.8483798503875732, "logits/rejected": -3.7976367473602295, "logps/chosen": -109.99488830566406, "logps/rejected": -123.0772476196289, "loss": 127852.9875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0067154644057154655, "rewards/margins": 0.004104310180991888, "rewards/rejected": -0.010819775983691216, "step": 7800 }, { "epoch": 0.7902458767580695, "grad_norm": 420262.5563969256, "learning_rate": 1.165392399370362e-07, "logits/chosen": -3.9101226329803467, "logits/rejected": -3.875485897064209, "logps/chosen": -89.53706359863281, "logps/rejected": -115.1669692993164, "loss": 124744.0375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005075396038591862, "rewards/margins": 0.0034760604612529278, "rewards/rejected": -0.008551456034183502, "step": 7810 }, { "epoch": 0.7912577152686431, "grad_norm": 359503.90155275626, "learning_rate": 1.1597706318866651e-07, "logits/chosen": -4.50681209564209, "logits/rejected": -4.457459449768066, "logps/chosen": -87.36959838867188, "logps/rejected": -68.70368194580078, "loss": 123455.3625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007418530993163586, "rewards/margins": 0.003091733204200864, "rewards/rejected": -0.01051026489585638, "step": 7820 }, { "epoch": 0.7922695537792168, "grad_norm": 332318.2368229668, "learning_rate": 1.1541488644029683e-07, "logits/chosen": -4.147678375244141, "logits/rejected": -4.104836463928223, "logps/chosen": -85.04119873046875, "logps/rejected": -100.5381851196289, "loss": 126781.1125, "rewards/accuracies": 0.75, "rewards/chosen": -0.00886439811438322, "rewards/margins": 0.006912278942763805, "rewards/rejected": -0.015776677057147026, "step": 7830 }, { "epoch": 0.7932813922897906, "grad_norm": 423301.4914880608, "learning_rate": 1.1485270969192714e-07, "logits/chosen": -4.374096393585205, "logits/rejected": -4.1872878074646, "logps/chosen": -104.64073181152344, "logps/rejected": -122.7469711303711, "loss": 120742.2375, "rewards/accuracies": 0.5, "rewards/chosen": -0.011696171946823597, "rewards/margins": 0.002301506232470274, "rewards/rejected": -0.013997676782310009, "step": 7840 }, { "epoch": 0.7942932308003643, "grad_norm": 353120.3897956328, "learning_rate": 1.1429053294355744e-07, "logits/chosen": -3.9731974601745605, "logits/rejected": -3.928518295288086, "logps/chosen": -75.43474578857422, "logps/rejected": -102.15515899658203, "loss": 108309.175, "rewards/accuracies": 0.5, "rewards/chosen": -0.010067204013466835, "rewards/margins": 0.0027014180086553097, "rewards/rejected": -0.012768621556460857, "step": 7850 }, { "epoch": 0.795305069310938, "grad_norm": 379208.34760468506, "learning_rate": 1.1372835619518776e-07, "logits/chosen": -4.272770881652832, "logits/rejected": -4.043082237243652, "logps/chosen": -77.47657775878906, "logps/rejected": -117.0516128540039, "loss": 115131.45, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009439300745725632, "rewards/margins": 0.00762337539345026, "rewards/rejected": -0.017062675207853317, "step": 7860 }, { "epoch": 0.7963169078215117, "grad_norm": 92359.01764186962, "learning_rate": 1.1316617944681807e-07, "logits/chosen": -3.8946356773376465, "logits/rejected": -4.038553714752197, "logps/chosen": -68.43402099609375, "logps/rejected": -85.13797760009766, "loss": 109669.5125, "rewards/accuracies": 0.75, "rewards/chosen": -0.011965210549533367, "rewards/margins": 0.0033139579463750124, "rewards/rejected": -0.015279168263077736, "step": 7870 }, { "epoch": 0.7973287463320854, "grad_norm": 312891.2609664019, "learning_rate": 1.1260400269844839e-07, "logits/chosen": -4.321270942687988, "logits/rejected": -4.210376739501953, "logps/chosen": -90.21158599853516, "logps/rejected": -129.66148376464844, "loss": 118272.0625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.015397314913570881, "rewards/margins": 0.0038171645719558, "rewards/rejected": -0.019214477390050888, "step": 7880 }, { "epoch": 0.7983405848426591, "grad_norm": 859768.4025591754, "learning_rate": 1.120418259500787e-07, "logits/chosen": -4.286216735839844, "logits/rejected": -4.175978183746338, "logps/chosen": -87.1583251953125, "logps/rejected": -143.93447875976562, "loss": 128341.225, "rewards/accuracies": 0.75, "rewards/chosen": -0.010412363335490227, "rewards/margins": 0.013222433626651764, "rewards/rejected": -0.02363479696214199, "step": 7890 }, { "epoch": 0.7993524233532329, "grad_norm": 462526.04533248016, "learning_rate": 1.1147964920170901e-07, "logits/chosen": -3.9930572509765625, "logits/rejected": -3.962195634841919, "logps/chosen": -103.79779052734375, "logps/rejected": -115.02079010009766, "loss": 124204.775, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.012525690719485283, "rewards/margins": 0.00300039816647768, "rewards/rejected": -0.015526088885962963, "step": 7900 }, { "epoch": 0.8003642618638065, "grad_norm": 423970.06475317676, "learning_rate": 1.1091747245333932e-07, "logits/chosen": -4.050687313079834, "logits/rejected": -4.038154125213623, "logps/chosen": -83.9175796508789, "logps/rejected": -94.28934478759766, "loss": 124771.85, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00807961355894804, "rewards/margins": 0.004804011434316635, "rewards/rejected": -0.0128836240619421, "step": 7910 }, { "epoch": 0.8013761003743802, "grad_norm": 236672.8788389101, "learning_rate": 1.1035529570496964e-07, "logits/chosen": -3.983524799346924, "logits/rejected": -4.007144927978516, "logps/chosen": -42.591835021972656, "logps/rejected": -59.558616638183594, "loss": 89069.8313, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008798202499747276, "rewards/margins": 0.000770292361266911, "rewards/rejected": -0.00956849567592144, "step": 7920 }, { "epoch": 0.802387938884954, "grad_norm": 859399.3789072495, "learning_rate": 1.0979311895659995e-07, "logits/chosen": -3.195596218109131, "logits/rejected": -2.963515043258667, "logps/chosen": -236.18484497070312, "logps/rejected": -266.97296142578125, "loss": 104855.2188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00803174264729023, "rewards/margins": 0.00375569611787796, "rewards/rejected": -0.01178743876516819, "step": 7930 }, { "epoch": 0.8033997773955277, "grad_norm": 469356.20411661704, "learning_rate": 1.0923094220823026e-07, "logits/chosen": -3.8656868934631348, "logits/rejected": -3.8172924518585205, "logps/chosen": -106.8467788696289, "logps/rejected": -99.70085906982422, "loss": 116459.7875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009702866896986961, "rewards/margins": 0.0028607037384063005, "rewards/rejected": -0.012563569471240044, "step": 7940 }, { "epoch": 0.8044116159061014, "grad_norm": 388207.2551766517, "learning_rate": 1.0866876545986058e-07, "logits/chosen": -3.960106611251831, "logits/rejected": -4.014041900634766, "logps/chosen": -79.86503601074219, "logps/rejected": -99.77107238769531, "loss": 95958.2, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010094651952385902, "rewards/margins": 0.0019590542651712894, "rewards/rejected": -0.012053707614541054, "step": 7950 }, { "epoch": 0.8054234544166751, "grad_norm": 271624.2456423673, "learning_rate": 1.081065887114909e-07, "logits/chosen": -3.934523820877075, "logits/rejected": -3.872987747192383, "logps/chosen": -126.831298828125, "logps/rejected": -100.11951446533203, "loss": 115958.15, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008265312761068344, "rewards/margins": 0.0013678728137165308, "rewards/rejected": -0.009633185341954231, "step": 7960 }, { "epoch": 0.8064352929272488, "grad_norm": 465956.05404359574, "learning_rate": 1.0754441196312119e-07, "logits/chosen": -3.447449207305908, "logits/rejected": -3.470703125, "logps/chosen": -91.19917297363281, "logps/rejected": -100.43378448486328, "loss": 118826.45, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00919909868389368, "rewards/margins": -0.0009367609163746238, "rewards/rejected": -0.008262337185442448, "step": 7970 }, { "epoch": 0.8074471314378225, "grad_norm": 436513.16348094016, "learning_rate": 1.0698223521475151e-07, "logits/chosen": -3.8368663787841797, "logits/rejected": -3.921740770339966, "logps/chosen": -197.56578063964844, "logps/rejected": -172.3467254638672, "loss": 127866.8375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012048641219735146, "rewards/margins": 0.0047609503380954266, "rewards/rejected": -0.01680959202349186, "step": 7980 }, { "epoch": 0.8084589699483963, "grad_norm": 471412.9088776895, "learning_rate": 1.0642005846638183e-07, "logits/chosen": -3.9598898887634277, "logits/rejected": -3.891242504119873, "logps/chosen": -140.3300018310547, "logps/rejected": -132.6756134033203, "loss": 123070.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012242230586707592, "rewards/margins": 0.0018555953865870833, "rewards/rejected": -0.014097826555371284, "step": 7990 }, { "epoch": 0.8094708084589699, "grad_norm": 297315.8278713908, "learning_rate": 1.0585788171801213e-07, "logits/chosen": -3.728074312210083, "logits/rejected": -3.801783800125122, "logps/chosen": -68.59547424316406, "logps/rejected": -92.35982513427734, "loss": 111800.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008158531971275806, "rewards/margins": 0.006912999786436558, "rewards/rejected": -0.01507152896374464, "step": 8000 }, { "epoch": 0.8104826469695436, "grad_norm": 422267.3997490296, "learning_rate": 1.0529570496964245e-07, "logits/chosen": -3.945225477218628, "logits/rejected": -4.032832145690918, "logps/chosen": -98.76021575927734, "logps/rejected": -140.8959197998047, "loss": 119859.05, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007340181618928909, "rewards/margins": 0.005221239291131496, "rewards/rejected": -0.012561419978737831, "step": 8010 }, { "epoch": 0.8114944854801174, "grad_norm": 228802.97576462853, "learning_rate": 1.0473352822127277e-07, "logits/chosen": -3.8701820373535156, "logits/rejected": -3.940556287765503, "logps/chosen": -101.10469818115234, "logps/rejected": -97.4957046508789, "loss": 111547.35, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009226640686392784, "rewards/margins": 0.002123087178915739, "rewards/rejected": -0.01134972833096981, "step": 8020 }, { "epoch": 0.8125063239906911, "grad_norm": 324150.1486248847, "learning_rate": 1.0417135147290308e-07, "logits/chosen": -3.700404644012451, "logits/rejected": -3.697136640548706, "logps/chosen": -83.92759704589844, "logps/rejected": -105.17192077636719, "loss": 112479.575, "rewards/accuracies": 0.5, "rewards/chosen": -0.010281701572239399, "rewards/margins": 0.004277985543012619, "rewards/rejected": -0.014559686183929443, "step": 8030 }, { "epoch": 0.8135181625012649, "grad_norm": 286750.758654564, "learning_rate": 1.0360917472453338e-07, "logits/chosen": -4.037090301513672, "logits/rejected": -4.043478488922119, "logps/chosen": -99.37135314941406, "logps/rejected": -93.77800750732422, "loss": 120132.1, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010845848359167576, "rewards/margins": 0.001078177010640502, "rewards/rejected": -0.011924026533961296, "step": 8040 }, { "epoch": 0.8145300010118385, "grad_norm": 288713.07285499165, "learning_rate": 1.030469979761637e-07, "logits/chosen": -4.2317304611206055, "logits/rejected": -4.279356002807617, "logps/chosen": -107.75370788574219, "logps/rejected": -133.5959930419922, "loss": 122417.8375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008320098742842674, "rewards/margins": 0.004185177851468325, "rewards/rejected": -0.012505276128649712, "step": 8050 }, { "epoch": 0.8155418395224122, "grad_norm": 358956.46564106893, "learning_rate": 1.0248482122779402e-07, "logits/chosen": -4.234696388244629, "logits/rejected": -4.274272918701172, "logps/chosen": -101.23332977294922, "logps/rejected": -97.89191436767578, "loss": 119024.975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008960237726569176, "rewards/margins": 0.0017022531246766448, "rewards/rejected": -0.010662492364645004, "step": 8060 }, { "epoch": 0.816553678032986, "grad_norm": 163294.25684722565, "learning_rate": 1.0192264447942433e-07, "logits/chosen": -4.062541961669922, "logits/rejected": -3.9996254444122314, "logps/chosen": -93.48268127441406, "logps/rejected": -100.46837615966797, "loss": 109344.3125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008667383342981339, "rewards/margins": 0.004659014288336039, "rewards/rejected": -0.013326396234333515, "step": 8070 }, { "epoch": 0.8175655165435597, "grad_norm": 116413.97854469469, "learning_rate": 1.0136046773105464e-07, "logits/chosen": -4.131561279296875, "logits/rejected": -4.090974807739258, "logps/chosen": -73.47103118896484, "logps/rejected": -92.22039794921875, "loss": 113199.8625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008663355372846127, "rewards/margins": 0.006515624932944775, "rewards/rejected": -0.015178983099758625, "step": 8080 }, { "epoch": 0.8185773550541333, "grad_norm": 272766.6794474993, "learning_rate": 1.0079829098268495e-07, "logits/chosen": -4.23798131942749, "logits/rejected": -4.008040428161621, "logps/chosen": -56.184181213378906, "logps/rejected": -113.3834457397461, "loss": 121295.0375, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.006203674245625734, "rewards/margins": 0.010324726812541485, "rewards/rejected": -0.016528401523828506, "step": 8090 }, { "epoch": 0.819589193564707, "grad_norm": 407856.75367380306, "learning_rate": 1.0023611423431526e-07, "logits/chosen": -4.066802024841309, "logits/rejected": -4.231095314025879, "logps/chosen": -92.92910766601562, "logps/rejected": -99.4348373413086, "loss": 125306.375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012154284864664078, "rewards/margins": 0.0033647671807557344, "rewards/rejected": -0.015519052743911743, "step": 8100 }, { "epoch": 0.8206010320752808, "grad_norm": 253358.73920636546, "learning_rate": 9.967393748594557e-08, "logits/chosen": -3.433570146560669, "logits/rejected": -3.449763059616089, "logps/chosen": -116.4591293334961, "logps/rejected": -142.70030212402344, "loss": 109000.3, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008464435115456581, "rewards/margins": 0.0024151455145329237, "rewards/rejected": -0.010879581794142723, "step": 8110 }, { "epoch": 0.8216128705858545, "grad_norm": 384871.74344483897, "learning_rate": 9.911176073757589e-08, "logits/chosen": -4.189047336578369, "logits/rejected": -4.237068176269531, "logps/chosen": -80.07675170898438, "logps/rejected": -88.99113464355469, "loss": 97146.7625, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0045607625506818295, "rewards/margins": 0.002545821014791727, "rewards/rejected": -0.007106582634150982, "step": 8120 }, { "epoch": 0.8226247090964283, "grad_norm": 594300.2374807876, "learning_rate": 9.854958398920621e-08, "logits/chosen": -4.333995819091797, "logits/rejected": -4.495603084564209, "logps/chosen": -104.78947448730469, "logps/rejected": -107.2220458984375, "loss": 108492.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004412648268043995, "rewards/margins": 0.004461991600692272, "rewards/rejected": -0.008874641731381416, "step": 8130 }, { "epoch": 0.8236365476070019, "grad_norm": 388163.8867996168, "learning_rate": 9.798740724083652e-08, "logits/chosen": -4.068693161010742, "logits/rejected": -4.122685432434082, "logps/chosen": -140.57508850097656, "logps/rejected": -142.74807739257812, "loss": 124017.925, "rewards/accuracies": 0.75, "rewards/chosen": -0.00579815125092864, "rewards/margins": 0.0028135315515100956, "rewards/rejected": -0.00861168373376131, "step": 8140 }, { "epoch": 0.8246483861175756, "grad_norm": 330463.9296594668, "learning_rate": 9.742523049246682e-08, "logits/chosen": -4.051194190979004, "logits/rejected": -4.172976493835449, "logps/chosen": -89.34901428222656, "logps/rejected": -104.2591552734375, "loss": 120785.825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0074844686314463615, "rewards/margins": 0.004208378493785858, "rewards/rejected": -0.011692846193909645, "step": 8150 }, { "epoch": 0.8256602246281494, "grad_norm": 264635.2258624716, "learning_rate": 9.686305374409714e-08, "logits/chosen": -4.1142897605896, "logits/rejected": -4.3055830001831055, "logps/chosen": -105.7720947265625, "logps/rejected": -161.13589477539062, "loss": 122940.025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01123170368373394, "rewards/margins": 0.009346556849777699, "rewards/rejected": -0.020578259602189064, "step": 8160 }, { "epoch": 0.8266720631387231, "grad_norm": 213185.66430724005, "learning_rate": 9.630087699572745e-08, "logits/chosen": -3.9806270599365234, "logits/rejected": -3.9398884773254395, "logps/chosen": -94.42872619628906, "logps/rejected": -146.9995574951172, "loss": 109809.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009127756580710411, "rewards/margins": 0.005660745315253735, "rewards/rejected": -0.01478850282728672, "step": 8170 }, { "epoch": 0.8276839016492967, "grad_norm": 292020.78672077553, "learning_rate": 9.573870024735777e-08, "logits/chosen": -4.198493003845215, "logits/rejected": -4.084284782409668, "logps/chosen": -106.89278411865234, "logps/rejected": -127.61109924316406, "loss": 113457.3, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011437471024692059, "rewards/margins": 0.003870918182656169, "rewards/rejected": -0.015308389440178871, "step": 8180 }, { "epoch": 0.8286957401598705, "grad_norm": 370823.93560708786, "learning_rate": 9.517652349898808e-08, "logits/chosen": -4.313981056213379, "logits/rejected": -4.372199058532715, "logps/chosen": -64.77915954589844, "logps/rejected": -75.48443603515625, "loss": 114353.6375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010825781151652336, "rewards/margins": 0.0032162577845156193, "rewards/rejected": -0.014042039401829243, "step": 8190 }, { "epoch": 0.8297075786704442, "grad_norm": 598207.9178575628, "learning_rate": 9.461434675061839e-08, "logits/chosen": -4.242364406585693, "logits/rejected": -4.2031097412109375, "logps/chosen": -80.56177520751953, "logps/rejected": -103.88216400146484, "loss": 120314.3, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00759007316082716, "rewards/margins": 0.004700451157987118, "rewards/rejected": -0.012290524318814278, "step": 8200 }, { "epoch": 0.8307194171810179, "grad_norm": 333612.9900197379, "learning_rate": 9.40521700022487e-08, "logits/chosen": -4.250183582305908, "logits/rejected": -4.142926216125488, "logps/chosen": -102.42399597167969, "logps/rejected": -143.11001586914062, "loss": 118815.85, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009032492525875568, "rewards/margins": 0.004258002620190382, "rewards/rejected": -0.013290496543049812, "step": 8210 }, { "epoch": 0.8317312556915917, "grad_norm": 457497.5729390148, "learning_rate": 9.348999325387901e-08, "logits/chosen": -4.085418701171875, "logits/rejected": -4.085152626037598, "logps/chosen": -110.159912109375, "logps/rejected": -101.8358154296875, "loss": 129912.6375, "rewards/accuracies": 0.5, "rewards/chosen": -0.014170791022479534, "rewards/margins": -0.0030458748806267977, "rewards/rejected": -0.011124914512038231, "step": 8220 }, { "epoch": 0.8327430942021653, "grad_norm": 331850.23506711435, "learning_rate": 9.292781650550932e-08, "logits/chosen": -4.125381946563721, "logits/rejected": -4.140069007873535, "logps/chosen": -113.55158996582031, "logps/rejected": -120.9760513305664, "loss": 135448.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012177357450127602, "rewards/margins": 0.002386145992204547, "rewards/rejected": -0.014563503675162792, "step": 8230 }, { "epoch": 0.833754932712739, "grad_norm": 118324.94045731366, "learning_rate": 9.236563975713964e-08, "logits/chosen": -4.111886501312256, "logits/rejected": -4.046170711517334, "logps/chosen": -118.06558990478516, "logps/rejected": -149.43795776367188, "loss": 98423.5375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008153899572789669, "rewards/margins": 0.00458982540294528, "rewards/rejected": -0.012743724510073662, "step": 8240 }, { "epoch": 0.8347667712233128, "grad_norm": 441713.5859668366, "learning_rate": 9.180346300876996e-08, "logits/chosen": -3.916966199874878, "logits/rejected": -3.9287807941436768, "logps/chosen": -106.33534240722656, "logps/rejected": -142.1265411376953, "loss": 133195.4, "rewards/accuracies": 0.75, "rewards/chosen": -0.009692870080471039, "rewards/margins": 0.0072878384962677956, "rewards/rejected": -0.01698070950806141, "step": 8250 }, { "epoch": 0.8357786097338865, "grad_norm": 352171.0185721642, "learning_rate": 9.124128626040028e-08, "logits/chosen": -3.861276626586914, "logits/rejected": -3.805842876434326, "logps/chosen": -86.11549377441406, "logps/rejected": -107.51365661621094, "loss": 116733.1, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010991652496159077, "rewards/margins": 0.007366400212049484, "rewards/rejected": -0.018358051776885986, "step": 8260 }, { "epoch": 0.8367904482444601, "grad_norm": 159.15546516688065, "learning_rate": 9.067910951203057e-08, "logits/chosen": -3.808699131011963, "logits/rejected": -3.8431365489959717, "logps/chosen": -117.88509368896484, "logps/rejected": -171.83706665039062, "loss": 117544.075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00911770947277546, "rewards/margins": 0.004661300219595432, "rewards/rejected": -0.013779009692370892, "step": 8270 }, { "epoch": 0.8378022867550339, "grad_norm": 185281.104526655, "learning_rate": 9.011693276366089e-08, "logits/chosen": -3.9734702110290527, "logits/rejected": -3.85813570022583, "logps/chosen": -94.45865631103516, "logps/rejected": -107.853759765625, "loss": 124293.0875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010564441792666912, "rewards/margins": 0.004766702186316252, "rewards/rejected": -0.015331143513321877, "step": 8280 }, { "epoch": 0.8388141252656076, "grad_norm": 245322.01254128746, "learning_rate": 8.95547560152912e-08, "logits/chosen": -3.7243504524230957, "logits/rejected": -3.6368496417999268, "logps/chosen": -130.2371826171875, "logps/rejected": -161.13418579101562, "loss": 124531.325, "rewards/accuracies": 0.75, "rewards/chosen": -0.01189443003386259, "rewards/margins": 0.009856676682829857, "rewards/rejected": -0.021751107648015022, "step": 8290 }, { "epoch": 0.8398259637761814, "grad_norm": 631563.9445745795, "learning_rate": 8.899257926692151e-08, "logits/chosen": -4.010063648223877, "logits/rejected": -4.107161521911621, "logps/chosen": -284.1063537597656, "logps/rejected": -268.1418762207031, "loss": 101403.4438, "rewards/accuracies": 0.5, "rewards/chosen": -0.010292327031493187, "rewards/margins": 0.0014413518365472555, "rewards/rejected": -0.011733678169548512, "step": 8300 }, { "epoch": 0.8408378022867551, "grad_norm": 532910.7406020928, "learning_rate": 8.843040251855183e-08, "logits/chosen": -4.3089280128479, "logits/rejected": -4.456959247589111, "logps/chosen": -81.44640350341797, "logps/rejected": -113.5396957397461, "loss": 123927.625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010695123113691807, "rewards/margins": 0.003534133080393076, "rewards/rejected": -0.014229255728423595, "step": 8310 }, { "epoch": 0.8418496407973287, "grad_norm": 350778.3805673061, "learning_rate": 8.786822577018215e-08, "logits/chosen": -4.224398612976074, "logits/rejected": -4.236653804779053, "logps/chosen": -88.74833679199219, "logps/rejected": -79.84143829345703, "loss": 113341.95, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007341000251471996, "rewards/margins": 0.004217024892568588, "rewards/rejected": -0.01155802607536316, "step": 8320 }, { "epoch": 0.8428614793079025, "grad_norm": 388582.17411961424, "learning_rate": 8.730604902181244e-08, "logits/chosen": -4.475998878479004, "logits/rejected": -4.428420066833496, "logps/chosen": -76.8545150756836, "logps/rejected": -94.07881927490234, "loss": 112793.9875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009790563024580479, "rewards/margins": 0.0038929313886910677, "rewards/rejected": -0.013683495111763477, "step": 8330 }, { "epoch": 0.8438733178184762, "grad_norm": 540116.6639666132, "learning_rate": 8.674387227344276e-08, "logits/chosen": -4.094708442687988, "logits/rejected": -4.181640625, "logps/chosen": -62.035728454589844, "logps/rejected": -125.70023345947266, "loss": 121455.3375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008264906704425812, "rewards/margins": 0.00605298625305295, "rewards/rejected": -0.014317894354462624, "step": 8340 }, { "epoch": 0.8448851563290499, "grad_norm": 394001.9711005937, "learning_rate": 8.618169552507308e-08, "logits/chosen": -4.090094089508057, "logits/rejected": -3.9878737926483154, "logps/chosen": -88.64405822753906, "logps/rejected": -99.89388275146484, "loss": 114469.8125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010854559950530529, "rewards/margins": 0.00167234952095896, "rewards/rejected": -0.012526909820735455, "step": 8350 }, { "epoch": 0.8458969948396236, "grad_norm": 393565.20065561763, "learning_rate": 8.56195187767034e-08, "logits/chosen": -3.818903684616089, "logits/rejected": -3.838106632232666, "logps/chosen": -85.51871490478516, "logps/rejected": -91.48805236816406, "loss": 101681.125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010507269762456417, "rewards/margins": 0.0037950787227600813, "rewards/rejected": -0.014302348718047142, "step": 8360 }, { "epoch": 0.8469088333501973, "grad_norm": 452736.26074413065, "learning_rate": 8.50573420283337e-08, "logits/chosen": -4.1508893966674805, "logits/rejected": -4.137645721435547, "logps/chosen": -83.77490234375, "logps/rejected": -95.26679229736328, "loss": 119818.2125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009693371132016182, "rewards/margins": 0.001196564524434507, "rewards/rejected": -0.010889935307204723, "step": 8370 }, { "epoch": 0.847920671860771, "grad_norm": 363054.32563244156, "learning_rate": 8.449516527996402e-08, "logits/chosen": -4.044825077056885, "logits/rejected": -4.183990955352783, "logps/chosen": -99.4660415649414, "logps/rejected": -82.24153137207031, "loss": 120148.4125, "rewards/accuracies": 0.5, "rewards/chosen": -0.01062183640897274, "rewards/margins": 0.0026903848629444838, "rewards/rejected": -0.013312222436070442, "step": 8380 }, { "epoch": 0.8489325103713448, "grad_norm": 264734.08418687084, "learning_rate": 8.393298853159433e-08, "logits/chosen": -4.138003826141357, "logits/rejected": -4.34324312210083, "logps/chosen": -76.33064270019531, "logps/rejected": -99.98152160644531, "loss": 111067.6375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00950196199119091, "rewards/margins": 0.0031435999553650618, "rewards/rejected": -0.012645562179386616, "step": 8390 }, { "epoch": 0.8499443488819185, "grad_norm": 421050.26299494907, "learning_rate": 8.337081178322463e-08, "logits/chosen": -4.024688720703125, "logits/rejected": -4.228170394897461, "logps/chosen": -101.98949432373047, "logps/rejected": -135.80551147460938, "loss": 117349.65, "rewards/accuracies": 0.75, "rewards/chosen": -0.008566834032535553, "rewards/margins": 0.006575448904186487, "rewards/rejected": -0.015142282471060753, "step": 8400 }, { "epoch": 0.8509561873924921, "grad_norm": 135066.65693192344, "learning_rate": 8.280863503485495e-08, "logits/chosen": -4.081394195556641, "logits/rejected": -4.121504306793213, "logps/chosen": -86.45903015136719, "logps/rejected": -100.47832489013672, "loss": 113872.1375, "rewards/accuracies": 0.5, "rewards/chosen": -0.008455652743577957, "rewards/margins": 0.005061333533376455, "rewards/rejected": -0.0135169867426157, "step": 8410 }, { "epoch": 0.8519680259030659, "grad_norm": 364171.66511539876, "learning_rate": 8.224645828648527e-08, "logits/chosen": -4.086501598358154, "logits/rejected": -4.227648735046387, "logps/chosen": -91.18257904052734, "logps/rejected": -128.17584228515625, "loss": 110639.1375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0069760219193995, "rewards/margins": 0.008608943782746792, "rewards/rejected": -0.015584966167807579, "step": 8420 }, { "epoch": 0.8529798644136396, "grad_norm": 14548.746546072567, "learning_rate": 8.168428153811558e-08, "logits/chosen": -3.7782530784606934, "logits/rejected": -3.891939163208008, "logps/chosen": -125.4861068725586, "logps/rejected": -156.28895568847656, "loss": 110169.8125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009663982316851616, "rewards/margins": 0.002515354659408331, "rewards/rejected": -0.012179335579276085, "step": 8430 }, { "epoch": 0.8539917029242133, "grad_norm": 408150.0714805123, "learning_rate": 8.11221047897459e-08, "logits/chosen": -4.184926509857178, "logits/rejected": -4.3855109214782715, "logps/chosen": -87.87274932861328, "logps/rejected": -106.99827575683594, "loss": 119623.325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00733520369976759, "rewards/margins": 0.005197050981223583, "rewards/rejected": -0.012532254680991173, "step": 8440 }, { "epoch": 0.855003541434787, "grad_norm": 353615.50874147663, "learning_rate": 8.05599280413762e-08, "logits/chosen": -4.292458534240723, "logits/rejected": -4.262897491455078, "logps/chosen": -67.1311264038086, "logps/rejected": -84.53300476074219, "loss": 125253.85, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006808619014918804, "rewards/margins": 0.001254942617379129, "rewards/rejected": -0.008063562214374542, "step": 8450 }, { "epoch": 0.8560153799453607, "grad_norm": 460986.52273376234, "learning_rate": 7.999775129300652e-08, "logits/chosen": -3.900956630706787, "logits/rejected": -3.8937606811523438, "logps/chosen": -88.46359252929688, "logps/rejected": -108.76072692871094, "loss": 106738.3625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01025934424251318, "rewards/margins": 0.0057234978303313255, "rewards/rejected": -0.015982840210199356, "step": 8460 }, { "epoch": 0.8570272184559344, "grad_norm": 278038.6870657503, "learning_rate": 7.943557454463683e-08, "logits/chosen": -4.282145977020264, "logits/rejected": -4.244583606719971, "logps/chosen": -113.19022369384766, "logps/rejected": -112.8884506225586, "loss": 123155.75, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006576386746019125, "rewards/margins": 0.003116356208920479, "rewards/rejected": -0.009692742489278316, "step": 8470 }, { "epoch": 0.8580390569665082, "grad_norm": 280930.0167221045, "learning_rate": 7.887339779626714e-08, "logits/chosen": -4.2119903564453125, "logits/rejected": -4.452739715576172, "logps/chosen": -76.40629577636719, "logps/rejected": -96.99964904785156, "loss": 118569.9375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00961306318640709, "rewards/margins": 0.00679431390017271, "rewards/rejected": -0.016407376155257225, "step": 8480 }, { "epoch": 0.8590508954770819, "grad_norm": 456886.7215559461, "learning_rate": 7.831122104789746e-08, "logits/chosen": -3.8890480995178223, "logits/rejected": -3.8941104412078857, "logps/chosen": -96.68122100830078, "logps/rejected": -133.11509704589844, "loss": 125149.4375, "rewards/accuracies": 0.75, "rewards/chosen": -0.00890798307955265, "rewards/margins": 0.004146319814026356, "rewards/rejected": -0.01305430382490158, "step": 8490 }, { "epoch": 0.8600627339876555, "grad_norm": 339366.15112263506, "learning_rate": 7.774904429952777e-08, "logits/chosen": -3.788590669631958, "logits/rejected": -3.6142513751983643, "logps/chosen": -68.34542846679688, "logps/rejected": -82.78410339355469, "loss": 116252.1875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011244475841522217, "rewards/margins": 0.0034841049928218126, "rewards/rejected": -0.014728580601513386, "step": 8500 }, { "epoch": 0.8610745724982293, "grad_norm": 484470.038193766, "learning_rate": 7.718686755115807e-08, "logits/chosen": -4.229935169219971, "logits/rejected": -4.23465633392334, "logps/chosen": -87.8543701171875, "logps/rejected": -111.2047348022461, "loss": 105731.975, "rewards/accuracies": 0.5, "rewards/chosen": -0.0067582265473902225, "rewards/margins": 0.006072984077036381, "rewards/rejected": -0.01283121109008789, "step": 8510 }, { "epoch": 0.862086411008803, "grad_norm": 341207.1487801134, "learning_rate": 7.662469080278839e-08, "logits/chosen": -3.673260450363159, "logits/rejected": -3.772216320037842, "logps/chosen": -88.59552001953125, "logps/rejected": -112.7111587524414, "loss": 111391.2875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.01217761728912592, "rewards/margins": 0.003498102305456996, "rewards/rejected": -0.01567571982741356, "step": 8520 }, { "epoch": 0.8630982495193767, "grad_norm": 245574.10236848507, "learning_rate": 7.60625140544187e-08, "logits/chosen": -4.155217170715332, "logits/rejected": -4.014200687408447, "logps/chosen": -91.07817077636719, "logps/rejected": -110.82672119140625, "loss": 99356.7563, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011748792603611946, "rewards/margins": 0.004615721292793751, "rewards/rejected": -0.016364512965083122, "step": 8530 }, { "epoch": 0.8641100880299504, "grad_norm": 302251.9091778401, "learning_rate": 7.550033730604902e-08, "logits/chosen": -4.322504997253418, "logits/rejected": -4.317883014678955, "logps/chosen": -93.16816711425781, "logps/rejected": -124.80387878417969, "loss": 117476.4375, "rewards/accuracies": 0.75, "rewards/chosen": -0.0126638887450099, "rewards/margins": 0.0046744393184781075, "rewards/rejected": -0.017338326200842857, "step": 8540 }, { "epoch": 0.8651219265405241, "grad_norm": 269882.7656656947, "learning_rate": 7.493816055767934e-08, "logits/chosen": -4.295327663421631, "logits/rejected": -4.204625129699707, "logps/chosen": -82.03526306152344, "logps/rejected": -107.31734466552734, "loss": 130670.6875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008137856610119343, "rewards/margins": 0.004000414162874222, "rewards/rejected": -0.01213826984167099, "step": 8550 }, { "epoch": 0.8661337650510978, "grad_norm": 230439.7057339642, "learning_rate": 7.437598380930965e-08, "logits/chosen": -4.100316047668457, "logits/rejected": -4.064553260803223, "logps/chosen": -95.85283660888672, "logps/rejected": -124.03543853759766, "loss": 110681.8, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.013596830889582634, "rewards/margins": 0.000848504132591188, "rewards/rejected": -0.014445334672927856, "step": 8560 }, { "epoch": 0.8671456035616716, "grad_norm": 394341.9607890981, "learning_rate": 7.381380706093995e-08, "logits/chosen": -4.069068431854248, "logits/rejected": -4.156402111053467, "logps/chosen": -96.29537963867188, "logps/rejected": -106.07472229003906, "loss": 104583.1438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006865669973194599, "rewards/margins": 0.002488465514034033, "rewards/rejected": -0.009354135021567345, "step": 8570 }, { "epoch": 0.8681574420722453, "grad_norm": 63110.9632486043, "learning_rate": 7.325163031257027e-08, "logits/chosen": -3.8972537517547607, "logits/rejected": -3.929198741912842, "logps/chosen": -87.52639770507812, "logps/rejected": -103.90556335449219, "loss": 112700.7125, "rewards/accuracies": 0.5, "rewards/chosen": -0.00679420679807663, "rewards/margins": 0.0036198210436850786, "rewards/rejected": -0.01041402667760849, "step": 8580 }, { "epoch": 0.869169280582819, "grad_norm": 194537.89551938322, "learning_rate": 7.268945356420058e-08, "logits/chosen": -3.653736114501953, "logits/rejected": -3.706480026245117, "logps/chosen": -92.68396759033203, "logps/rejected": -92.19835662841797, "loss": 116144.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00909310020506382, "rewards/margins": 0.002146177226677537, "rewards/rejected": -0.011239276267588139, "step": 8590 }, { "epoch": 0.8701811190933927, "grad_norm": 311322.7018268617, "learning_rate": 7.212727681583089e-08, "logits/chosen": -3.9579970836639404, "logits/rejected": -3.8592746257781982, "logps/chosen": -88.03755950927734, "logps/rejected": -97.1086654663086, "loss": 109582.95, "rewards/accuracies": 0.5, "rewards/chosen": -0.006060636602342129, "rewards/margins": 0.0005114464438520372, "rewards/rejected": -0.006572083570063114, "step": 8600 }, { "epoch": 0.8711929576039664, "grad_norm": 292696.10895695636, "learning_rate": 7.156510006746121e-08, "logits/chosen": -4.193371295928955, "logits/rejected": -4.175494194030762, "logps/chosen": -117.93431091308594, "logps/rejected": -119.2309341430664, "loss": 116779.3375, "rewards/accuracies": 0.5, "rewards/chosen": -0.009185121394693851, "rewards/margins": 0.0042196535505354404, "rewards/rejected": -0.01340477354824543, "step": 8610 }, { "epoch": 0.8722047961145402, "grad_norm": 236134.07050955773, "learning_rate": 7.100292331909153e-08, "logits/chosen": -4.108538627624512, "logits/rejected": -3.9756901264190674, "logps/chosen": -99.58580780029297, "logps/rejected": -140.28228759765625, "loss": 102007.2563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007594231516122818, "rewards/margins": 0.00785777810961008, "rewards/rejected": -0.015452009625732899, "step": 8620 }, { "epoch": 0.8732166346251138, "grad_norm": 280134.537048553, "learning_rate": 7.044074657072182e-08, "logits/chosen": -4.349753379821777, "logits/rejected": -4.365666389465332, "logps/chosen": -119.95772552490234, "logps/rejected": -127.44145202636719, "loss": 121220.15, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012457159347832203, "rewards/margins": 0.002528610173612833, "rewards/rejected": -0.014985768124461174, "step": 8630 }, { "epoch": 0.8742284731356875, "grad_norm": 352220.58622178575, "learning_rate": 6.987856982235214e-08, "logits/chosen": -4.37808895111084, "logits/rejected": -4.491143703460693, "logps/chosen": -101.60814666748047, "logps/rejected": -83.54839324951172, "loss": 114685.5875, "rewards/accuracies": 0.5, "rewards/chosen": -0.008786764927208424, "rewards/margins": -0.002564677968621254, "rewards/rejected": -0.006222087889909744, "step": 8640 }, { "epoch": 0.8752403116462613, "grad_norm": 403268.9305410876, "learning_rate": 6.931639307398246e-08, "logits/chosen": -4.010319709777832, "logits/rejected": -4.059919357299805, "logps/chosen": -50.8770637512207, "logps/rejected": -76.38798522949219, "loss": 110781.875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007163781672716141, "rewards/margins": 0.007784395478665829, "rewards/rejected": -0.01494817715138197, "step": 8650 }, { "epoch": 0.876252150156835, "grad_norm": 378900.1556593073, "learning_rate": 6.875421632561276e-08, "logits/chosen": -3.5992271900177, "logits/rejected": -3.4975249767303467, "logps/chosen": -76.75928497314453, "logps/rejected": -121.8885269165039, "loss": 126461.6875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01032837014645338, "rewards/margins": 0.006208487786352634, "rewards/rejected": -0.016536857932806015, "step": 8660 }, { "epoch": 0.8772639886674087, "grad_norm": 269869.0681876544, "learning_rate": 6.819203957724308e-08, "logits/chosen": -3.477020263671875, "logits/rejected": -3.4582839012145996, "logps/chosen": -66.64159393310547, "logps/rejected": -76.53962707519531, "loss": 114151.7125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005764378234744072, "rewards/margins": 0.0032883682288229465, "rewards/rejected": -0.009052746929228306, "step": 8670 }, { "epoch": 0.8782758271779824, "grad_norm": 307831.058706998, "learning_rate": 6.76298628288734e-08, "logits/chosen": -4.187370300292969, "logits/rejected": -4.2927021980285645, "logps/chosen": -156.98880004882812, "logps/rejected": -170.23922729492188, "loss": 126271.6125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01180677954107523, "rewards/margins": 0.0028189322911202908, "rewards/rejected": -0.014625711366534233, "step": 8680 }, { "epoch": 0.8792876656885561, "grad_norm": 226014.96913110558, "learning_rate": 6.70676860805037e-08, "logits/chosen": -4.222830772399902, "logits/rejected": -4.230754375457764, "logps/chosen": -109.5527114868164, "logps/rejected": -115.78433990478516, "loss": 126377.8875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006880936678498983, "rewards/margins": 0.005433627869933844, "rewards/rejected": -0.012314563617110252, "step": 8690 }, { "epoch": 0.8802995041991298, "grad_norm": 487080.7159767171, "learning_rate": 6.650550933213401e-08, "logits/chosen": -4.28406286239624, "logits/rejected": -4.276864528656006, "logps/chosen": -132.22976684570312, "logps/rejected": -116.22660064697266, "loss": 123043.5, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01727672666311264, "rewards/margins": -0.0015356873627752066, "rewards/rejected": -0.01574103906750679, "step": 8700 }, { "epoch": 0.8813113427097036, "grad_norm": 418114.235454717, "learning_rate": 6.594333258376433e-08, "logits/chosen": -4.3075385093688965, "logits/rejected": -4.389540195465088, "logps/chosen": -196.93629455566406, "logps/rejected": -198.98704528808594, "loss": 122547.775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.017371278256177902, "rewards/margins": 0.0033461241982877254, "rewards/rejected": -0.020717401057481766, "step": 8710 }, { "epoch": 0.8823231812202772, "grad_norm": 314567.32305535616, "learning_rate": 6.538115583539465e-08, "logits/chosen": -4.145241737365723, "logits/rejected": -4.2290425300598145, "logps/chosen": -95.6786117553711, "logps/rejected": -116.49454498291016, "loss": 121510.75, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009918605908751488, "rewards/margins": 0.0034318086691200733, "rewards/rejected": -0.013350415043532848, "step": 8720 }, { "epoch": 0.8833350197308509, "grad_norm": 596705.8969182066, "learning_rate": 6.481897908702496e-08, "logits/chosen": -4.436038017272949, "logits/rejected": -4.427906513214111, "logps/chosen": -98.1707534790039, "logps/rejected": -118.84698486328125, "loss": 122023.8375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011590873822569847, "rewards/margins": 0.004474039189517498, "rewards/rejected": -0.01606491208076477, "step": 8730 }, { "epoch": 0.8843468582414247, "grad_norm": 459152.3155976506, "learning_rate": 6.425680233865527e-08, "logits/chosen": -4.019091606140137, "logits/rejected": -3.8949337005615234, "logps/chosen": -113.22802734375, "logps/rejected": -106.78239440917969, "loss": 124357.7625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.011915058828890324, "rewards/margins": 0.002486306708306074, "rewards/rejected": -0.01440136693418026, "step": 8740 }, { "epoch": 0.8853586967519984, "grad_norm": 341457.5714164942, "learning_rate": 6.369462559028558e-08, "logits/chosen": -3.9838814735412598, "logits/rejected": -4.019207000732422, "logps/chosen": -61.74604415893555, "logps/rejected": -100.63560485839844, "loss": 95020.35, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006343814544379711, "rewards/margins": 0.009170463308691978, "rewards/rejected": -0.015514279715716839, "step": 8750 }, { "epoch": 0.8863705352625721, "grad_norm": 341470.6936689905, "learning_rate": 6.313244884191589e-08, "logits/chosen": -3.9736411571502686, "logits/rejected": -3.914304733276367, "logps/chosen": -88.4334945678711, "logps/rejected": -111.52935791015625, "loss": 121092.1125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011768588796257973, "rewards/margins": 0.005739509593695402, "rewards/rejected": -0.017508098855614662, "step": 8760 }, { "epoch": 0.8873823737731458, "grad_norm": 421764.36330222077, "learning_rate": 6.25702720935462e-08, "logits/chosen": -3.988992691040039, "logits/rejected": -3.8644192218780518, "logps/chosen": -136.37806701660156, "logps/rejected": -138.86630249023438, "loss": 116257.7875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009052207693457603, "rewards/margins": 0.006942379288375378, "rewards/rejected": -0.015994589775800705, "step": 8770 }, { "epoch": 0.8883942122837195, "grad_norm": 244498.90089477316, "learning_rate": 6.200809534517652e-08, "logits/chosen": -4.201042175292969, "logits/rejected": -4.080817222595215, "logps/chosen": -115.73856353759766, "logps/rejected": -140.98532104492188, "loss": 116305.6375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010114716365933418, "rewards/margins": 0.0046343426220119, "rewards/rejected": -0.014749057590961456, "step": 8780 }, { "epoch": 0.8894060507942932, "grad_norm": 480463.2323343567, "learning_rate": 6.144591859680683e-08, "logits/chosen": -4.1177473068237305, "logits/rejected": -3.950246810913086, "logps/chosen": -94.03702545166016, "logps/rejected": -115.64408874511719, "loss": 126020.4, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.014568922109901905, "rewards/margins": 0.0038861532229930162, "rewards/rejected": -0.018455075100064278, "step": 8790 }, { "epoch": 0.890417889304867, "grad_norm": 591618.4832240726, "learning_rate": 6.088374184843715e-08, "logits/chosen": -4.296426296234131, "logits/rejected": -4.339685916900635, "logps/chosen": -122.1169662475586, "logps/rejected": -120.5469970703125, "loss": 97560.5312, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011644868180155754, "rewards/margins": 0.004072842188179493, "rewards/rejected": -0.015717709437012672, "step": 8800 }, { "epoch": 0.8914297278154406, "grad_norm": 452609.39506245643, "learning_rate": 6.032156510006745e-08, "logits/chosen": -4.370727062225342, "logits/rejected": -4.5599470138549805, "logps/chosen": -62.40937042236328, "logps/rejected": -79.78056335449219, "loss": 125016.275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00614375714212656, "rewards/margins": 0.0038691002409905195, "rewards/rejected": -0.010012857615947723, "step": 8810 }, { "epoch": 0.8924415663260143, "grad_norm": 383569.636961043, "learning_rate": 5.975938835169777e-08, "logits/chosen": -3.2998862266540527, "logits/rejected": -3.1742441654205322, "logps/chosen": -141.77426147460938, "logps/rejected": -161.40777587890625, "loss": 133224.075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010519255883991718, "rewards/margins": 0.014479592442512512, "rewards/rejected": -0.024998843669891357, "step": 8820 }, { "epoch": 0.8934534048365881, "grad_norm": 259085.90011947099, "learning_rate": 5.9197211603328084e-08, "logits/chosen": -4.197716236114502, "logits/rejected": -4.0801472663879395, "logps/chosen": -90.74817657470703, "logps/rejected": -74.2122573852539, "loss": 98956.8687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006990945432335138, "rewards/margins": 0.0015944844344630837, "rewards/rejected": -0.008585428819060326, "step": 8830 }, { "epoch": 0.8944652433471618, "grad_norm": 334687.75494528067, "learning_rate": 5.8635034854958396e-08, "logits/chosen": -4.01959228515625, "logits/rejected": -3.965263843536377, "logps/chosen": -283.20623779296875, "logps/rejected": -308.0642395019531, "loss": 117109.3, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011300744488835335, "rewards/margins": 0.004129254259169102, "rewards/rejected": -0.015429997816681862, "step": 8840 }, { "epoch": 0.8954770818577356, "grad_norm": 34949.47903450482, "learning_rate": 5.807285810658871e-08, "logits/chosen": -4.075701713562012, "logits/rejected": -4.104346752166748, "logps/chosen": -86.24519348144531, "logps/rejected": -90.40750885009766, "loss": 118397.6625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.015905272215604782, "rewards/margins": 0.0032860550563782454, "rewards/rejected": -0.019191324710845947, "step": 8850 }, { "epoch": 0.8964889203683092, "grad_norm": 249558.5487544728, "learning_rate": 5.751068135821902e-08, "logits/chosen": -3.9807517528533936, "logits/rejected": -3.9742088317871094, "logps/chosen": -120.76554870605469, "logps/rejected": -123.37541198730469, "loss": 121863.7, "rewards/accuracies": 0.75, "rewards/chosen": -0.012949919328093529, "rewards/margins": 0.002825247822329402, "rewards/rejected": -0.015775166451931, "step": 8860 }, { "epoch": 0.8975007588788829, "grad_norm": 514924.34339262726, "learning_rate": 5.694850460984933e-08, "logits/chosen": -3.8076813220977783, "logits/rejected": -3.7519326210021973, "logps/chosen": -82.60115051269531, "logps/rejected": -113.09638977050781, "loss": 133323.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01124018058180809, "rewards/margins": 0.004872153513133526, "rewards/rejected": -0.01611233502626419, "step": 8870 }, { "epoch": 0.8985125973894567, "grad_norm": 307784.6399385327, "learning_rate": 5.638632786147965e-08, "logits/chosen": -4.3244123458862305, "logits/rejected": -4.291854381561279, "logps/chosen": -84.3846664428711, "logps/rejected": -97.01918029785156, "loss": 126831.5375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008121268823742867, "rewards/margins": 0.00146322266664356, "rewards/rejected": -0.009584490209817886, "step": 8880 }, { "epoch": 0.8995244359000304, "grad_norm": 537586.3244819761, "learning_rate": 5.582415111310996e-08, "logits/chosen": -3.953510284423828, "logits/rejected": -4.022969722747803, "logps/chosen": -127.1934585571289, "logps/rejected": -121.26444244384766, "loss": 117588.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009778095409274101, "rewards/margins": 0.0015350397443398833, "rewards/rejected": -0.011313135735690594, "step": 8890 }, { "epoch": 0.900536274410604, "grad_norm": 294053.3374754778, "learning_rate": 5.526197436474027e-08, "logits/chosen": -4.093642234802246, "logits/rejected": -4.068291664123535, "logps/chosen": -278.5228576660156, "logps/rejected": -306.2724304199219, "loss": 99692.85, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007864778861403465, "rewards/margins": 0.00521747674793005, "rewards/rejected": -0.01308225654065609, "step": 8900 }, { "epoch": 0.9015481129211778, "grad_norm": 144026.3177622145, "learning_rate": 5.469979761637059e-08, "logits/chosen": -3.955862522125244, "logits/rejected": -3.9472413063049316, "logps/chosen": -122.0195083618164, "logps/rejected": -128.78109741210938, "loss": 116254.975, "rewards/accuracies": 0.75, "rewards/chosen": -0.009035563096404076, "rewards/margins": 0.003953131847083569, "rewards/rejected": -0.012988695874810219, "step": 8910 }, { "epoch": 0.9025599514317515, "grad_norm": 331158.6624761543, "learning_rate": 5.413762086800089e-08, "logits/chosen": -3.874582290649414, "logits/rejected": -3.779094696044922, "logps/chosen": -100.58879089355469, "logps/rejected": -115.41471862792969, "loss": 117381.5125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010232185944914818, "rewards/margins": 0.004281300585716963, "rewards/rejected": -0.014513485133647919, "step": 8920 }, { "epoch": 0.9035717899423252, "grad_norm": 487545.0661209149, "learning_rate": 5.357544411963121e-08, "logits/chosen": -4.350327968597412, "logits/rejected": -4.328451633453369, "logps/chosen": -104.00666809082031, "logps/rejected": -98.83277893066406, "loss": 119644.9375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.009608651511371136, "rewards/margins": 0.006377749145030975, "rewards/rejected": -0.015986401587724686, "step": 8930 }, { "epoch": 0.904583628452899, "grad_norm": 507471.55533966084, "learning_rate": 5.3013267371261524e-08, "logits/chosen": -3.8013808727264404, "logits/rejected": -3.6649177074432373, "logps/chosen": -82.89006042480469, "logps/rejected": -90.04682922363281, "loss": 106643.7125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0072867413982748985, "rewards/margins": 0.002984431339427829, "rewards/rejected": -0.010271173901855946, "step": 8940 }, { "epoch": 0.9055954669634726, "grad_norm": 255812.0124171347, "learning_rate": 5.245109062289183e-08, "logits/chosen": -4.1201887130737305, "logits/rejected": -4.01455020904541, "logps/chosen": -110.28932189941406, "logps/rejected": -126.10809326171875, "loss": 113395.475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006231869570910931, "rewards/margins": 0.0019786464981734753, "rewards/rejected": -0.008210516534745693, "step": 8950 }, { "epoch": 0.9066073054740463, "grad_norm": 297111.1750172441, "learning_rate": 5.188891387452215e-08, "logits/chosen": -3.851720094680786, "logits/rejected": -3.8642520904541016, "logps/chosen": -52.212562561035156, "logps/rejected": -80.71636199951172, "loss": 121268.625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0038822777569293976, "rewards/margins": 0.003970640245825052, "rewards/rejected": -0.007852918468415737, "step": 8960 }, { "epoch": 0.9076191439846201, "grad_norm": 375985.07480873895, "learning_rate": 5.132673712615246e-08, "logits/chosen": -3.7848823070526123, "logits/rejected": -3.696533679962158, "logps/chosen": -95.88836669921875, "logps/rejected": -141.2725067138672, "loss": 105541.5125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00900835357606411, "rewards/margins": 0.007512043230235577, "rewards/rejected": -0.016520395874977112, "step": 8970 }, { "epoch": 0.9086309824951938, "grad_norm": 289806.07961147, "learning_rate": 5.076456037778277e-08, "logits/chosen": -3.8709876537323, "logits/rejected": -3.9820396900177, "logps/chosen": -90.89571380615234, "logps/rejected": -114.7501220703125, "loss": 121524.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012953633442521095, "rewards/margins": 0.004366959910839796, "rewards/rejected": -0.01732059195637703, "step": 8980 }, { "epoch": 0.9096428210057674, "grad_norm": 277878.5750857645, "learning_rate": 5.0202383629413085e-08, "logits/chosen": -4.025021553039551, "logits/rejected": -4.169232368469238, "logps/chosen": -126.07752990722656, "logps/rejected": -110.68658447265625, "loss": 113244.075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009349776431918144, "rewards/margins": 0.004801408387720585, "rewards/rejected": -0.014151183888316154, "step": 8990 }, { "epoch": 0.9106546595163412, "grad_norm": 143930.16167051188, "learning_rate": 4.96402068810434e-08, "logits/chosen": -3.95123553276062, "logits/rejected": -3.9527294635772705, "logps/chosen": -61.64017868041992, "logps/rejected": -114.35748291015625, "loss": 106570.2875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.009745879098773003, "rewards/margins": 0.01045564841479063, "rewards/rejected": -0.020201528444886208, "step": 9000 }, { "epoch": 0.9116664980269149, "grad_norm": 179439.32823327545, "learning_rate": 4.907803013267371e-08, "logits/chosen": -4.085046291351318, "logits/rejected": -4.043295383453369, "logps/chosen": -88.24736022949219, "logps/rejected": -96.80689239501953, "loss": 103717.7312, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00684154499322176, "rewards/margins": 0.0025721783749759197, "rewards/rejected": -0.009413723833858967, "step": 9010 }, { "epoch": 0.9126783365374886, "grad_norm": 288508.98415273277, "learning_rate": 4.851585338430402e-08, "logits/chosen": -4.193526268005371, "logits/rejected": -4.179810523986816, "logps/chosen": -80.25194549560547, "logps/rejected": -96.00931549072266, "loss": 130037.3875, "rewards/accuracies": 0.5, "rewards/chosen": -0.006593102123588324, "rewards/margins": 0.004034596960991621, "rewards/rejected": -0.010627699084579945, "step": 9020 }, { "epoch": 0.9136901750480624, "grad_norm": 272098.2628987145, "learning_rate": 4.795367663593434e-08, "logits/chosen": -4.361043453216553, "logits/rejected": -4.260138988494873, "logps/chosen": -81.97883605957031, "logps/rejected": -99.30425262451172, "loss": 118734.2375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01212235726416111, "rewards/margins": 0.0024616383016109467, "rewards/rejected": -0.014583995565772057, "step": 9030 }, { "epoch": 0.914702013558636, "grad_norm": 487020.70689412643, "learning_rate": 4.7391499887564646e-08, "logits/chosen": -3.9659743309020996, "logits/rejected": -3.983170986175537, "logps/chosen": -126.64698791503906, "logps/rejected": -134.87380981445312, "loss": 97955.3687, "rewards/accuracies": 0.5, "rewards/chosen": -0.007934290915727615, "rewards/margins": 0.0007649677572771907, "rewards/rejected": -0.008699259720742702, "step": 9040 }, { "epoch": 0.9157138520692097, "grad_norm": 364653.21111071174, "learning_rate": 4.682932313919496e-08, "logits/chosen": -3.7138798236846924, "logits/rejected": -3.6401736736297607, "logps/chosen": -101.9919662475586, "logps/rejected": -85.16172790527344, "loss": 108962.275, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.013400467112660408, "rewards/margins": 3.44203544955235e-05, "rewards/rejected": -0.01343488972634077, "step": 9050 }, { "epoch": 0.9167256905797835, "grad_norm": 323477.66055268224, "learning_rate": 4.626714639082528e-08, "logits/chosen": -4.031325340270996, "logits/rejected": -3.9098992347717285, "logps/chosen": -45.24324035644531, "logps/rejected": -96.09133911132812, "loss": 103141.1062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0074615562334656715, "rewards/margins": 0.009060061536729336, "rewards/rejected": -0.016521615907549858, "step": 9060 }, { "epoch": 0.9177375290903572, "grad_norm": 383333.8836089793, "learning_rate": 4.570496964245558e-08, "logits/chosen": -4.008470058441162, "logits/rejected": -4.068471431732178, "logps/chosen": -59.4635124206543, "logps/rejected": -96.85350036621094, "loss": 124185.2875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006932751275599003, "rewards/margins": 0.007355979643762112, "rewards/rejected": -0.014288730919361115, "step": 9070 }, { "epoch": 0.9187493676009308, "grad_norm": 467549.5350380466, "learning_rate": 4.51427928940859e-08, "logits/chosen": -3.9852538108825684, "logits/rejected": -3.927855968475342, "logps/chosen": -90.18970489501953, "logps/rejected": -129.1749725341797, "loss": 123628.225, "rewards/accuracies": 0.75, "rewards/chosen": -0.00634436821565032, "rewards/margins": 0.009663324803113937, "rewards/rejected": -0.016007693484425545, "step": 9080 }, { "epoch": 0.9197612061115046, "grad_norm": 302220.46825235477, "learning_rate": 4.4580616145716213e-08, "logits/chosen": -4.135179042816162, "logits/rejected": -4.080653190612793, "logps/chosen": -54.551544189453125, "logps/rejected": -90.72606658935547, "loss": 104424.0188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0069213733077049255, "rewards/margins": 0.007630051579326391, "rewards/rejected": -0.014551426284015179, "step": 9090 }, { "epoch": 0.9207730446220783, "grad_norm": 279151.76419841027, "learning_rate": 4.401843939734652e-08, "logits/chosen": -4.138516426086426, "logits/rejected": -4.202983856201172, "logps/chosen": -79.46161651611328, "logps/rejected": -94.9834213256836, "loss": 110224.6375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006496983114629984, "rewards/margins": 0.008634328842163086, "rewards/rejected": -0.015131311491131783, "step": 9100 }, { "epoch": 0.921784883132652, "grad_norm": 1700376.5618202363, "learning_rate": 4.345626264897684e-08, "logits/chosen": -3.7538185119628906, "logits/rejected": -3.8071532249450684, "logps/chosen": -138.71755981445312, "logps/rejected": -138.94216918945312, "loss": 126817.5125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.013999557122588158, "rewards/margins": 0.0035819902550429106, "rewards/rejected": -0.017581548541784286, "step": 9110 }, { "epoch": 0.9227967216432258, "grad_norm": 442353.2290473817, "learning_rate": 4.289408590060715e-08, "logits/chosen": -3.9314491748809814, "logits/rejected": -3.892240524291992, "logps/chosen": -89.34886169433594, "logps/rejected": -105.2237548828125, "loss": 125207.825, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011378498747944832, "rewards/margins": 0.007206455804407597, "rewards/rejected": -0.018584955483675003, "step": 9120 }, { "epoch": 0.9238085601537994, "grad_norm": 312408.00185447367, "learning_rate": 4.233190915223746e-08, "logits/chosen": -3.5218234062194824, "logits/rejected": -3.5046706199645996, "logps/chosen": -85.2452392578125, "logps/rejected": -127.21073913574219, "loss": 97043.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008213781751692295, "rewards/margins": 0.005726475268602371, "rewards/rejected": -0.013940255157649517, "step": 9130 }, { "epoch": 0.9248203986643732, "grad_norm": 326703.962649545, "learning_rate": 4.1769732403867774e-08, "logits/chosen": -4.270815849304199, "logits/rejected": -4.2111992835998535, "logps/chosen": -104.72509765625, "logps/rejected": -108.7729721069336, "loss": 125766.175, "rewards/accuracies": 0.75, "rewards/chosen": -0.011119318194687366, "rewards/margins": 0.0030083004385232925, "rewards/rejected": -0.014127619564533234, "step": 9140 }, { "epoch": 0.9258322371749469, "grad_norm": 240787.3152400999, "learning_rate": 4.1207555655498087e-08, "logits/chosen": -3.871220111846924, "logits/rejected": -3.442108154296875, "logps/chosen": -77.38665008544922, "logps/rejected": -289.89007568359375, "loss": 112081.0625, "rewards/accuracies": 0.75, "rewards/chosen": -0.011555308476090431, "rewards/margins": 0.009345161728560925, "rewards/rejected": -0.02090046927332878, "step": 9150 }, { "epoch": 0.9268440756855206, "grad_norm": 293332.6701837181, "learning_rate": 4.06453789071284e-08, "logits/chosen": -3.8082969188690186, "logits/rejected": -3.7263598442077637, "logps/chosen": -88.42158508300781, "logps/rejected": -128.20602416992188, "loss": 118638.15, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.007909895852208138, "rewards/margins": 0.008832307532429695, "rewards/rejected": -0.016742205247282982, "step": 9160 }, { "epoch": 0.9278559141960943, "grad_norm": 484355.8621710144, "learning_rate": 4.008320215875871e-08, "logits/chosen": -4.36624813079834, "logits/rejected": -4.375934600830078, "logps/chosen": -107.97148132324219, "logps/rejected": -126.8670883178711, "loss": 103649.9312, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.008608532138168812, "rewards/margins": 0.004907368216663599, "rewards/rejected": -0.013515899889171124, "step": 9170 }, { "epoch": 0.928867752706668, "grad_norm": 371070.76199489913, "learning_rate": 3.952102541038903e-08, "logits/chosen": -3.9613826274871826, "logits/rejected": -3.95758056640625, "logps/chosen": -97.17466735839844, "logps/rejected": -104.33918762207031, "loss": 125929.3, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.012408889830112457, "rewards/margins": 0.0010125895496457815, "rewards/rejected": -0.013421478681266308, "step": 9180 }, { "epoch": 0.9298795912172417, "grad_norm": 142.8974277230586, "learning_rate": 3.8958848662019335e-08, "logits/chosen": -4.170875549316406, "logits/rejected": -4.099246025085449, "logps/chosen": -90.67923736572266, "logps/rejected": -97.02192687988281, "loss": 118244.375, "rewards/accuracies": 0.75, "rewards/chosen": -0.005168101750314236, "rewards/margins": 0.0055146971717476845, "rewards/rejected": -0.010682797990739346, "step": 9190 }, { "epoch": 0.9308914297278155, "grad_norm": 279307.1071328232, "learning_rate": 3.839667191364965e-08, "logits/chosen": -4.275193691253662, "logits/rejected": -4.315365791320801, "logps/chosen": -64.46754455566406, "logps/rejected": -86.3163070678711, "loss": 97854.9062, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0054799458011984825, "rewards/margins": 0.0025901668705046177, "rewards/rejected": -0.008070112206041813, "step": 9200 }, { "epoch": 0.9319032682383892, "grad_norm": 409732.2620779486, "learning_rate": 3.7834495165279966e-08, "logits/chosen": -3.8381829261779785, "logits/rejected": -3.797884702682495, "logps/chosen": -78.94202423095703, "logps/rejected": -114.01863098144531, "loss": 124774.4, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010127966292202473, "rewards/margins": 0.004748090170323849, "rewards/rejected": -0.014876055531203747, "step": 9210 }, { "epoch": 0.9329151067489628, "grad_norm": 608436.5631799783, "learning_rate": 3.727231841691027e-08, "logits/chosen": -3.8545258045196533, "logits/rejected": -4.024331569671631, "logps/chosen": -99.67369079589844, "logps/rejected": -113.30782318115234, "loss": 123327.2125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008264346979558468, "rewards/margins": 0.008100657723844051, "rewards/rejected": -0.01636500470340252, "step": 9220 }, { "epoch": 0.9339269452595366, "grad_norm": 289669.28914514004, "learning_rate": 3.671014166854059e-08, "logits/chosen": -4.110379695892334, "logits/rejected": -3.867628574371338, "logps/chosen": -55.662025451660156, "logps/rejected": -75.73758697509766, "loss": 114253.025, "rewards/accuracies": 0.5, "rewards/chosen": -0.00746946269646287, "rewards/margins": 0.0030820234678685665, "rewards/rejected": -0.010551486164331436, "step": 9230 }, { "epoch": 0.9349387837701103, "grad_norm": 227428.12511744004, "learning_rate": 3.61479649201709e-08, "logits/chosen": -4.062264442443848, "logits/rejected": -4.087777614593506, "logps/chosen": -98.99603271484375, "logps/rejected": -82.16569519042969, "loss": 111960.0125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011811173520982265, "rewards/margins": 0.0009425958851352334, "rewards/rejected": -0.012753767892718315, "step": 9240 }, { "epoch": 0.935950622280684, "grad_norm": 705240.2190102136, "learning_rate": 3.558578817180121e-08, "logits/chosen": -4.234978675842285, "logits/rejected": -4.1274333000183105, "logps/chosen": -66.84501647949219, "logps/rejected": -104.93025970458984, "loss": 106397.1875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01065686997026205, "rewards/margins": 0.005836677271872759, "rewards/rejected": -0.016493547707796097, "step": 9250 }, { "epoch": 0.9369624607912577, "grad_norm": 9348.762333527597, "learning_rate": 3.502361142343153e-08, "logits/chosen": -4.021543502807617, "logits/rejected": -3.8801143169403076, "logps/chosen": -73.72119140625, "logps/rejected": -92.24208068847656, "loss": 95061.375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004484384320676327, "rewards/margins": 0.004729871638119221, "rewards/rejected": -0.009214254096150398, "step": 9260 }, { "epoch": 0.9379742993018314, "grad_norm": 106483.79222585144, "learning_rate": 3.446143467506184e-08, "logits/chosen": -4.206275939941406, "logits/rejected": -4.251518726348877, "logps/chosen": -106.91944885253906, "logps/rejected": -114.06278228759766, "loss": 112398.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009029841050505638, "rewards/margins": 0.006023214664310217, "rewards/rejected": -0.015053058043122292, "step": 9270 }, { "epoch": 0.9389861378124051, "grad_norm": 379246.5475651217, "learning_rate": 3.3899257926692145e-08, "logits/chosen": -3.7227814197540283, "logits/rejected": -3.9053096771240234, "logps/chosen": -123.21219635009766, "logps/rejected": -153.96926879882812, "loss": 128410.8375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.016175318509340286, "rewards/margins": 0.0043297805823385715, "rewards/rejected": -0.02050510048866272, "step": 9280 }, { "epoch": 0.9399979763229789, "grad_norm": 165013.14920665298, "learning_rate": 3.3337081178322464e-08, "logits/chosen": -4.133365631103516, "logits/rejected": -4.097664833068848, "logps/chosen": -70.04796600341797, "logps/rejected": -91.22482299804688, "loss": 109941.3875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009596329182386398, "rewards/margins": 0.0032221893779933453, "rewards/rejected": -0.012818519957363605, "step": 9290 }, { "epoch": 0.9410098148335526, "grad_norm": 1277999.1715039306, "learning_rate": 3.2774904429952776e-08, "logits/chosen": -3.3323731422424316, "logits/rejected": -3.349330186843872, "logps/chosen": -258.72674560546875, "logps/rejected": -304.8492126464844, "loss": 116865.55, "rewards/accuracies": 0.75, "rewards/chosen": -0.01302745658904314, "rewards/margins": 0.004500804003328085, "rewards/rejected": -0.017528260126709938, "step": 9300 }, { "epoch": 0.9420216533441262, "grad_norm": 425765.4519162559, "learning_rate": 3.221272768158309e-08, "logits/chosen": -3.874479293823242, "logits/rejected": -3.8136093616485596, "logps/chosen": -72.1672134399414, "logps/rejected": -89.69673156738281, "loss": 114374.0625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.012704099528491497, "rewards/margins": 0.0006084696506150067, "rewards/rejected": -0.013312570750713348, "step": 9310 }, { "epoch": 0.9430334918547, "grad_norm": 307968.995791301, "learning_rate": 3.16505509332134e-08, "logits/chosen": -3.4175307750701904, "logits/rejected": -3.4503250122070312, "logps/chosen": -122.676513671875, "logps/rejected": -153.57688903808594, "loss": 122113.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.012504488229751587, "rewards/margins": 0.007534942124038935, "rewards/rejected": -0.020039431750774384, "step": 9320 }, { "epoch": 0.9440453303652737, "grad_norm": 566232.3651596041, "learning_rate": 3.108837418484371e-08, "logits/chosen": -4.223851680755615, "logits/rejected": -4.170505523681641, "logps/chosen": -96.09298706054688, "logps/rejected": -141.09658813476562, "loss": 123453.3625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.005210190545767546, "rewards/margins": 0.008998661302030087, "rewards/rejected": -0.014208853244781494, "step": 9330 }, { "epoch": 0.9450571688758475, "grad_norm": 394828.068803201, "learning_rate": 3.0526197436474024e-08, "logits/chosen": -3.8400473594665527, "logits/rejected": -3.9287331104278564, "logps/chosen": -100.09215545654297, "logps/rejected": -126.90911865234375, "loss": 128200.55, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.00806563813239336, "rewards/margins": 0.0077147395350039005, "rewards/rejected": -0.015780378133058548, "step": 9340 }, { "epoch": 0.9460690073864211, "grad_norm": 96106.05508032681, "learning_rate": 2.9964020688104337e-08, "logits/chosen": -4.058514595031738, "logits/rejected": -4.197056770324707, "logps/chosen": -64.88381958007812, "logps/rejected": -97.88761901855469, "loss": 82198.8188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010651446878910065, "rewards/margins": 0.007316960487514734, "rewards/rejected": -0.01796840690076351, "step": 9350 }, { "epoch": 0.9470808458969948, "grad_norm": 257642.84401004258, "learning_rate": 2.940184393973465e-08, "logits/chosen": -4.154553413391113, "logits/rejected": -4.163033485412598, "logps/chosen": -71.7684326171875, "logps/rejected": -98.4129867553711, "loss": 135757.0875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01138218492269516, "rewards/margins": 0.0023018806241452694, "rewards/rejected": -0.013684066943824291, "step": 9360 }, { "epoch": 0.9480926844075686, "grad_norm": 273162.9712276988, "learning_rate": 2.8839667191364964e-08, "logits/chosen": -4.007162094116211, "logits/rejected": -3.9523723125457764, "logps/chosen": -73.41001892089844, "logps/rejected": -125.1382064819336, "loss": 107487.0625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.012552660889923573, "rewards/margins": 0.006663169711828232, "rewards/rejected": -0.019215833395719528, "step": 9370 }, { "epoch": 0.9491045229181423, "grad_norm": 448877.4526218905, "learning_rate": 2.8277490442995276e-08, "logits/chosen": -4.082608222961426, "logits/rejected": -4.0633344650268555, "logps/chosen": -110.57574462890625, "logps/rejected": -115.8017807006836, "loss": 123545.0625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009473489597439766, "rewards/margins": 0.002319857245311141, "rewards/rejected": -0.0117933489382267, "step": 9380 }, { "epoch": 0.950116361428716, "grad_norm": 405623.6873595471, "learning_rate": 2.771531369462559e-08, "logits/chosen": -3.9203083515167236, "logits/rejected": -3.857520580291748, "logps/chosen": -78.60089874267578, "logps/rejected": -82.97459411621094, "loss": 119546.7875, "rewards/accuracies": 0.5, "rewards/chosen": -0.009000062011182308, "rewards/margins": 0.0006035291007719934, "rewards/rejected": -0.009603590704500675, "step": 9390 }, { "epoch": 0.9511281999392897, "grad_norm": 314198.38168918685, "learning_rate": 2.71531369462559e-08, "logits/chosen": -3.750643253326416, "logits/rejected": -3.8828296661376953, "logps/chosen": -101.66117095947266, "logps/rejected": -128.7971649169922, "loss": 123493.1625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008997110649943352, "rewards/margins": 0.002502544317394495, "rewards/rejected": -0.01149965450167656, "step": 9400 }, { "epoch": 0.9521400384498634, "grad_norm": 457918.210651223, "learning_rate": 2.6590960197886213e-08, "logits/chosen": -4.091675758361816, "logits/rejected": -4.0662407875061035, "logps/chosen": -149.65814208984375, "logps/rejected": -161.6226806640625, "loss": 125557.1125, "rewards/accuracies": 0.5, "rewards/chosen": -0.015780258923768997, "rewards/margins": 0.0036566038616001606, "rewards/rejected": -0.01943686231970787, "step": 9410 }, { "epoch": 0.9531518769604371, "grad_norm": 384635.6885021683, "learning_rate": 2.6028783449516525e-08, "logits/chosen": -3.879788875579834, "logits/rejected": -4.005295753479004, "logps/chosen": -63.342506408691406, "logps/rejected": -77.45150756835938, "loss": 97069.0063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009567140601575375, "rewards/margins": 0.0037602433003485203, "rewards/rejected": -0.013327384367585182, "step": 9420 }, { "epoch": 0.9541637154710109, "grad_norm": 543175.5134408739, "learning_rate": 2.546660670114684e-08, "logits/chosen": -3.5622570514678955, "logits/rejected": -3.4582481384277344, "logps/chosen": -267.56304931640625, "logps/rejected": -318.1617431640625, "loss": 121152.325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012517949566245079, "rewards/margins": 0.008223354816436768, "rewards/rejected": -0.020741304382681847, "step": 9430 }, { "epoch": 0.9551755539815845, "grad_norm": 404937.3233357443, "learning_rate": 2.4904429952777153e-08, "logits/chosen": -3.9322261810302734, "logits/rejected": -4.043418884277344, "logps/chosen": -112.6158447265625, "logps/rejected": -108.2094497680664, "loss": 121805.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006976640783250332, "rewards/margins": 0.004673903342336416, "rewards/rejected": -0.011650544591248035, "step": 9440 }, { "epoch": 0.9561873924921582, "grad_norm": 315390.1194719369, "learning_rate": 2.4342253204407462e-08, "logits/chosen": -4.379286766052246, "logits/rejected": -4.469390392303467, "logps/chosen": -88.45399475097656, "logps/rejected": -90.76317596435547, "loss": 114895.5875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007997235283255577, "rewards/margins": 0.005050938576459885, "rewards/rejected": -0.013048173859715462, "step": 9450 }, { "epoch": 0.957199231002732, "grad_norm": 466484.54469281825, "learning_rate": 2.3780076456037777e-08, "logits/chosen": -4.124789237976074, "logits/rejected": -4.04677677154541, "logps/chosen": -80.44025421142578, "logps/rejected": -137.12808227539062, "loss": 113748.4875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.009440635330975056, "rewards/margins": 0.010512905195355415, "rewards/rejected": -0.019953539595007896, "step": 9460 }, { "epoch": 0.9582110695133057, "grad_norm": 396617.38495693053, "learning_rate": 2.321789970766809e-08, "logits/chosen": -4.371657371520996, "logits/rejected": -4.44508695602417, "logps/chosen": -73.26972961425781, "logps/rejected": -107.08268737792969, "loss": 97962.7875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0101273562759161, "rewards/margins": 0.007543954998254776, "rewards/rejected": -0.017671313136816025, "step": 9470 }, { "epoch": 0.9592229080238794, "grad_norm": 255000.0214773151, "learning_rate": 2.2655722959298405e-08, "logits/chosen": -4.285630226135254, "logits/rejected": -4.3579254150390625, "logps/chosen": -91.79985046386719, "logps/rejected": -107.73516845703125, "loss": 110867.4, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009371770545840263, "rewards/margins": 0.0026771242264658213, "rewards/rejected": -0.012048894539475441, "step": 9480 }, { "epoch": 0.9602347465344531, "grad_norm": 193378.7001353474, "learning_rate": 2.2093546210928717e-08, "logits/chosen": -3.869476318359375, "logits/rejected": -3.879193067550659, "logps/chosen": -76.95286560058594, "logps/rejected": -88.60786437988281, "loss": 116444.175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0024948727805167437, "rewards/margins": 0.0077176811173558235, "rewards/rejected": -0.010212553665041924, "step": 9490 }, { "epoch": 0.9612465850450268, "grad_norm": 434553.1623235897, "learning_rate": 2.1531369462559026e-08, "logits/chosen": -3.8946824073791504, "logits/rejected": -3.9759726524353027, "logps/chosen": -88.37757873535156, "logps/rejected": -123.55009460449219, "loss": 124044.1875, "rewards/accuracies": 0.75, "rewards/chosen": -0.007869130000472069, "rewards/margins": 0.007490699179470539, "rewards/rejected": -0.015359828248620033, "step": 9500 }, { "epoch": 0.9622584235556005, "grad_norm": 431117.31018362276, "learning_rate": 2.096919271418934e-08, "logits/chosen": -3.727459669113159, "logits/rejected": -3.6478238105773926, "logps/chosen": -105.90567779541016, "logps/rejected": -144.3240966796875, "loss": 119481.875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01097404770553112, "rewards/margins": 0.007662780582904816, "rewards/rejected": -0.018636828288435936, "step": 9510 }, { "epoch": 0.9632702620661743, "grad_norm": 425646.21713502426, "learning_rate": 2.0407015965819654e-08, "logits/chosen": -3.866877317428589, "logits/rejected": -3.8664004802703857, "logps/chosen": -67.36805725097656, "logps/rejected": -70.0781021118164, "loss": 104446.6438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0077029005624353886, "rewards/margins": 0.0037230930756777525, "rewards/rejected": -0.011425992473959923, "step": 9520 }, { "epoch": 0.9642821005767479, "grad_norm": 441635.7473491108, "learning_rate": 1.9844839217449966e-08, "logits/chosen": -3.858445644378662, "logits/rejected": -3.8677124977111816, "logps/chosen": -110.73213958740234, "logps/rejected": -151.53518676757812, "loss": 136400.7, "rewards/accuracies": 0.75, "rewards/chosen": -0.010622120462357998, "rewards/margins": 0.010559212416410446, "rewards/rejected": -0.02118133381009102, "step": 9530 }, { "epoch": 0.9652939390873216, "grad_norm": 196556.32067255597, "learning_rate": 1.9282662469080278e-08, "logits/chosen": -4.104430198669434, "logits/rejected": -4.161004543304443, "logps/chosen": -111.50225830078125, "logps/rejected": -92.09217071533203, "loss": 114322.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010185925289988518, "rewards/margins": 0.0021272655576467514, "rewards/rejected": -0.01231319084763527, "step": 9540 }, { "epoch": 0.9663057775978954, "grad_norm": 362424.568644476, "learning_rate": 1.872048572071059e-08, "logits/chosen": -4.084863185882568, "logits/rejected": -4.112037658691406, "logps/chosen": -95.34103393554688, "logps/rejected": -125.73600006103516, "loss": 121074.075, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011587217450141907, "rewards/margins": 0.009355949237942696, "rewards/rejected": -0.02094316855072975, "step": 9550 }, { "epoch": 0.9673176161084691, "grad_norm": 288376.1920759395, "learning_rate": 1.8158308972340902e-08, "logits/chosen": -3.753190517425537, "logits/rejected": -3.7959036827087402, "logps/chosen": -86.61421966552734, "logps/rejected": -93.5081787109375, "loss": 124791.025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008642861619591713, "rewards/margins": 0.0044698575511574745, "rewards/rejected": -0.013112718239426613, "step": 9560 }, { "epoch": 0.9683294546190429, "grad_norm": 269448.89591888123, "learning_rate": 1.7596132223971218e-08, "logits/chosen": -4.356909275054932, "logits/rejected": -4.248805046081543, "logps/chosen": -66.31163787841797, "logps/rejected": -70.6850357055664, "loss": 125229.6875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011195449158549309, "rewards/margins": 0.0014007985591888428, "rewards/rejected": -0.012596246786415577, "step": 9570 }, { "epoch": 0.9693412931296165, "grad_norm": 481180.99331923004, "learning_rate": 1.703395547560153e-08, "logits/chosen": -4.192138195037842, "logits/rejected": -4.271235942840576, "logps/chosen": -97.1610336303711, "logps/rejected": -124.29378509521484, "loss": 110324.5375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.010377154685556889, "rewards/margins": 0.006192672066390514, "rewards/rejected": -0.016569826751947403, "step": 9580 }, { "epoch": 0.9703531316401902, "grad_norm": 517997.4581631657, "learning_rate": 1.647177872723184e-08, "logits/chosen": -4.0767035484313965, "logits/rejected": -3.9697928428649902, "logps/chosen": -69.50992584228516, "logps/rejected": -110.08528900146484, "loss": 120489.2, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006844178773462772, "rewards/margins": 0.006802765186876059, "rewards/rejected": -0.013646943494677544, "step": 9590 }, { "epoch": 0.971364970150764, "grad_norm": 183198.57318344706, "learning_rate": 1.5909601978862154e-08, "logits/chosen": -3.537818193435669, "logits/rejected": -3.4721953868865967, "logps/chosen": -70.39857482910156, "logps/rejected": -135.46920776367188, "loss": 97716.0625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005956948734819889, "rewards/margins": 0.007264907006174326, "rewards/rejected": -0.013221855275332928, "step": 9600 }, { "epoch": 0.9723768086613377, "grad_norm": 405704.3850069246, "learning_rate": 1.5347425230492466e-08, "logits/chosen": -4.482111930847168, "logits/rejected": -4.442517280578613, "logps/chosen": -120.3714370727539, "logps/rejected": -118.3258285522461, "loss": 124778.1625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.010016797110438347, "rewards/margins": 0.004308981820940971, "rewards/rejected": -0.014325778000056744, "step": 9610 }, { "epoch": 0.9733886471719113, "grad_norm": 437874.95627708576, "learning_rate": 1.4785248482122779e-08, "logits/chosen": -4.392117977142334, "logits/rejected": -4.130487442016602, "logps/chosen": -52.64751434326172, "logps/rejected": -60.755271911621094, "loss": 106337.1, "rewards/accuracies": 0.5, "rewards/chosen": -0.0093241510912776, "rewards/margins": 0.0027712176088243723, "rewards/rejected": -0.012095366604626179, "step": 9620 }, { "epoch": 0.974400485682485, "grad_norm": 363747.04076973745, "learning_rate": 1.422307173375309e-08, "logits/chosen": -4.379356384277344, "logits/rejected": -4.3832621574401855, "logps/chosen": -72.3182144165039, "logps/rejected": -56.909637451171875, "loss": 99953.4937, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006303323898464441, "rewards/margins": 0.0015097304712980986, "rewards/rejected": -0.007813054136931896, "step": 9630 }, { "epoch": 0.9754123241930588, "grad_norm": 247670.66311319728, "learning_rate": 1.3660894985383405e-08, "logits/chosen": -4.236478805541992, "logits/rejected": -4.255645275115967, "logps/chosen": -115.89263916015625, "logps/rejected": -133.51515197753906, "loss": 138558.2, "rewards/accuracies": 0.75, "rewards/chosen": -0.009032749570906162, "rewards/margins": 0.0066255005076527596, "rewards/rejected": -0.015658248215913773, "step": 9640 }, { "epoch": 0.9764241627036325, "grad_norm": 213863.7427699631, "learning_rate": 1.3098718237013715e-08, "logits/chosen": -4.155786037445068, "logits/rejected": -3.917539119720459, "logps/chosen": -74.1441650390625, "logps/rejected": -105.3725814819336, "loss": 129382.725, "rewards/accuracies": 0.75, "rewards/chosen": -0.010057590901851654, "rewards/margins": 0.005849766079336405, "rewards/rejected": -0.01590735837817192, "step": 9650 }, { "epoch": 0.9774360012142063, "grad_norm": 436382.6633251706, "learning_rate": 1.2536541488644029e-08, "logits/chosen": -4.081709384918213, "logits/rejected": -3.932584285736084, "logps/chosen": -98.51025390625, "logps/rejected": -143.04702758789062, "loss": 128407.7375, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.006496958434581757, "rewards/margins": 0.009936420246958733, "rewards/rejected": -0.01643337681889534, "step": 9660 }, { "epoch": 0.9784478397247799, "grad_norm": 303654.6159033657, "learning_rate": 1.1974364740274343e-08, "logits/chosen": -4.260552406311035, "logits/rejected": -4.216398239135742, "logps/chosen": -85.33565521240234, "logps/rejected": -102.05516052246094, "loss": 113716.05, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007734550628811121, "rewards/margins": 0.0012798577081412077, "rewards/rejected": -0.00901440903544426, "step": 9670 }, { "epoch": 0.9794596782353536, "grad_norm": 543845.8292650963, "learning_rate": 1.1412187991904655e-08, "logits/chosen": -4.007050037384033, "logits/rejected": -3.8003392219543457, "logps/chosen": -86.61026763916016, "logps/rejected": -161.85475158691406, "loss": 113354.3625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0057397643104195595, "rewards/margins": 0.015450581908226013, "rewards/rejected": -0.021190345287322998, "step": 9680 }, { "epoch": 0.9804715167459274, "grad_norm": 330821.7777607501, "learning_rate": 1.0850011243534967e-08, "logits/chosen": -4.073096752166748, "logits/rejected": -4.225350856781006, "logps/chosen": -82.14356994628906, "logps/rejected": -104.26485443115234, "loss": 126031.4125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009784078225493431, "rewards/margins": 0.004483391996473074, "rewards/rejected": -0.014267469756305218, "step": 9690 }, { "epoch": 0.9814833552565011, "grad_norm": 463823.7073425865, "learning_rate": 1.028783449516528e-08, "logits/chosen": -3.7265307903289795, "logits/rejected": -3.758744716644287, "logps/chosen": -75.30622100830078, "logps/rejected": -103.0025634765625, "loss": 93875.3687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0067327022552490234, "rewards/margins": 0.005484845023602247, "rewards/rejected": -0.012217545881867409, "step": 9700 }, { "epoch": 0.9824951937670747, "grad_norm": 506411.0276333108, "learning_rate": 9.725657746795593e-09, "logits/chosen": -3.8191161155700684, "logits/rejected": -3.641930103302002, "logps/chosen": -129.01834106445312, "logps/rejected": -130.443359375, "loss": 135528.7375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013171913102269173, "rewards/margins": 0.0019681008998304605, "rewards/rejected": -0.01514001376926899, "step": 9710 }, { "epoch": 0.9835070322776485, "grad_norm": 248076.8744352946, "learning_rate": 9.163480998425904e-09, "logits/chosen": -4.157187461853027, "logits/rejected": -4.053224563598633, "logps/chosen": -73.15760803222656, "logps/rejected": -75.61663055419922, "loss": 94031.7875, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.009174773469567299, "rewards/margins": -4.994387199985795e-05, "rewards/rejected": -0.009124829433858395, "step": 9720 }, { "epoch": 0.9845188707882222, "grad_norm": 380266.27166931523, "learning_rate": 8.601304250056217e-09, "logits/chosen": -3.5948891639709473, "logits/rejected": -3.566527843475342, "logps/chosen": -74.29622650146484, "logps/rejected": -84.50221252441406, "loss": 129675.975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01240230817347765, "rewards/margins": 0.002236140426248312, "rewards/rejected": -0.014638449065387249, "step": 9730 }, { "epoch": 0.9855307092987959, "grad_norm": 327151.63561901695, "learning_rate": 8.039127501686531e-09, "logits/chosen": -4.1476335525512695, "logits/rejected": -4.128661155700684, "logps/chosen": -68.76351165771484, "logps/rejected": -130.87893676757812, "loss": 124443.3875, "rewards/accuracies": 0.75, "rewards/chosen": -0.00524232629686594, "rewards/margins": 0.010867701843380928, "rewards/rejected": -0.016110029071569443, "step": 9740 }, { "epoch": 0.9865425478093697, "grad_norm": 498703.19390379195, "learning_rate": 7.476950753316842e-09, "logits/chosen": -3.9754090309143066, "logits/rejected": -4.0629754066467285, "logps/chosen": -114.82139587402344, "logps/rejected": -126.71794128417969, "loss": 123336.7375, "rewards/accuracies": 0.75, "rewards/chosen": -0.009708220139145851, "rewards/margins": 0.006540189031511545, "rewards/rejected": -0.01624840870499611, "step": 9750 }, { "epoch": 0.9875543863199433, "grad_norm": 184553.42869978855, "learning_rate": 6.914774004947156e-09, "logits/chosen": -4.042461395263672, "logits/rejected": -3.978917360305786, "logps/chosen": -64.28446960449219, "logps/rejected": -102.9139633178711, "loss": 114127.875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009910019114613533, "rewards/margins": 0.009462013840675354, "rewards/rejected": -0.019372034817934036, "step": 9760 }, { "epoch": 0.988566224830517, "grad_norm": 346403.8153285661, "learning_rate": 6.352597256577468e-09, "logits/chosen": -3.710261821746826, "logits/rejected": -3.669654369354248, "logps/chosen": -81.95560455322266, "logps/rejected": -119.3456802368164, "loss": 111189.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013499243184924126, "rewards/margins": 0.005898322444409132, "rewards/rejected": -0.019397566094994545, "step": 9770 }, { "epoch": 0.9895780633410908, "grad_norm": 20568.296696841695, "learning_rate": 5.790420508207781e-09, "logits/chosen": -4.066479682922363, "logits/rejected": -4.144700050354004, "logps/chosen": -75.88557434082031, "logps/rejected": -105.34871673583984, "loss": 104375.575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009019764140248299, "rewards/margins": 0.005218202713876963, "rewards/rejected": -0.014237967319786549, "step": 9780 }, { "epoch": 0.9905899018516645, "grad_norm": 244329.23656007156, "learning_rate": 5.228243759838093e-09, "logits/chosen": -3.7779674530029297, "logits/rejected": -3.655822277069092, "logps/chosen": -91.8704833984375, "logps/rejected": -110.03865814208984, "loss": 123813.425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01254862267524004, "rewards/margins": 0.005024886690080166, "rewards/rejected": -0.017573509365320206, "step": 9790 }, { "epoch": 0.9916017403622381, "grad_norm": 333140.7377243426, "learning_rate": 4.666067011468405e-09, "logits/chosen": -3.4529309272766113, "logits/rejected": -3.579690456390381, "logps/chosen": -74.97050476074219, "logps/rejected": -84.1295394897461, "loss": 92906.275, "rewards/accuracies": 0.5, "rewards/chosen": -0.005321663338690996, "rewards/margins": 0.0036211558617651463, "rewards/rejected": -0.008942820131778717, "step": 9800 }, { "epoch": 0.9926135788728119, "grad_norm": 376026.3799852578, "learning_rate": 4.103890263098718e-09, "logits/chosen": -4.028354167938232, "logits/rejected": -3.823741912841797, "logps/chosen": -95.30006408691406, "logps/rejected": -90.4883041381836, "loss": 128506.825, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.020144687965512276, "rewards/margins": -0.004319856408983469, "rewards/rejected": -0.015824832022190094, "step": 9810 }, { "epoch": 0.9936254173833856, "grad_norm": 2344287.647789935, "learning_rate": 3.5417135147290307e-09, "logits/chosen": -3.774406909942627, "logits/rejected": -3.765840530395508, "logps/chosen": -279.0651550292969, "logps/rejected": -348.43658447265625, "loss": 115481.0375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010420155711472034, "rewards/margins": 0.008069278672337532, "rewards/rejected": -0.01848943531513214, "step": 9820 }, { "epoch": 0.9946372558939593, "grad_norm": 202273.11585080044, "learning_rate": 2.979536766359343e-09, "logits/chosen": -4.275419235229492, "logits/rejected": -4.34443473815918, "logps/chosen": -53.62701416015625, "logps/rejected": -76.06523895263672, "loss": 96819.3687, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007256893906742334, "rewards/margins": 0.003511035116389394, "rewards/rejected": -0.010767927393317223, "step": 9830 }, { "epoch": 0.9956490944045331, "grad_norm": 315533.5717159262, "learning_rate": 2.417360017989656e-09, "logits/chosen": -4.090587139129639, "logits/rejected": -4.033541679382324, "logps/chosen": -74.35514068603516, "logps/rejected": -107.67127990722656, "loss": 113034.5625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008774438872933388, "rewards/margins": 0.003095925087109208, "rewards/rejected": -0.011870364658534527, "step": 9840 }, { "epoch": 0.9966609329151067, "grad_norm": 286832.4940455012, "learning_rate": 1.8551832696199685e-09, "logits/chosen": -3.5826804637908936, "logits/rejected": -3.479930877685547, "logps/chosen": -83.3212661743164, "logps/rejected": -121.39381408691406, "loss": 123459.925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012059072963893414, "rewards/margins": 0.005127669312059879, "rewards/rejected": -0.017186742275953293, "step": 9850 }, { "epoch": 0.9976727714256804, "grad_norm": 273224.32414185675, "learning_rate": 1.293006521250281e-09, "logits/chosen": -4.037057876586914, "logits/rejected": -4.164947509765625, "logps/chosen": -96.47148895263672, "logps/rejected": -97.11286163330078, "loss": 105889.5875, "rewards/accuracies": 0.5, "rewards/chosen": -0.009954235516488552, "rewards/margins": 0.007880708202719688, "rewards/rejected": -0.017834942787885666, "step": 9860 }, { "epoch": 0.9986846099362542, "grad_norm": 438837.5963546457, "learning_rate": 7.308297728805936e-10, "logits/chosen": -4.053694725036621, "logits/rejected": -4.209842681884766, "logps/chosen": -110.622802734375, "logps/rejected": -118.92723083496094, "loss": 118724.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011669805273413658, "rewards/margins": 0.0029773968271911144, "rewards/rejected": -0.01464720070362091, "step": 9870 }, { "epoch": 0.9996964484468279, "grad_norm": 468665.9701137798, "learning_rate": 1.6865302451090621e-10, "logits/chosen": -4.1098151206970215, "logits/rejected": -3.934731960296631, "logps/chosen": -131.5048370361328, "logps/rejected": -128.96128845214844, "loss": 128688.0625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009731548838317394, "rewards/margins": 0.005325093399733305, "rewards/rejected": -0.015056641772389412, "step": 9880 }, { "epoch": 1.0, "step": 9883, "total_flos": 0.0, "train_loss": 116322.4441848534, "train_runtime": 4176.8003, "train_samples_per_second": 4.732, "train_steps_per_second": 2.366 } ], "logging_steps": 10, "max_steps": 9883, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }