{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994340690435767, "eval_steps": 100, "global_step": 883, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.617977528089887e-09, "logits/chosen": -2.7943434715270996, "logits/rejected": -2.817823886871338, "logps/chosen": -334.107666015625, "logps/rejected": -197.05621337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.617977528089887e-08, "logits/chosen": -2.833451271057129, "logits/rejected": -2.7827768325805664, "logps/chosen": -323.80584716796875, "logps/rejected": -189.39964294433594, "loss": 0.6931, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0005755923339165747, "rewards/margins": 0.0003566421801224351, "rewards/rejected": 0.00021895011013839394, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.1235955056179774e-07, "logits/chosen": -2.778655767440796, "logits/rejected": -2.7627151012420654, "logps/chosen": -323.3365783691406, "logps/rejected": -168.40744018554688, "loss": 0.6917, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0010369193041697145, "rewards/margins": 0.0018870027270168066, "rewards/rejected": -0.0008500836556777358, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.6853932584269663e-07, "logits/chosen": -2.7871737480163574, "logits/rejected": -2.7326064109802246, "logps/chosen": -305.997314453125, "logps/rejected": -180.06800842285156, "loss": 0.683, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009164368733763695, "rewards/margins": 0.015919247642159462, "rewards/rejected": -0.006754877977073193, "step": 30 }, { "epoch": 0.05, "learning_rate": 2.2471910112359549e-07, "logits/chosen": -2.7199320793151855, "logits/rejected": -2.711822032928467, "logps/chosen": -314.8984680175781, "logps/rejected": -178.45077514648438, "loss": 0.6653, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.034292496740818024, "rewards/margins": 0.06667112559080124, "rewards/rejected": -0.03237862139940262, "step": 40 }, { "epoch": 0.06, "learning_rate": 2.8089887640449437e-07, "logits/chosen": -2.6660404205322266, "logits/rejected": -2.6608872413635254, "logps/chosen": -340.89056396484375, "logps/rejected": -192.22543334960938, "loss": 0.6387, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0421828031539917, "rewards/margins": 0.14488723874092102, "rewards/rejected": -0.10270445048809052, "step": 50 }, { "epoch": 0.07, "learning_rate": 3.3707865168539325e-07, "logits/chosen": -2.6621761322021484, "logits/rejected": -2.6332790851593018, "logps/chosen": -290.0724182128906, "logps/rejected": -199.76377868652344, "loss": 0.6192, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03800051286816597, "rewards/margins": 0.1342642456293106, "rewards/rejected": -0.17226476967334747, "step": 60 }, { "epoch": 0.08, "learning_rate": 3.9325842696629214e-07, "logits/chosen": -2.5926709175109863, "logits/rejected": -2.5758044719696045, "logps/chosen": -318.26446533203125, "logps/rejected": -217.8231964111328, "loss": 0.5872, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.061497531831264496, "rewards/margins": 0.23234911262989044, "rewards/rejected": -0.29384663701057434, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.4943820224719097e-07, "logits/chosen": -2.561908483505249, "logits/rejected": -2.5379791259765625, "logps/chosen": -396.86993408203125, "logps/rejected": -253.50143432617188, "loss": 0.5579, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.01939094439148903, "rewards/margins": 0.49067315459251404, "rewards/rejected": -0.5100641250610352, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.999980431020109e-07, "logits/chosen": -2.5810797214508057, "logits/rejected": -2.5567336082458496, "logps/chosen": -380.4464416503906, "logps/rejected": -262.82904052734375, "loss": 0.5455, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.24769897758960724, "rewards/margins": 0.5812320709228516, "rewards/rejected": -0.8289310336112976, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.997632524101301e-07, "logits/chosen": -2.6055984497070312, "logits/rejected": -2.5864219665527344, "logps/chosen": -367.29071044921875, "logps/rejected": -280.4869079589844, "loss": 0.5392, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2197243869304657, "rewards/margins": 0.5160216689109802, "rewards/rejected": -0.7357459664344788, "step": 100 }, { "epoch": 0.11, "eval_logits/chosen": -2.590785264968872, "eval_logits/rejected": -2.5756187438964844, "eval_logps/chosen": -322.57501220703125, "eval_logps/rejected": -351.5352478027344, "eval_loss": 0.6285832524299622, "eval_rewards/accuracies": 0.65234375, "eval_rewards/chosen": -0.6553537845611572, "eval_rewards/margins": 0.2864663004875183, "eval_rewards/rejected": -0.9418200850486755, "eval_runtime": 53.1932, "eval_samples_per_second": 37.599, "eval_steps_per_second": 0.602, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.991375032514749e-07, "logits/chosen": -2.5533313751220703, "logits/rejected": -2.5264110565185547, "logps/chosen": -363.4510498046875, "logps/rejected": -284.7992248535156, "loss": 0.5232, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3205306828022003, "rewards/margins": 0.6818863749504089, "rewards/rejected": -1.002416968345642, "step": 110 }, { "epoch": 0.14, "learning_rate": 4.98121775121344e-07, "logits/chosen": -2.6315197944641113, "logits/rejected": -2.5978212356567383, "logps/chosen": -410.644775390625, "logps/rejected": -323.01190185546875, "loss": 0.4994, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.27390944957733154, "rewards/margins": 0.8327864408493042, "rewards/rejected": -1.1066958904266357, "step": 120 }, { "epoch": 0.15, "learning_rate": 4.96717657955441e-07, "logits/chosen": -2.59904408454895, "logits/rejected": -2.5410983562469482, "logps/chosen": -416.3720703125, "logps/rejected": -325.9648132324219, "loss": 0.5013, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.40370965003967285, "rewards/margins": 0.9006286859512329, "rewards/rejected": -1.3043382167816162, "step": 130 }, { "epoch": 0.16, "learning_rate": 4.949273496411216e-07, "logits/chosen": -2.545508861541748, "logits/rejected": -2.5205612182617188, "logps/chosen": -379.17767333984375, "logps/rejected": -337.29962158203125, "loss": 0.4954, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.451716810464859, "rewards/margins": 0.8486088514328003, "rewards/rejected": -1.300325632095337, "step": 140 }, { "epoch": 0.17, "learning_rate": 4.927536525770046e-07, "logits/chosen": -2.5130438804626465, "logits/rejected": -2.487233877182007, "logps/chosen": -423.2710876464844, "logps/rejected": -352.5829772949219, "loss": 0.4976, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5199152231216431, "rewards/margins": 1.0131314992904663, "rewards/rejected": -1.5330466032028198, "step": 150 }, { "epoch": 0.18, "learning_rate": 4.901999692863326e-07, "logits/chosen": -2.520357847213745, "logits/rejected": -2.4684462547302246, "logps/chosen": -498.07098388671875, "logps/rejected": -388.2645263671875, "loss": 0.463, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5807570219039917, "rewards/margins": 1.1767116785049438, "rewards/rejected": -1.757468581199646, "step": 160 }, { "epoch": 0.19, "learning_rate": 4.872702970909464e-07, "logits/chosen": -2.345059633255005, "logits/rejected": -2.281158924102783, "logps/chosen": -455.2555236816406, "logps/rejected": -373.2399597167969, "loss": 0.4471, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8019993901252747, "rewards/margins": 1.065147042274475, "rewards/rejected": -1.8671462535858154, "step": 170 }, { "epoch": 0.2, "learning_rate": 4.839692218542131e-07, "logits/chosen": -2.167600631713867, "logits/rejected": -2.1524620056152344, "logps/chosen": -445.18963623046875, "logps/rejected": -420.07354736328125, "loss": 0.4607, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5789515972137451, "rewards/margins": 0.9403783082962036, "rewards/rejected": -2.5193300247192383, "step": 180 }, { "epoch": 0.22, "learning_rate": 4.803019108026997e-07, "logits/chosen": -2.0659067630767822, "logits/rejected": -2.0179924964904785, "logps/chosen": -446.5098571777344, "logps/rejected": -408.96685791015625, "loss": 0.4605, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1547313928604126, "rewards/margins": 1.099097490310669, "rewards/rejected": -2.253828525543213, "step": 190 }, { "epoch": 0.23, "learning_rate": 4.7627410443782887e-07, "logits/chosen": -1.9613704681396484, "logits/rejected": -1.9336235523223877, "logps/chosen": -434.38311767578125, "logps/rejected": -421.72308349609375, "loss": 0.4524, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2233312129974365, "rewards/margins": 1.0944594144821167, "rewards/rejected": -2.3177905082702637, "step": 200 }, { "epoch": 0.23, "eval_logits/chosen": -1.9878398180007935, "eval_logits/rejected": -1.9677612781524658, "eval_logps/chosen": -405.3453674316406, "eval_logps/rejected": -474.3326721191406, "eval_loss": 0.5474696755409241, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -1.4830571413040161, "eval_rewards/margins": 0.6867368221282959, "eval_rewards/rejected": -2.1697940826416016, "eval_runtime": 53.0465, "eval_samples_per_second": 37.703, "eval_steps_per_second": 0.603, "step": 200 }, { "epoch": 0.24, "learning_rate": 4.7189210755018034e-07, "logits/chosen": -1.916168212890625, "logits/rejected": -1.849001169204712, "logps/chosen": -497.56134033203125, "logps/rejected": -451.7841796875, "loss": 0.4423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2633593082427979, "rewards/margins": 1.236897587776184, "rewards/rejected": -2.5002567768096924, "step": 210 }, { "epoch": 0.25, "learning_rate": 4.671627793504988e-07, "logits/chosen": -1.965778112411499, "logits/rejected": -1.8829681873321533, "logps/chosen": -516.19921875, "logps/rejected": -489.0526428222656, "loss": 0.4306, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.368606686592102, "rewards/margins": 1.4575475454330444, "rewards/rejected": -2.8261542320251465, "step": 220 }, { "epoch": 0.26, "learning_rate": 4.6209352273286095e-07, "logits/chosen": -1.8527837991714478, "logits/rejected": -1.7781047821044922, "logps/chosen": -492.2167053222656, "logps/rejected": -515.4146728515625, "loss": 0.4315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5558216571807861, "rewards/margins": 1.2494769096374512, "rewards/rejected": -2.8052985668182373, "step": 230 }, { "epoch": 0.27, "learning_rate": 4.56692272686805e-07, "logits/chosen": -1.8593418598175049, "logits/rejected": -1.7763780355453491, "logps/chosen": -473.20245361328125, "logps/rejected": -463.26849365234375, "loss": 0.4462, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5299947261810303, "rewards/margins": 1.291903018951416, "rewards/rejected": -2.8218979835510254, "step": 240 }, { "epoch": 0.28, "learning_rate": 4.5096748387656326e-07, "logits/chosen": -1.6604913473129272, "logits/rejected": -1.5300872325897217, "logps/chosen": -527.0318603515625, "logps/rejected": -502.64129638671875, "loss": 0.4618, "rewards/accuracies": 0.75, "rewards/chosen": -2.062129497528076, "rewards/margins": 1.1275193691253662, "rewards/rejected": -3.1896486282348633, "step": 250 }, { "epoch": 0.29, "learning_rate": 4.4492811740683877e-07, "logits/chosen": -1.5592234134674072, "logits/rejected": -1.3744081258773804, "logps/chosen": -491.737548828125, "logps/rejected": -486.6441345214844, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.907570481300354, "rewards/margins": 1.1632691621780396, "rewards/rejected": -3.0708391666412354, "step": 260 }, { "epoch": 0.31, "learning_rate": 4.3858362679584354e-07, "logits/chosen": -1.5746996402740479, "logits/rejected": -1.2380870580673218, "logps/chosen": -457.90753173828125, "logps/rejected": -446.56683349609375, "loss": 0.4103, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1260545253753662, "rewards/margins": 1.6088136434555054, "rewards/rejected": -2.734868288040161, "step": 270 }, { "epoch": 0.32, "learning_rate": 4.3194394317755245e-07, "logits/chosen": -1.3573920726776123, "logits/rejected": -1.0481122732162476, "logps/chosen": -512.2153930664062, "logps/rejected": -469.4527893066406, "loss": 0.4381, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6660184860229492, "rewards/margins": 1.3182036876678467, "rewards/rejected": -2.984222173690796, "step": 280 }, { "epoch": 0.33, "learning_rate": 4.2501945975633914e-07, "logits/chosen": -1.5231261253356934, "logits/rejected": -1.2471725940704346, "logps/chosen": -508.29248046875, "logps/rejected": -447.50689697265625, "loss": 0.4364, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4383156299591064, "rewards/margins": 1.2735927104949951, "rewards/rejected": -2.7119078636169434, "step": 290 }, { "epoch": 0.34, "learning_rate": 4.1782101553832405e-07, "logits/chosen": -1.4166069030761719, "logits/rejected": -1.1425375938415527, "logps/chosen": -467.41717529296875, "logps/rejected": -439.3959045410156, "loss": 0.3976, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5179073810577393, "rewards/margins": 1.1478455066680908, "rewards/rejected": -2.665753126144409, "step": 300 }, { "epoch": 0.34, "eval_logits/chosen": -1.18406081199646, "eval_logits/rejected": -0.9782991409301758, "eval_logps/chosen": -442.4473571777344, "eval_logps/rejected": -545.2501220703125, "eval_loss": 0.5194380879402161, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -1.8540773391723633, "eval_rewards/margins": 1.0248912572860718, "eval_rewards/rejected": -2.8789682388305664, "eval_runtime": 53.0005, "eval_samples_per_second": 37.736, "eval_steps_per_second": 0.604, "step": 300 }, { "epoch": 0.35, "learning_rate": 4.103598783649029e-07, "logits/chosen": -1.0781385898590088, "logits/rejected": -0.6068095564842224, "logps/chosen": -542.6256713867188, "logps/rejected": -505.87078857421875, "loss": 0.4248, "rewards/accuracies": 0.78125, "rewards/chosen": -1.779624342918396, "rewards/margins": 1.5406283140182495, "rewards/rejected": -3.3202528953552246, "step": 310 }, { "epoch": 0.36, "learning_rate": 4.026477272750119e-07, "logits/chosen": -0.7725287079811096, "logits/rejected": -0.2756701111793518, "logps/chosen": -545.5137329101562, "logps/rejected": -528.4269409179688, "loss": 0.4226, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.142789602279663, "rewards/margins": 1.3576524257659912, "rewards/rejected": -3.500441789627075, "step": 320 }, { "epoch": 0.37, "learning_rate": 3.9469663422373864e-07, "logits/chosen": -0.9761560559272766, "logits/rejected": -0.6311030983924866, "logps/chosen": -517.2960205078125, "logps/rejected": -506.86328125, "loss": 0.4432, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8108386993408203, "rewards/margins": 1.37090265750885, "rewards/rejected": -3.181741237640381, "step": 330 }, { "epoch": 0.38, "learning_rate": 3.865190451858954e-07, "logits/chosen": -0.865078330039978, "logits/rejected": -0.3488244414329529, "logps/chosen": -540.340087890625, "logps/rejected": -525.5319213867188, "loss": 0.43, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.597611904144287, "rewards/margins": 1.6154896020889282, "rewards/rejected": -3.213101625442505, "step": 340 }, { "epoch": 0.4, "learning_rate": 3.781277606741327e-07, "logits/chosen": -1.0114878416061401, "logits/rejected": -0.7175018191337585, "logps/chosen": -450.4183654785156, "logps/rejected": -459.7533264160156, "loss": 0.4271, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.427380919456482, "rewards/margins": 1.2692419290542603, "rewards/rejected": -2.696622610092163, "step": 350 }, { "epoch": 0.41, "learning_rate": 3.6953591570208996e-07, "logits/chosen": -1.2963850498199463, "logits/rejected": -0.8218928575515747, "logps/chosen": -540.1664428710938, "logps/rejected": -555.89306640625, "loss": 0.4147, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.701040506362915, "rewards/margins": 1.8265488147735596, "rewards/rejected": -3.5275893211364746, "step": 360 }, { "epoch": 0.42, "learning_rate": 3.607569592239452e-07, "logits/chosen": -1.0880775451660156, "logits/rejected": -0.6546664237976074, "logps/chosen": -559.7450561523438, "logps/rejected": -538.1546630859375, "loss": 0.4192, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7286078929901123, "rewards/margins": 1.7532542943954468, "rewards/rejected": -3.4818618297576904, "step": 370 }, { "epoch": 0.43, "learning_rate": 3.518046330825494e-07, "logits/chosen": -1.1186842918395996, "logits/rejected": -0.6067591905593872, "logps/chosen": -560.1696166992188, "logps/rejected": -522.5840454101562, "loss": 0.4349, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8213374614715576, "rewards/margins": 1.5133308172225952, "rewards/rejected": -3.3346683979034424, "step": 380 }, { "epoch": 0.44, "learning_rate": 3.4269295049909713e-07, "logits/chosen": -1.1209189891815186, "logits/rejected": -0.7713836431503296, "logps/chosen": -473.28759765625, "logps/rejected": -484.11065673828125, "loss": 0.3979, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7088820934295654, "rewards/margins": 1.425378441810608, "rewards/rejected": -3.1342601776123047, "step": 390 }, { "epoch": 0.45, "learning_rate": 3.3343617413800453e-07, "logits/chosen": -1.1869983673095703, "logits/rejected": -0.6728812456130981, "logps/chosen": -529.2347412109375, "logps/rejected": -498.1748962402344, "loss": 0.3892, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.649171233177185, "rewards/margins": 1.6641887426376343, "rewards/rejected": -3.3133597373962402, "step": 400 }, { "epoch": 0.45, "eval_logits/chosen": -0.8579260110855103, "eval_logits/rejected": -0.6001935601234436, "eval_logps/chosen": -464.98876953125, "eval_logps/rejected": -575.0087280273438, "eval_loss": 0.5159767866134644, "eval_rewards/accuracies": 0.77734375, "eval_rewards/chosen": -2.079491376876831, "eval_rewards/margins": 1.0970630645751953, "eval_rewards/rejected": -3.1765542030334473, "eval_runtime": 53.0852, "eval_samples_per_second": 37.675, "eval_steps_per_second": 0.603, "step": 400 }, { "epoch": 0.46, "learning_rate": 3.2404879378132893e-07, "logits/chosen": -0.8699030876159668, "logits/rejected": -0.48875007033348083, "logps/chosen": -468.9755859375, "logps/rejected": -495.357421875, "loss": 0.4084, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6207454204559326, "rewards/margins": 1.6607239246368408, "rewards/rejected": -3.2814698219299316, "step": 410 }, { "epoch": 0.48, "learning_rate": 3.1454550364767894e-07, "logits/chosen": -1.098257303237915, "logits/rejected": -0.709359347820282, "logps/chosen": -512.3826904296875, "logps/rejected": -523.95458984375, "loss": 0.4354, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7999498844146729, "rewards/margins": 1.4373884201049805, "rewards/rejected": -3.2373383045196533, "step": 420 }, { "epoch": 0.49, "learning_rate": 3.049411793911154e-07, "logits/chosen": -0.9810858964920044, "logits/rejected": -0.6282259821891785, "logps/chosen": -509.84368896484375, "logps/rejected": -517.60107421875, "loss": 0.3974, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8548433780670166, "rewards/margins": 1.386967420578003, "rewards/rejected": -3.2418110370635986, "step": 430 }, { "epoch": 0.5, "learning_rate": 2.9525085481604914e-07, "logits/chosen": -0.6511877775192261, "logits/rejected": -0.07081355899572372, "logps/chosen": -509.661376953125, "logps/rejected": -524.3201904296875, "loss": 0.4151, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8540523052215576, "rewards/margins": 1.5340583324432373, "rewards/rejected": -3.388110399246216, "step": 440 }, { "epoch": 0.51, "learning_rate": 2.854896983445833e-07, "logits/chosen": -0.5572197437286377, "logits/rejected": 0.0708194151520729, "logps/chosen": -562.8184814453125, "logps/rejected": -528.6909790039062, "loss": 0.4329, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.886749267578125, "rewards/margins": 1.5286136865615845, "rewards/rejected": -3.415362596511841, "step": 450 }, { "epoch": 0.52, "learning_rate": 2.7567298927313654e-07, "logits/chosen": -0.8817178606987, "logits/rejected": -0.6781443357467651, "logps/chosen": -470.3621520996094, "logps/rejected": -495.33111572265625, "loss": 0.4137, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4939197301864624, "rewards/margins": 1.3965364694595337, "rewards/rejected": -2.890456199645996, "step": 460 }, { "epoch": 0.53, "learning_rate": 2.658160938555123e-07, "logits/chosen": -0.900059700012207, "logits/rejected": -0.38037875294685364, "logps/chosen": -530.0759887695312, "logps/rejected": -549.6453857421875, "loss": 0.3727, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6465427875518799, "rewards/margins": 1.722328543663025, "rewards/rejected": -3.3688716888427734, "step": 470 }, { "epoch": 0.54, "learning_rate": 2.559344412498532e-07, "logits/chosen": -0.5834644436836243, "logits/rejected": -0.024540895596146584, "logps/chosen": -526.4287719726562, "logps/rejected": -525.1407470703125, "loss": 0.4301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8217204809188843, "rewards/margins": 1.5024107694625854, "rewards/rejected": -3.3241310119628906, "step": 480 }, { "epoch": 0.55, "learning_rate": 2.460434993671294e-07, "logits/chosen": -0.999637246131897, "logits/rejected": -0.7088354825973511, "logps/chosen": -467.12353515625, "logps/rejected": -472.5782775878906, "loss": 0.3968, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5620094537734985, "rewards/margins": 1.4622641801834106, "rewards/rejected": -3.02427339553833, "step": 490 }, { "epoch": 0.57, "learning_rate": 2.361587506589672e-07, "logits/chosen": -1.169862151145935, "logits/rejected": -0.6735583543777466, "logps/chosen": -547.8793334960938, "logps/rejected": -530.5782470703125, "loss": 0.3964, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.728243112564087, "rewards/margins": 1.6642783880233765, "rewards/rejected": -3.392521381378174, "step": 500 }, { "epoch": 0.57, "eval_logits/chosen": -1.018913984298706, "eval_logits/rejected": -0.8011811375617981, "eval_logps/chosen": -476.00384521484375, "eval_logps/rejected": -588.1665649414062, "eval_loss": 0.49919986724853516, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -2.1896419525146484, "eval_rewards/margins": 1.11849045753479, "eval_rewards/rejected": -3.3081324100494385, "eval_runtime": 73.4341, "eval_samples_per_second": 27.235, "eval_steps_per_second": 0.436, "step": 500 }, { "epoch": 0.58, "learning_rate": 2.2629566788271613e-07, "logits/chosen": -1.1643812656402588, "logits/rejected": -0.6770884394645691, "logps/chosen": -498.718994140625, "logps/rejected": -513.8646240234375, "loss": 0.4072, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7343899011611938, "rewards/margins": 1.711033582687378, "rewards/rejected": -3.4454236030578613, "step": 510 }, { "epoch": 0.59, "learning_rate": 2.1646968988169135e-07, "logits/chosen": -1.2519400119781494, "logits/rejected": -0.7656970620155334, "logps/chosen": -552.4429931640625, "logps/rejected": -580.3065185546875, "loss": 0.3859, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9031795263290405, "rewards/margins": 1.720510721206665, "rewards/rejected": -3.623690366744995, "step": 520 }, { "epoch": 0.6, "learning_rate": 2.0669619741850232e-07, "logits/chosen": -1.166473388671875, "logits/rejected": -0.5304248929023743, "logps/chosen": -543.8204345703125, "logps/rejected": -517.4561767578125, "loss": 0.4265, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9481357336044312, "rewards/margins": 1.5012562274932861, "rewards/rejected": -3.4493918418884277, "step": 530 }, { "epoch": 0.61, "learning_rate": 1.9699048909929518e-07, "logits/chosen": -1.3502863645553589, "logits/rejected": -0.972245991230011, "logps/chosen": -513.7689819335938, "logps/rejected": -506.95953369140625, "loss": 0.3917, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8054568767547607, "rewards/margins": 1.3885786533355713, "rewards/rejected": -3.194035291671753, "step": 540 }, { "epoch": 0.62, "learning_rate": 1.8736775742659732e-07, "logits/chosen": -1.1914501190185547, "logits/rejected": -0.8519012331962585, "logps/chosen": -489.40234375, "logps/rejected": -517.2066040039062, "loss": 0.3892, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6697397232055664, "rewards/margins": 1.5511436462402344, "rewards/rejected": -3.2208831310272217, "step": 550 }, { "epoch": 0.63, "learning_rate": 1.7784306501824616e-07, "logits/chosen": -1.1693607568740845, "logits/rejected": -0.501569926738739, "logps/chosen": -549.26220703125, "logps/rejected": -523.9554443359375, "loss": 0.4249, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8511940240859985, "rewards/margins": 1.4909955263137817, "rewards/rejected": -3.342189311981201, "step": 560 }, { "epoch": 0.65, "learning_rate": 1.6843132102963025e-07, "logits/chosen": -1.1927831172943115, "logits/rejected": -0.8532694578170776, "logps/chosen": -539.3836669921875, "logps/rejected": -510.72637939453125, "loss": 0.3897, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6676031351089478, "rewards/margins": 1.5155996084213257, "rewards/rejected": -3.1832027435302734, "step": 570 }, { "epoch": 0.66, "learning_rate": 1.591472578161458e-07, "logits/chosen": -1.3109443187713623, "logits/rejected": -0.9485646486282349, "logps/chosen": -494.5043029785156, "logps/rejected": -489.75537109375, "loss": 0.4009, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5513614416122437, "rewards/margins": 1.510770559310913, "rewards/rejected": -3.062131881713867, "step": 580 }, { "epoch": 0.67, "learning_rate": 1.5000540787240274e-07, "logits/chosen": -1.2452589273452759, "logits/rejected": -0.857632040977478, "logps/chosen": -504.5924377441406, "logps/rejected": -519.4934692382812, "loss": 0.3993, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7807199954986572, "rewards/margins": 1.571396827697754, "rewards/rejected": -3.352116823196411, "step": 590 }, { "epoch": 0.68, "learning_rate": 1.410200810842749e-07, "logits/chosen": -1.2575485706329346, "logits/rejected": -0.8479830622673035, "logps/chosen": -503.79388427734375, "logps/rejected": -516.9588012695312, "loss": 0.4149, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7127326726913452, "rewards/margins": 1.6169878244400024, "rewards/rejected": -3.3297202587127686, "step": 600 }, { "epoch": 0.68, "eval_logits/chosen": -1.2397898435592651, "eval_logits/rejected": -1.0526514053344727, "eval_logps/chosen": -477.6524658203125, "eval_logps/rejected": -589.7600708007812, "eval_loss": 0.4948367774486542, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": -2.2061285972595215, "eval_rewards/margins": 1.1179393529891968, "eval_rewards/rejected": -3.3240678310394287, "eval_runtime": 53.1159, "eval_samples_per_second": 37.654, "eval_steps_per_second": 0.602, "step": 600 }, { "epoch": 0.69, "learning_rate": 1.322053423294041e-07, "logits/chosen": -1.256247639656067, "logits/rejected": -0.9272082448005676, "logps/chosen": -501.60675048828125, "logps/rejected": -531.3302001953125, "loss": 0.4028, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7432218790054321, "rewards/margins": 1.7136541604995728, "rewards/rejected": -3.456876039505005, "step": 610 }, { "epoch": 0.7, "learning_rate": 1.2357498946121905e-07, "logits/chosen": -1.3026126623153687, "logits/rejected": -0.9675828218460083, "logps/chosen": -534.3182373046875, "logps/rejected": -527.3935546875, "loss": 0.4187, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8490867614746094, "rewards/margins": 1.5810914039611816, "rewards/rejected": -3.430178165435791, "step": 620 }, { "epoch": 0.71, "learning_rate": 1.1514253171093161e-07, "logits/chosen": -1.2203739881515503, "logits/rejected": -0.7822047472000122, "logps/chosen": -493.72821044921875, "logps/rejected": -506.88690185546875, "loss": 0.4051, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6152998208999634, "rewards/margins": 1.5858867168426514, "rewards/rejected": -3.201186418533325, "step": 630 }, { "epoch": 0.72, "learning_rate": 1.0692116854131883e-07, "logits/chosen": -1.000585913658142, "logits/rejected": -0.7414053678512573, "logps/chosen": -493.9928283691406, "logps/rejected": -534.2704467773438, "loss": 0.3866, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7848412990570068, "rewards/margins": 1.534330129623413, "rewards/rejected": -3.319171905517578, "step": 640 }, { "epoch": 0.74, "learning_rate": 9.89237689853889e-08, "logits/chosen": -0.9636529684066772, "logits/rejected": -0.6193439364433289, "logps/chosen": -499.71234130859375, "logps/rejected": -517.5823974609375, "loss": 0.394, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.829134225845337, "rewards/margins": 1.6988353729248047, "rewards/rejected": -3.5279693603515625, "step": 650 }, { "epoch": 0.75, "learning_rate": 9.11628515022765e-08, "logits/chosen": -1.0789777040481567, "logits/rejected": -0.6399408578872681, "logps/chosen": -513.3380737304688, "logps/rejected": -544.7978515625, "loss": 0.3623, "rewards/accuracies": 0.875, "rewards/chosen": -1.7524398565292358, "rewards/margins": 1.8130983114242554, "rewards/rejected": -3.565537929534912, "step": 660 }, { "epoch": 0.76, "learning_rate": 8.365056438189486e-08, "logits/chosen": -1.0069674253463745, "logits/rejected": -0.5994616746902466, "logps/chosen": -542.05712890625, "logps/rejected": -564.6227416992188, "loss": 0.4122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9871015548706055, "rewards/margins": 1.6281112432479858, "rewards/rejected": -3.6152126789093018, "step": 670 }, { "epoch": 0.77, "learning_rate": 7.639866672902101e-08, "logits/chosen": -1.0949068069458008, "logits/rejected": -0.7090824246406555, "logps/chosen": -549.8911743164062, "logps/rejected": -559.15771484375, "loss": 0.4132, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8535239696502686, "rewards/margins": 1.733758568763733, "rewards/rejected": -3.587282657623291, "step": 680 }, { "epoch": 0.78, "learning_rate": 6.941851005657851e-08, "logits/chosen": -1.1339385509490967, "logits/rejected": -0.738599419593811, "logps/chosen": -494.1913146972656, "logps/rejected": -504.7791442871094, "loss": 0.3813, "rewards/accuracies": 0.84375, "rewards/chosen": -1.752722978591919, "rewards/margins": 1.4443397521972656, "rewards/rejected": -3.1970624923706055, "step": 690 }, { "epoch": 0.79, "learning_rate": 6.272102051693051e-08, "logits/chosen": -1.2199567556381226, "logits/rejected": -0.9412355422973633, "logps/chosen": -552.1275024414062, "logps/rejected": -515.4296264648438, "loss": 0.4004, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7129371166229248, "rewards/margins": 1.4609147310256958, "rewards/rejected": -3.1738522052764893, "step": 700 }, { "epoch": 0.79, "eval_logits/chosen": -1.0643391609191895, "eval_logits/rejected": -0.8519161343574524, "eval_logps/chosen": -474.26617431640625, "eval_logps/rejected": -593.8731079101562, "eval_loss": 0.49052032828330994, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": -2.1722652912139893, "eval_rewards/margins": 1.192933201789856, "eval_rewards/rejected": -3.3651983737945557, "eval_runtime": 53.0717, "eval_samples_per_second": 37.685, "eval_steps_per_second": 0.603, "step": 700 }, { "epoch": 0.8, "learning_rate": 5.6316681798995844e-08, "logits/chosen": -1.0180628299713135, "logits/rejected": -0.7236673831939697, "logps/chosen": -491.35565185546875, "logps/rejected": -525.1397705078125, "loss": 0.3851, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7499040365219116, "rewards/margins": 1.7332220077514648, "rewards/rejected": -3.483126163482666, "step": 710 }, { "epoch": 0.81, "learning_rate": 5.0215518717961256e-08, "logits/chosen": -1.0655405521392822, "logits/rejected": -0.6608148813247681, "logps/chosen": -525.560302734375, "logps/rejected": -529.1053466796875, "loss": 0.3984, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8151214122772217, "rewards/margins": 1.7203428745269775, "rewards/rejected": -3.53546404838562, "step": 720 }, { "epoch": 0.83, "learning_rate": 4.4427081523275925e-08, "logits/chosen": -1.0117073059082031, "logits/rejected": -0.715721607208252, "logps/chosen": -504.0994567871094, "logps/rejected": -539.0814208984375, "loss": 0.3756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8760732412338257, "rewards/margins": 1.5654491186141968, "rewards/rejected": -3.4415221214294434, "step": 730 }, { "epoch": 0.84, "learning_rate": 3.896043094949061e-08, "logits/chosen": -1.1520367860794067, "logits/rejected": -0.5986729860305786, "logps/chosen": -532.8388061523438, "logps/rejected": -558.7303466796875, "loss": 0.4003, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8816320896148682, "rewards/margins": 1.7765041589736938, "rewards/rejected": -3.6581363677978516, "step": 740 }, { "epoch": 0.85, "learning_rate": 3.3824124033343557e-08, "logits/chosen": -0.8991321325302124, "logits/rejected": -0.6385317444801331, "logps/chosen": -567.1549072265625, "logps/rejected": -579.142578125, "loss": 0.3994, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1505260467529297, "rewards/margins": 1.6301181316375732, "rewards/rejected": -3.780644178390503, "step": 750 }, { "epoch": 0.86, "learning_rate": 2.9026200719291904e-08, "logits/chosen": -1.001379370689392, "logits/rejected": -0.6102081537246704, "logps/chosen": -508.70147705078125, "logps/rejected": -536.8555297851562, "loss": 0.4286, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9450082778930664, "rewards/margins": 1.5907418727874756, "rewards/rejected": -3.535750150680542, "step": 760 }, { "epoch": 0.87, "learning_rate": 2.4574171274456433e-08, "logits/chosen": -1.0912601947784424, "logits/rejected": -0.6700750589370728, "logps/chosen": -519.89208984375, "logps/rejected": -525.0479736328125, "loss": 0.3678, "rewards/accuracies": 0.8125, "rewards/chosen": -1.884690523147583, "rewards/margins": 1.6182489395141602, "rewards/rejected": -3.502938747406006, "step": 770 }, { "epoch": 0.88, "learning_rate": 2.047500453267881e-08, "logits/chosen": -1.0198689699172974, "logits/rejected": -0.5865429043769836, "logps/chosen": -526.2073974609375, "logps/rejected": -554.8323974609375, "loss": 0.3887, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9122520685195923, "rewards/margins": 1.7002776861190796, "rewards/rejected": -3.612529754638672, "step": 780 }, { "epoch": 0.89, "learning_rate": 1.673511698609292e-08, "logits/chosen": -0.9797528982162476, "logits/rejected": -0.5832753777503967, "logps/chosen": -553.0879516601562, "logps/rejected": -561.693603515625, "loss": 0.3901, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9004099369049072, "rewards/margins": 1.7125294208526611, "rewards/rejected": -3.6129393577575684, "step": 790 }, { "epoch": 0.91, "learning_rate": 1.3360362741285769e-08, "logits/chosen": -1.0027343034744263, "logits/rejected": -0.7030217051506042, "logps/chosen": -501.4288024902344, "logps/rejected": -526.0103759765625, "loss": 0.3887, "rewards/accuracies": 0.8125, "rewards/chosen": -1.869350790977478, "rewards/margins": 1.5730822086334229, "rewards/rejected": -3.4424331188201904, "step": 800 }, { "epoch": 0.91, "eval_logits/chosen": -0.9597108364105225, "eval_logits/rejected": -0.7242004871368408, "eval_logps/chosen": -487.77545166015625, "eval_logps/rejected": -609.3139038085938, "eval_loss": 0.4919503927230835, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -2.3073582649230957, "eval_rewards/margins": 1.21224844455719, "eval_rewards/rejected": -3.519606590270996, "eval_runtime": 53.0285, "eval_samples_per_second": 37.716, "eval_steps_per_second": 0.603, "step": 800 }, { "epoch": 0.92, "learning_rate": 1.0356024355769433e-08, "logits/chosen": -1.0092878341674805, "logits/rejected": -0.7896069884300232, "logps/chosen": -532.9703369140625, "logps/rejected": -526.88330078125, "loss": 0.3767, "rewards/accuracies": 0.78125, "rewards/chosen": -1.830877661705017, "rewards/margins": 1.5391663312911987, "rewards/rejected": -3.370044231414795, "step": 810 }, { "epoch": 0.93, "learning_rate": 7.726804569108597e-09, "logits/chosen": -1.1117920875549316, "logits/rejected": -0.6487603187561035, "logps/chosen": -553.5621337890625, "logps/rejected": -571.2651977539062, "loss": 0.4191, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9834985733032227, "rewards/margins": 1.6336091756820679, "rewards/rejected": -3.61710786819458, "step": 820 }, { "epoch": 0.94, "learning_rate": 5.476818941645561e-09, "logits/chosen": -1.1343705654144287, "logits/rejected": -0.5430102348327637, "logps/chosen": -569.5715942382812, "logps/rejected": -541.0032958984375, "loss": 0.3755, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8141847848892212, "rewards/margins": 1.6208727359771729, "rewards/rejected": -3.4350574016571045, "step": 830 }, { "epoch": 0.95, "learning_rate": 3.609589412347347e-09, "logits/chosen": -1.0426546335220337, "logits/rejected": -0.6130795478820801, "logps/chosen": -517.9793701171875, "logps/rejected": -554.4876098632812, "loss": 0.3741, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7485253810882568, "rewards/margins": 1.8532413244247437, "rewards/rejected": -3.601766586303711, "step": 840 }, { "epoch": 0.96, "learning_rate": 2.1280387858572667e-09, "logits/chosen": -0.9971386194229126, "logits/rejected": -0.6877419352531433, "logps/chosen": -496.14208984375, "logps/rejected": -510.2310485839844, "loss": 0.3879, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8203623294830322, "rewards/margins": 1.555535078048706, "rewards/rejected": -3.3758976459503174, "step": 850 }, { "epoch": 0.97, "learning_rate": 1.03448615738172e-09, "logits/chosen": -1.004620909690857, "logits/rejected": -0.6449930667877197, "logps/chosen": -515.4451904296875, "logps/rejected": -543.5704956054688, "loss": 0.3946, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7996273040771484, "rewards/margins": 1.7972816228866577, "rewards/rejected": -3.5969085693359375, "step": 860 }, { "epoch": 0.98, "learning_rate": 3.3064328257259575e-10, "logits/chosen": -1.0538240671157837, "logits/rejected": -0.67207270860672, "logps/chosen": -510.56414794921875, "logps/rejected": -538.0349731445312, "loss": 0.3821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.884305715560913, "rewards/margins": 1.7009254693984985, "rewards/rejected": -3.585231065750122, "step": 870 }, { "epoch": 1.0, "learning_rate": 1.7611898088715216e-11, "logits/chosen": -1.063408613204956, "logits/rejected": -0.8093023300170898, "logps/chosen": -536.09716796875, "logps/rejected": -549.3472290039062, "loss": 0.4077, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7760584354400635, "rewards/margins": 1.5979098081588745, "rewards/rejected": -3.3739686012268066, "step": 880 }, { "epoch": 1.0, "step": 883, "total_flos": 0.0, "train_loss": 0.43981607611020046, "train_runtime": 8273.4147, "train_samples_per_second": 13.662, "train_steps_per_second": 0.107 } ], "logging_steps": 10, "max_steps": 883, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }