{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994340690435767, "eval_steps": 100, "global_step": 883, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.617977528089887e-09, "logits/chosen": -2.8261122703552246, "logits/rejected": -2.782524824142456, "logps/chosen": -386.01312255859375, "logps/rejected": -174.26467895507812, "loss": 0.2845, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.617977528089887e-08, "logits/chosen": -2.770503044128418, "logits/rejected": -2.7067270278930664, "logps/chosen": -337.03875732421875, "logps/rejected": -169.81399536132812, "loss": 0.2812, "rewards/accuracies": 0.3541666567325592, "rewards/chosen": -0.0004154888156335801, "rewards/margins": -0.0005714126164093614, "rewards/rejected": 0.00015592378622386605, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.1235955056179774e-07, "logits/chosen": -2.804004192352295, "logits/rejected": -2.800266742706299, "logps/chosen": -306.04486083984375, "logps/rejected": -189.47840881347656, "loss": 0.2839, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.001905800774693489, "rewards/margins": 0.003199492348358035, "rewards/rejected": -0.0012936916900798678, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.6853932584269663e-07, "logits/chosen": -2.7842857837677, "logits/rejected": -2.7480905055999756, "logps/chosen": -340.1336669921875, "logps/rejected": -176.36483764648438, "loss": 0.2736, "rewards/accuracies": 0.65625, "rewards/chosen": 0.013033352792263031, "rewards/margins": 0.02086206153035164, "rewards/rejected": -0.007828707806766033, "step": 30 }, { "epoch": 0.05, "learning_rate": 2.2471910112359549e-07, "logits/chosen": -2.7320685386657715, "logits/rejected": -2.694530963897705, "logps/chosen": -318.46612548828125, "logps/rejected": -170.09690856933594, "loss": 0.2779, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.011926891282200813, "rewards/margins": 0.05010765790939331, "rewards/rejected": -0.038180768489837646, "step": 40 }, { "epoch": 0.06, "learning_rate": 2.8089887640449437e-07, "logits/chosen": -2.638516902923584, "logits/rejected": -2.6276156902313232, "logps/chosen": -326.9003601074219, "logps/rejected": -187.60606384277344, "loss": 0.2637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04087567329406738, "rewards/margins": 0.14330647885799408, "rewards/rejected": -0.1024308055639267, "step": 50 }, { "epoch": 0.07, "learning_rate": 3.3707865168539325e-07, "logits/chosen": -2.584765672683716, "logits/rejected": -2.537776231765747, "logps/chosen": -350.67724609375, "logps/rejected": -225.81283569335938, "loss": 0.2493, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.013082382269203663, "rewards/margins": 0.19787843525409698, "rewards/rejected": -0.21096083521842957, "step": 60 }, { "epoch": 0.08, "learning_rate": 3.9325842696629214e-07, "logits/chosen": -2.5709831714630127, "logits/rejected": -2.5257468223571777, "logps/chosen": -342.55291748046875, "logps/rejected": -237.45059204101562, "loss": 0.2242, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19872836768627167, "rewards/margins": 0.2264036238193512, "rewards/rejected": -0.42513203620910645, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.4943820224719097e-07, "logits/chosen": -2.46580171585083, "logits/rejected": -2.4357619285583496, "logps/chosen": -338.7400817871094, "logps/rejected": -255.5446319580078, "loss": 0.187, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2514013946056366, "rewards/margins": 0.39069053530693054, "rewards/rejected": -0.6420919895172119, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.999980431020109e-07, "logits/chosen": -2.492363452911377, "logits/rejected": -2.4689993858337402, "logps/chosen": -373.74151611328125, "logps/rejected": -278.97564697265625, "loss": 0.1471, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2644743025302887, "rewards/margins": 0.5433769822120667, "rewards/rejected": -0.8078513145446777, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.997632524101301e-07, "logits/chosen": -2.5200695991516113, "logits/rejected": -2.493776798248291, "logps/chosen": -365.40191650390625, "logps/rejected": -278.2363586425781, "loss": 0.1485, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.31133976578712463, "rewards/margins": 0.6167228817939758, "rewards/rejected": -0.9280625581741333, "step": 100 }, { "epoch": 0.11, "eval_logits/chosen": -2.513258934020996, "eval_logits/rejected": -2.4997854232788086, "eval_logps/chosen": -313.2470703125, "eval_logps/rejected": -334.7263488769531, "eval_loss": 0.18025632202625275, "eval_rewards/accuracies": 0.640625, "eval_rewards/chosen": -0.5620743632316589, "eval_rewards/margins": 0.2116563767194748, "eval_rewards/rejected": -0.7737306952476501, "eval_runtime": 53.274, "eval_samples_per_second": 37.542, "eval_steps_per_second": 0.601, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.991375032514749e-07, "logits/chosen": -2.4822776317596436, "logits/rejected": -2.445827007293701, "logps/chosen": -388.3408508300781, "logps/rejected": -346.55419921875, "loss": 0.1265, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5192545652389526, "rewards/margins": 0.6625990867614746, "rewards/rejected": -1.1818536520004272, "step": 110 }, { "epoch": 0.14, "learning_rate": 4.98121775121344e-07, "logits/chosen": -2.4391703605651855, "logits/rejected": -2.4168076515197754, "logps/chosen": -354.26580810546875, "logps/rejected": -301.9068298339844, "loss": 0.1549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4692833423614502, "rewards/margins": 0.6351093053817749, "rewards/rejected": -1.104392647743225, "step": 120 }, { "epoch": 0.15, "learning_rate": 4.96717657955441e-07, "logits/chosen": -2.456784963607788, "logits/rejected": -2.402589797973633, "logps/chosen": -380.1900939941406, "logps/rejected": -345.0179443359375, "loss": 0.1076, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7707170248031616, "rewards/margins": 0.8249391317367554, "rewards/rejected": -1.595656156539917, "step": 130 }, { "epoch": 0.16, "learning_rate": 4.949273496411216e-07, "logits/chosen": -2.3544960021972656, "logits/rejected": -2.34092378616333, "logps/chosen": -446.9285583496094, "logps/rejected": -376.6002197265625, "loss": 0.0663, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0127520561218262, "rewards/margins": 1.086138367652893, "rewards/rejected": -2.098890542984009, "step": 140 }, { "epoch": 0.17, "learning_rate": 4.927536525770046e-07, "logits/chosen": -2.376620054244995, "logits/rejected": -2.3480007648468018, "logps/chosen": -434.1917419433594, "logps/rejected": -401.83587646484375, "loss": 0.067, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.034598708152771, "rewards/margins": 0.8842358589172363, "rewards/rejected": -1.9188346862792969, "step": 150 }, { "epoch": 0.18, "learning_rate": 4.901999692863326e-07, "logits/chosen": -2.378688335418701, "logits/rejected": -2.3023369312286377, "logps/chosen": -454.9664611816406, "logps/rejected": -411.72027587890625, "loss": 0.0667, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2886327505111694, "rewards/margins": 0.9617815017700195, "rewards/rejected": -2.2504146099090576, "step": 160 }, { "epoch": 0.19, "learning_rate": 4.872702970909464e-07, "logits/chosen": -2.327148914337158, "logits/rejected": -2.255551338195801, "logps/chosen": -474.45538330078125, "logps/rejected": -428.8330078125, "loss": 0.0538, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.456198811531067, "rewards/margins": 0.9602320790290833, "rewards/rejected": -2.416430950164795, "step": 170 }, { "epoch": 0.2, "learning_rate": 4.839692218542131e-07, "logits/chosen": -2.3194737434387207, "logits/rejected": -2.289604902267456, "logps/chosen": -398.30853271484375, "logps/rejected": -385.2601013183594, "loss": 0.0682, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9385713338851929, "rewards/margins": 1.0706655979156494, "rewards/rejected": -2.009237051010132, "step": 180 }, { "epoch": 0.22, "learning_rate": 4.803019108026997e-07, "logits/chosen": -2.3338797092437744, "logits/rejected": -2.2716145515441895, "logps/chosen": -423.6573181152344, "logps/rejected": -402.0962219238281, "loss": 0.0702, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.050208568572998, "rewards/margins": 0.8981093168258667, "rewards/rejected": -1.9483178853988647, "step": 190 }, { "epoch": 0.23, "learning_rate": 4.7627410443782887e-07, "logits/chosen": -2.3091745376586914, "logits/rejected": -2.2324705123901367, "logps/chosen": -481.0274353027344, "logps/rejected": -428.9169921875, "loss": 0.0592, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2154655456542969, "rewards/margins": 1.0429514646530151, "rewards/rejected": -2.2584171295166016, "step": 200 }, { "epoch": 0.23, "eval_logits/chosen": -2.2729034423828125, "eval_logits/rejected": -2.2396340370178223, "eval_logps/chosen": -431.05743408203125, "eval_logps/rejected": -490.1517639160156, "eval_loss": 0.06622401624917984, "eval_rewards/accuracies": 0.6796875, "eval_rewards/chosen": -1.7401777505874634, "eval_rewards/margins": 0.5878072381019592, "eval_rewards/rejected": -2.3279850482940674, "eval_runtime": 53.2522, "eval_samples_per_second": 37.557, "eval_steps_per_second": 0.601, "step": 200 }, { "epoch": 0.24, "learning_rate": 4.7189210755018034e-07, "logits/chosen": -2.2162270545959473, "logits/rejected": -2.109973430633545, "logps/chosen": -463.5928649902344, "logps/rejected": -448.2275390625, "loss": 0.0504, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.341931939125061, "rewards/margins": 1.1762769222259521, "rewards/rejected": -2.5182089805603027, "step": 210 }, { "epoch": 0.25, "learning_rate": 4.671627793504988e-07, "logits/chosen": -2.222045421600342, "logits/rejected": -2.127629280090332, "logps/chosen": -493.30133056640625, "logps/rejected": -467.193115234375, "loss": 0.0508, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4817438125610352, "rewards/margins": 1.220350980758667, "rewards/rejected": -2.702094554901123, "step": 220 }, { "epoch": 0.26, "learning_rate": 4.6209352273286095e-07, "logits/chosen": -2.233363628387451, "logits/rejected": -2.184459686279297, "logps/chosen": -456.435302734375, "logps/rejected": -444.67449951171875, "loss": 0.0543, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.346243143081665, "rewards/margins": 1.1588122844696045, "rewards/rejected": -2.5050556659698486, "step": 230 }, { "epoch": 0.27, "learning_rate": 4.56692272686805e-07, "logits/chosen": -2.227086067199707, "logits/rejected": -2.1657800674438477, "logps/chosen": -463.6180114746094, "logps/rejected": -463.9268493652344, "loss": 0.0517, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6306241750717163, "rewards/margins": 1.0510774850845337, "rewards/rejected": -2.68170166015625, "step": 240 }, { "epoch": 0.28, "learning_rate": 4.5096748387656326e-07, "logits/chosen": -2.2646913528442383, "logits/rejected": -2.1847498416900635, "logps/chosen": -526.8988647460938, "logps/rejected": -455.5740661621094, "loss": 0.049, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5727968215942383, "rewards/margins": 1.1092311143875122, "rewards/rejected": -2.68202805519104, "step": 250 }, { "epoch": 0.29, "learning_rate": 4.4492811740683877e-07, "logits/chosen": -2.272037982940674, "logits/rejected": -2.1871819496154785, "logps/chosen": -510.32061767578125, "logps/rejected": -452.8402404785156, "loss": 0.0492, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5913618803024292, "rewards/margins": 1.0165178775787354, "rewards/rejected": -2.607879400253296, "step": 260 }, { "epoch": 0.31, "learning_rate": 4.3858362679584354e-07, "logits/chosen": -2.17218017578125, "logits/rejected": -2.114896774291992, "logps/chosen": -487.0856018066406, "logps/rejected": -454.93115234375, "loss": 0.0458, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7943446636199951, "rewards/margins": 0.9125874638557434, "rewards/rejected": -2.706932544708252, "step": 270 }, { "epoch": 0.32, "learning_rate": 4.3194394317755245e-07, "logits/chosen": -2.1766180992126465, "logits/rejected": -2.0868289470672607, "logps/chosen": -574.8035888671875, "logps/rejected": -496.2310485839844, "loss": 0.0411, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0265350341796875, "rewards/margins": 1.1947269439697266, "rewards/rejected": -3.221261501312256, "step": 280 }, { "epoch": 0.33, "learning_rate": 4.2501945975633914e-07, "logits/chosen": -2.130429744720459, "logits/rejected": -2.0812618732452393, "logps/chosen": -565.4262084960938, "logps/rejected": -521.2449951171875, "loss": 0.0359, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1656954288482666, "rewards/margins": 1.03809654712677, "rewards/rejected": -3.203791856765747, "step": 290 }, { "epoch": 0.34, "learning_rate": 4.1782101553832405e-07, "logits/chosen": -2.1119463443756104, "logits/rejected": -2.0517754554748535, "logps/chosen": -525.1202392578125, "logps/rejected": -516.9113159179688, "loss": 0.0394, "rewards/accuracies": 0.75, "rewards/chosen": -2.1311917304992676, "rewards/margins": 1.1441915035247803, "rewards/rejected": -3.2753829956054688, "step": 300 }, { "epoch": 0.34, "eval_logits/chosen": -2.138880968093872, "eval_logits/rejected": -2.1100966930389404, "eval_logps/chosen": -494.104736328125, "eval_logps/rejected": -555.0247802734375, "eval_loss": 0.04939676821231842, "eval_rewards/accuracies": 0.6953125, "eval_rewards/chosen": -2.37065052986145, "eval_rewards/margins": 0.606063961982727, "eval_rewards/rejected": -2.976714611053467, "eval_runtime": 53.3026, "eval_samples_per_second": 37.522, "eval_steps_per_second": 0.6, "step": 300 }, { "epoch": 0.35, "learning_rate": 4.103598783649029e-07, "logits/chosen": -2.1108057498931885, "logits/rejected": -2.0741026401519775, "logps/chosen": -521.8655395507812, "logps/rejected": -494.017578125, "loss": 0.0399, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8981420993804932, "rewards/margins": 0.9213889837265015, "rewards/rejected": -2.819530963897705, "step": 310 }, { "epoch": 0.36, "learning_rate": 4.026477272750119e-07, "logits/chosen": -2.195026159286499, "logits/rejected": -2.128283977508545, "logps/chosen": -519.8826904296875, "logps/rejected": -489.1722717285156, "loss": 0.0407, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8402801752090454, "rewards/margins": 1.171037197113037, "rewards/rejected": -3.011317491531372, "step": 320 }, { "epoch": 0.37, "learning_rate": 3.9469663422373864e-07, "logits/chosen": -2.1483047008514404, "logits/rejected": -2.095182418823242, "logps/chosen": -515.9031372070312, "logps/rejected": -511.58416748046875, "loss": 0.0362, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.987370252609253, "rewards/margins": 1.1986653804779053, "rewards/rejected": -3.186035633087158, "step": 330 }, { "epoch": 0.38, "learning_rate": 3.865190451858954e-07, "logits/chosen": -2.1236190795898438, "logits/rejected": -2.038846492767334, "logps/chosen": -567.5887451171875, "logps/rejected": -531.2391357421875, "loss": 0.0378, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1050643920898438, "rewards/margins": 1.3257628679275513, "rewards/rejected": -3.4308273792266846, "step": 340 }, { "epoch": 0.4, "learning_rate": 3.781277606741327e-07, "logits/chosen": -2.073742628097534, "logits/rejected": -2.0120556354522705, "logps/chosen": -546.9391479492188, "logps/rejected": -522.168701171875, "loss": 0.0363, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.975027084350586, "rewards/margins": 1.3364970684051514, "rewards/rejected": -3.311523914337158, "step": 350 }, { "epoch": 0.41, "learning_rate": 3.6953591570208996e-07, "logits/chosen": -2.0751125812530518, "logits/rejected": -1.9913240671157837, "logps/chosen": -532.4324951171875, "logps/rejected": -570.5128173828125, "loss": 0.0339, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.149209976196289, "rewards/margins": 1.5435600280761719, "rewards/rejected": -3.692770004272461, "step": 360 }, { "epoch": 0.42, "learning_rate": 3.607569592239452e-07, "logits/chosen": -2.0640273094177246, "logits/rejected": -1.9882128238677979, "logps/chosen": -538.2806396484375, "logps/rejected": -503.5306701660156, "loss": 0.0419, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8893718719482422, "rewards/margins": 1.269718050956726, "rewards/rejected": -3.159090042114258, "step": 370 }, { "epoch": 0.43, "learning_rate": 3.518046330825494e-07, "logits/chosen": -2.1525139808654785, "logits/rejected": -2.09255051612854, "logps/chosen": -541.6168823242188, "logps/rejected": -525.7049560546875, "loss": 0.0488, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.759751319885254, "rewards/margins": 1.318419098854065, "rewards/rejected": -3.0781702995300293, "step": 380 }, { "epoch": 0.44, "learning_rate": 3.4269295049909713e-07, "logits/chosen": -2.111579179763794, "logits/rejected": -2.008594512939453, "logps/chosen": -519.3077392578125, "logps/rejected": -495.54736328125, "loss": 0.0479, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9454580545425415, "rewards/margins": 1.1256051063537598, "rewards/rejected": -3.0710630416870117, "step": 390 }, { "epoch": 0.45, "learning_rate": 3.3343617413800453e-07, "logits/chosen": -2.116332530975342, "logits/rejected": -2.0108766555786133, "logps/chosen": -560.86083984375, "logps/rejected": -513.68408203125, "loss": 0.0401, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8742320537567139, "rewards/margins": 1.4719246625900269, "rewards/rejected": -3.346156597137451, "step": 400 }, { "epoch": 0.45, "eval_logits/chosen": -2.0799078941345215, "eval_logits/rejected": -2.042891502380371, "eval_logps/chosen": -499.7916259765625, "eval_logps/rejected": -568.111572265625, "eval_loss": 0.052323117852211, "eval_rewards/accuracies": 0.703125, "eval_rewards/chosen": -2.4275197982788086, "eval_rewards/margins": 0.6800626516342163, "eval_rewards/rejected": -3.1075825691223145, "eval_runtime": 53.2431, "eval_samples_per_second": 37.564, "eval_steps_per_second": 0.601, "step": 400 }, { "epoch": 0.46, "learning_rate": 3.2404879378132893e-07, "logits/chosen": -2.1349825859069824, "logits/rejected": -2.0403599739074707, "logps/chosen": -573.2166137695312, "logps/rejected": -546.21484375, "loss": 0.0393, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9816631078720093, "rewards/margins": 1.3866467475891113, "rewards/rejected": -3.3683102130889893, "step": 410 }, { "epoch": 0.48, "learning_rate": 3.1454550364767894e-07, "logits/chosen": -2.0241355895996094, "logits/rejected": -1.955384612083435, "logps/chosen": -587.6964721679688, "logps/rejected": -564.29833984375, "loss": 0.0336, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.4298739433288574, "rewards/margins": 1.2927566766738892, "rewards/rejected": -3.722630262374878, "step": 420 }, { "epoch": 0.49, "learning_rate": 3.049411793911154e-07, "logits/chosen": -2.0459389686584473, "logits/rejected": -1.9516912698745728, "logps/chosen": -587.4751586914062, "logps/rejected": -571.4771728515625, "loss": 0.0377, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2811179161071777, "rewards/margins": 1.313342571258545, "rewards/rejected": -3.5944607257843018, "step": 430 }, { "epoch": 0.5, "learning_rate": 2.9525085481604914e-07, "logits/chosen": -2.076460361480713, "logits/rejected": -2.008845090866089, "logps/chosen": -501.68743896484375, "logps/rejected": -492.162841796875, "loss": 0.0406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8489151000976562, "rewards/margins": 1.1866779327392578, "rewards/rejected": -3.035592794418335, "step": 440 }, { "epoch": 0.51, "learning_rate": 2.854896983445833e-07, "logits/chosen": -2.0890183448791504, "logits/rejected": -1.99982488155365, "logps/chosen": -555.748291015625, "logps/rejected": -527.487060546875, "loss": 0.0446, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0640885829925537, "rewards/margins": 1.2410598993301392, "rewards/rejected": -3.3051486015319824, "step": 450 }, { "epoch": 0.52, "learning_rate": 2.7567298927313654e-07, "logits/chosen": -2.0581395626068115, "logits/rejected": -1.9576694965362549, "logps/chosen": -517.2969970703125, "logps/rejected": -494.01580810546875, "loss": 0.0444, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7231872081756592, "rewards/margins": 1.3869811296463013, "rewards/rejected": -3.110168695449829, "step": 460 }, { "epoch": 0.53, "learning_rate": 2.658160938555123e-07, "logits/chosen": -2.0949597358703613, "logits/rejected": -1.978247046470642, "logps/chosen": -530.58642578125, "logps/rejected": -518.8834838867188, "loss": 0.0408, "rewards/accuracies": 0.75, "rewards/chosen": -1.8325612545013428, "rewards/margins": 1.4025018215179443, "rewards/rejected": -3.235063076019287, "step": 470 }, { "epoch": 0.54, "learning_rate": 2.559344412498532e-07, "logits/chosen": -2.0444254875183105, "logits/rejected": -1.958059310913086, "logps/chosen": -538.9998779296875, "logps/rejected": -531.4126586914062, "loss": 0.0282, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9186712503433228, "rewards/margins": 1.5285937786102295, "rewards/rejected": -3.447265148162842, "step": 480 }, { "epoch": 0.55, "learning_rate": 2.460434993671294e-07, "logits/chosen": -2.060708522796631, "logits/rejected": -1.9541358947753906, "logps/chosen": -524.92431640625, "logps/rejected": -490.29669189453125, "loss": 0.0367, "rewards/accuracies": 0.75, "rewards/chosen": -1.956221342086792, "rewards/margins": 1.1774486303329468, "rewards/rejected": -3.1336700916290283, "step": 490 }, { "epoch": 0.57, "learning_rate": 2.361587506589672e-07, "logits/chosen": -2.0169599056243896, "logits/rejected": -1.9417072534561157, "logps/chosen": -512.1517333984375, "logps/rejected": -506.2262268066406, "loss": 0.0335, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9970229864120483, "rewards/margins": 1.3175480365753174, "rewards/rejected": -3.314570665359497, "step": 500 }, { "epoch": 0.57, "eval_logits/chosen": -2.0455641746520996, "eval_logits/rejected": -2.005706310272217, "eval_logps/chosen": -497.6726989746094, "eval_logps/rejected": -580.1129150390625, "eval_loss": 0.04609997943043709, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -2.4063305854797363, "eval_rewards/margins": 0.8212659358978271, "eval_rewards/rejected": -3.2275965213775635, "eval_runtime": 53.2473, "eval_samples_per_second": 37.561, "eval_steps_per_second": 0.601, "step": 500 }, { "epoch": 0.58, "learning_rate": 2.2629566788271613e-07, "logits/chosen": -2.0371432304382324, "logits/rejected": -1.9557580947875977, "logps/chosen": -560.0842895507812, "logps/rejected": -532.668212890625, "loss": 0.038, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9359846115112305, "rewards/margins": 1.3490617275238037, "rewards/rejected": -3.285046339035034, "step": 510 }, { "epoch": 0.59, "learning_rate": 2.1646968988169135e-07, "logits/chosen": -2.011871814727783, "logits/rejected": -1.9335308074951172, "logps/chosen": -507.52337646484375, "logps/rejected": -516.07958984375, "loss": 0.0395, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9623944759368896, "rewards/margins": 1.2198023796081543, "rewards/rejected": -3.182196855545044, "step": 520 }, { "epoch": 0.6, "learning_rate": 2.0669619741850232e-07, "logits/chosen": -1.9674116373062134, "logits/rejected": -1.874136209487915, "logps/chosen": -560.131591796875, "logps/rejected": -497.582275390625, "loss": 0.0333, "rewards/accuracies": 0.75, "rewards/chosen": -2.1217665672302246, "rewards/margins": 1.2892667055130005, "rewards/rejected": -3.4110336303710938, "step": 530 }, { "epoch": 0.61, "learning_rate": 1.9699048909929518e-07, "logits/chosen": -1.9731553792953491, "logits/rejected": -1.9132474660873413, "logps/chosen": -545.8017578125, "logps/rejected": -487.0414123535156, "loss": 0.0348, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1676135063171387, "rewards/margins": 1.0356405973434448, "rewards/rejected": -3.203253984451294, "step": 540 }, { "epoch": 0.62, "learning_rate": 1.8736775742659732e-07, "logits/chosen": -1.9223613739013672, "logits/rejected": -1.8391857147216797, "logps/chosen": -517.1666870117188, "logps/rejected": -498.1993713378906, "loss": 0.0342, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.948054313659668, "rewards/margins": 1.2562531232833862, "rewards/rejected": -3.2043070793151855, "step": 550 }, { "epoch": 0.63, "learning_rate": 1.7784306501824616e-07, "logits/chosen": -1.9654014110565186, "logits/rejected": -1.8999271392822266, "logps/chosen": -574.9034423828125, "logps/rejected": -553.96240234375, "loss": 0.0336, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0827620029449463, "rewards/margins": 1.3543148040771484, "rewards/rejected": -3.4370765686035156, "step": 560 }, { "epoch": 0.65, "learning_rate": 1.6843132102963025e-07, "logits/chosen": -1.9901962280273438, "logits/rejected": -1.8920910358428955, "logps/chosen": -571.3399658203125, "logps/rejected": -549.0781860351562, "loss": 0.0364, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1259512901306152, "rewards/margins": 1.4393842220306396, "rewards/rejected": -3.565335512161255, "step": 570 }, { "epoch": 0.66, "learning_rate": 1.591472578161458e-07, "logits/chosen": -2.0036253929138184, "logits/rejected": -1.9164316654205322, "logps/chosen": -597.6680908203125, "logps/rejected": -563.1033935546875, "loss": 0.0314, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.2230658531188965, "rewards/margins": 1.4771394729614258, "rewards/rejected": -3.7002053260803223, "step": 580 }, { "epoch": 0.67, "learning_rate": 1.5000540787240274e-07, "logits/chosen": -1.9651165008544922, "logits/rejected": -1.9028050899505615, "logps/chosen": -573.396728515625, "logps/rejected": -580.0013427734375, "loss": 0.0282, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4309842586517334, "rewards/margins": 1.3270018100738525, "rewards/rejected": -3.757986068725586, "step": 590 }, { "epoch": 0.68, "learning_rate": 1.410200810842749e-07, "logits/chosen": -1.9549171924591064, "logits/rejected": -1.8664356470108032, "logps/chosen": -546.6082153320312, "logps/rejected": -557.6231689453125, "loss": 0.0273, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.497020959854126, "rewards/margins": 1.3337533473968506, "rewards/rejected": -3.8307743072509766, "step": 600 }, { "epoch": 0.68, "eval_logits/chosen": -1.9557578563690186, "eval_logits/rejected": -1.9161585569381714, "eval_logps/chosen": -541.6861572265625, "eval_logps/rejected": -628.8740844726562, "eval_loss": 0.04087229445576668, "eval_rewards/accuracies": 0.70703125, "eval_rewards/chosen": -2.8464653491973877, "eval_rewards/margins": 0.8687426447868347, "eval_rewards/rejected": -3.715208053588867, "eval_runtime": 53.2271, "eval_samples_per_second": 37.575, "eval_steps_per_second": 0.601, "step": 600 }, { "epoch": 0.69, "learning_rate": 1.322053423294041e-07, "logits/chosen": -1.9264421463012695, "logits/rejected": -1.8596795797348022, "logps/chosen": -555.9921875, "logps/rejected": -528.2036743164062, "loss": 0.0364, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3642823696136475, "rewards/margins": 1.1525689363479614, "rewards/rejected": -3.5168514251708984, "step": 610 }, { "epoch": 0.7, "learning_rate": 1.2357498946121905e-07, "logits/chosen": -2.047217845916748, "logits/rejected": -1.9547252655029297, "logps/chosen": -573.0296630859375, "logps/rejected": -531.27880859375, "loss": 0.04, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1570980548858643, "rewards/margins": 1.236627459526062, "rewards/rejected": -3.393725633621216, "step": 620 }, { "epoch": 0.71, "learning_rate": 1.1514253171093161e-07, "logits/chosen": -2.0023763179779053, "logits/rejected": -1.9147300720214844, "logps/chosen": -558.9900512695312, "logps/rejected": -542.7515869140625, "loss": 0.0424, "rewards/accuracies": 0.75, "rewards/chosen": -2.013390064239502, "rewards/margins": 1.3009288311004639, "rewards/rejected": -3.314318895339966, "step": 630 }, { "epoch": 0.72, "learning_rate": 1.0692116854131883e-07, "logits/chosen": -1.9406598806381226, "logits/rejected": -1.8927663564682007, "logps/chosen": -502.59344482421875, "logps/rejected": -507.50372314453125, "loss": 0.0437, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.104419231414795, "rewards/margins": 1.1473711729049683, "rewards/rejected": -3.251791000366211, "step": 640 }, { "epoch": 0.74, "learning_rate": 9.89237689853889e-08, "logits/chosen": -1.9217647314071655, "logits/rejected": -1.840735673904419, "logps/chosen": -539.6429443359375, "logps/rejected": -520.7713623046875, "loss": 0.0348, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0584945678710938, "rewards/margins": 1.4084278345108032, "rewards/rejected": -3.4669222831726074, "step": 650 }, { "epoch": 0.75, "learning_rate": 9.11628515022765e-08, "logits/chosen": -1.936431884765625, "logits/rejected": -1.8774102926254272, "logps/chosen": -507.42413330078125, "logps/rejected": -489.027099609375, "loss": 0.0321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9116840362548828, "rewards/margins": 1.3113796710968018, "rewards/rejected": -3.2230639457702637, "step": 660 }, { "epoch": 0.76, "learning_rate": 8.365056438189486e-08, "logits/chosen": -1.9381834268569946, "logits/rejected": -1.8834428787231445, "logps/chosen": -545.0704345703125, "logps/rejected": -511.5694885253906, "loss": 0.0361, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.27673602104187, "rewards/margins": 1.0493533611297607, "rewards/rejected": -3.326089382171631, "step": 670 }, { "epoch": 0.77, "learning_rate": 7.639866672902101e-08, "logits/chosen": -1.9358975887298584, "logits/rejected": -1.8624534606933594, "logps/chosen": -552.2891845703125, "logps/rejected": -548.515625, "loss": 0.0358, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3348231315612793, "rewards/margins": 1.1951119899749756, "rewards/rejected": -3.529935121536255, "step": 680 }, { "epoch": 0.78, "learning_rate": 6.941851005657851e-08, "logits/chosen": -1.9736747741699219, "logits/rejected": -1.8825995922088623, "logps/chosen": -592.0359497070312, "logps/rejected": -565.7269897460938, "loss": 0.0392, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2588744163513184, "rewards/margins": 1.3530534505844116, "rewards/rejected": -3.6119277477264404, "step": 690 }, { "epoch": 0.79, "learning_rate": 6.272102051693051e-08, "logits/chosen": -1.9536447525024414, "logits/rejected": -1.8848438262939453, "logps/chosen": -544.1442260742188, "logps/rejected": -547.0964965820312, "loss": 0.0377, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0580058097839355, "rewards/margins": 1.3238760232925415, "rewards/rejected": -3.3818821907043457, "step": 700 }, { "epoch": 0.79, "eval_logits/chosen": -1.967289686203003, "eval_logits/rejected": -1.9273663759231567, "eval_logps/chosen": -510.2101745605469, "eval_logps/rejected": -594.1712036132812, "eval_loss": 0.0495593398809433, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -2.531705617904663, "eval_rewards/margins": 0.836473286151886, "eval_rewards/rejected": -3.3681788444519043, "eval_runtime": 53.2631, "eval_samples_per_second": 37.549, "eval_steps_per_second": 0.601, "step": 700 }, { "epoch": 0.8, "learning_rate": 5.6316681798995844e-08, "logits/chosen": -1.9590046405792236, "logits/rejected": -1.8676058053970337, "logps/chosen": -530.419921875, "logps/rejected": -494.63720703125, "loss": 0.0395, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.950580358505249, "rewards/margins": 1.374487280845642, "rewards/rejected": -3.3250679969787598, "step": 710 }, { "epoch": 0.81, "learning_rate": 5.0215518717961256e-08, "logits/chosen": -1.9477427005767822, "logits/rejected": -1.9216333627700806, "logps/chosen": -568.46337890625, "logps/rejected": -551.7440185546875, "loss": 0.0382, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0898067951202393, "rewards/margins": 1.2489385604858398, "rewards/rejected": -3.3387451171875, "step": 720 }, { "epoch": 0.83, "learning_rate": 4.4427081523275925e-08, "logits/chosen": -1.9811092615127563, "logits/rejected": -1.8924198150634766, "logps/chosen": -560.4685668945312, "logps/rejected": -545.5777587890625, "loss": 0.0384, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.166611433029175, "rewards/margins": 1.3478820323944092, "rewards/rejected": -3.514493465423584, "step": 730 }, { "epoch": 0.84, "learning_rate": 3.896043094949061e-08, "logits/chosen": -1.952548623085022, "logits/rejected": -1.9003283977508545, "logps/chosen": -536.4952392578125, "logps/rejected": -554.130859375, "loss": 0.0368, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2593836784362793, "rewards/margins": 1.1702790260314941, "rewards/rejected": -3.4296627044677734, "step": 740 }, { "epoch": 0.85, "learning_rate": 3.3824124033343557e-08, "logits/chosen": -1.978271245956421, "logits/rejected": -1.8869798183441162, "logps/chosen": -595.8272094726562, "logps/rejected": -596.80908203125, "loss": 0.0356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2419838905334473, "rewards/margins": 1.5834529399871826, "rewards/rejected": -3.825437068939209, "step": 750 }, { "epoch": 0.86, "learning_rate": 2.9026200719291904e-08, "logits/chosen": -1.9425359964370728, "logits/rejected": -1.8644979000091553, "logps/chosen": -541.2769775390625, "logps/rejected": -546.4044799804688, "loss": 0.0394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3050169944763184, "rewards/margins": 1.2533900737762451, "rewards/rejected": -3.5584073066711426, "step": 760 }, { "epoch": 0.87, "learning_rate": 2.4574171274456433e-08, "logits/chosen": -1.9859609603881836, "logits/rejected": -1.8982467651367188, "logps/chosen": -581.1209716796875, "logps/rejected": -553.73291015625, "loss": 0.036, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9945024251937866, "rewards/margins": 1.5215613842010498, "rewards/rejected": -3.516064167022705, "step": 770 }, { "epoch": 0.88, "learning_rate": 2.047500453267881e-08, "logits/chosen": -1.9515501260757446, "logits/rejected": -1.857346534729004, "logps/chosen": -565.3553466796875, "logps/rejected": -536.8668823242188, "loss": 0.0422, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2416954040527344, "rewards/margins": 1.2734299898147583, "rewards/rejected": -3.515125274658203, "step": 780 }, { "epoch": 0.89, "learning_rate": 1.673511698609292e-08, "logits/chosen": -1.9258711338043213, "logits/rejected": -1.8735402822494507, "logps/chosen": -532.5035400390625, "logps/rejected": -557.9683837890625, "loss": 0.0354, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2362167835235596, "rewards/margins": 1.4313932657241821, "rewards/rejected": -3.6676101684570312, "step": 790 }, { "epoch": 0.91, "learning_rate": 1.3360362741285769e-08, "logits/chosen": -1.9208987951278687, "logits/rejected": -1.8273556232452393, "logps/chosen": -566.8057861328125, "logps/rejected": -583.4982299804688, "loss": 0.0352, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.093482732772827, "rewards/margins": 1.81829035282135, "rewards/rejected": -3.911773681640625, "step": 800 }, { "epoch": 0.91, "eval_logits/chosen": -1.9501041173934937, "eval_logits/rejected": -1.9090631008148193, "eval_logps/chosen": -521.0439453125, "eval_logps/rejected": -606.3505249023438, "eval_loss": 0.04646110162138939, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -2.640043258666992, "eval_rewards/margins": 0.8499288558959961, "eval_rewards/rejected": -3.4899723529815674, "eval_runtime": 53.2044, "eval_samples_per_second": 37.591, "eval_steps_per_second": 0.601, "step": 800 }, { "epoch": 0.92, "learning_rate": 1.0356024355769433e-08, "logits/chosen": -1.950596570968628, "logits/rejected": -1.882962942123413, "logps/chosen": -582.3919067382812, "logps/rejected": -592.8079223632812, "loss": 0.0361, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.399203062057495, "rewards/margins": 1.3655624389648438, "rewards/rejected": -3.7647652626037598, "step": 810 }, { "epoch": 0.93, "learning_rate": 7.726804569108597e-09, "logits/chosen": -1.9748871326446533, "logits/rejected": -1.888811707496643, "logps/chosen": -553.1395263671875, "logps/rejected": -559.5152587890625, "loss": 0.0332, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.228854179382324, "rewards/margins": 1.4253013134002686, "rewards/rejected": -3.654155731201172, "step": 820 }, { "epoch": 0.94, "learning_rate": 5.476818941645561e-09, "logits/chosen": -1.944598913192749, "logits/rejected": -1.8884025812149048, "logps/chosen": -540.9061279296875, "logps/rejected": -574.0789794921875, "loss": 0.036, "rewards/accuracies": 0.75, "rewards/chosen": -2.298133134841919, "rewards/margins": 1.5589947700500488, "rewards/rejected": -3.8571276664733887, "step": 830 }, { "epoch": 0.95, "learning_rate": 3.609589412347347e-09, "logits/chosen": -1.995321273803711, "logits/rejected": -1.909641981124878, "logps/chosen": -570.28369140625, "logps/rejected": -541.8267211914062, "loss": 0.0347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.329395055770874, "rewards/margins": 1.3247489929199219, "rewards/rejected": -3.654143810272217, "step": 840 }, { "epoch": 0.96, "learning_rate": 2.1280387858572667e-09, "logits/chosen": -1.9664127826690674, "logits/rejected": -1.8978990316390991, "logps/chosen": -567.4363403320312, "logps/rejected": -551.3905639648438, "loss": 0.0347, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.342649459838867, "rewards/margins": 1.2894551753997803, "rewards/rejected": -3.6321043968200684, "step": 850 }, { "epoch": 0.97, "learning_rate": 1.03448615738172e-09, "logits/chosen": -1.91958749294281, "logits/rejected": -1.8549039363861084, "logps/chosen": -635.5400390625, "logps/rejected": -613.4205322265625, "loss": 0.0354, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2879905700683594, "rewards/margins": 1.6663434505462646, "rewards/rejected": -3.954333782196045, "step": 860 }, { "epoch": 0.98, "learning_rate": 3.3064328257259575e-10, "logits/chosen": -1.9804208278656006, "logits/rejected": -1.891332983970642, "logps/chosen": -612.157958984375, "logps/rejected": -556.913818359375, "loss": 0.0366, "rewards/accuracies": 0.75, "rewards/chosen": -2.3392751216888428, "rewards/margins": 1.2889056205749512, "rewards/rejected": -3.628180980682373, "step": 870 }, { "epoch": 1.0, "learning_rate": 1.7611898088715216e-11, "logits/chosen": -1.952614426612854, "logits/rejected": -1.846652626991272, "logps/chosen": -577.5618896484375, "logps/rejected": -596.773193359375, "loss": 0.0347, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3046746253967285, "rewards/margins": 1.4788968563079834, "rewards/rejected": -3.783571720123291, "step": 880 }, { "epoch": 1.0, "step": 883, "total_flos": 0.0, "train_loss": 0.06580274857555349, "train_runtime": 7965.4661, "train_samples_per_second": 14.19, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 883, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }