{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 1, "global_step": 472, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 74.35959798227883, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.0022015571594238, "logits/rejected": -1.0571039915084839, "logps/chosen": -26.953861236572266, "logps/rejected": -41.69861602783203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 71.81906743432833, "learning_rate": 2.083333333333333e-08, "logits/chosen": -0.9866722822189331, "logits/rejected": -1.1117209196090698, "logps/chosen": -33.70802307128906, "logps/rejected": -37.13496398925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 75.40773138219605, "learning_rate": 3.125e-08, "logits/chosen": -1.2932835817337036, "logits/rejected": -1.2812855243682861, "logps/chosen": -30.66956329345703, "logps/rejected": -49.71609878540039, "loss": 0.7141, "rewards/accuracies": 0.3125, "rewards/chosen": -0.015645474195480347, "rewards/margins": -0.0929969847202301, "rewards/rejected": 0.07735151052474976, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 74.47397381287325, "learning_rate": 4.166666666666666e-08, "logits/chosen": -1.0415635108947754, "logits/rejected": -1.0745270252227783, "logps/chosen": -26.642629623413086, "logps/rejected": -38.44277572631836, "loss": 0.7026, "rewards/accuracies": 0.5, "rewards/chosen": 0.048020511865615845, "rewards/margins": -0.018185943365097046, "rewards/rejected": 0.06620645523071289, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 76.30048141954929, "learning_rate": 5.208333333333333e-08, "logits/chosen": -1.2889574766159058, "logits/rejected": -1.219961404800415, "logps/chosen": -31.46270751953125, "logps/rejected": -28.84333610534668, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.04683685302734375, "rewards/margins": -0.016847282648086548, "rewards/rejected": -0.029989570379257202, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 71.45564125608341, "learning_rate": 6.25e-08, "logits/chosen": -1.2035868167877197, "logits/rejected": -1.1329045295715332, "logps/chosen": -35.237884521484375, "logps/rejected": -38.39533996582031, "loss": 0.6914, "rewards/accuracies": 0.4375, "rewards/chosen": -0.014047741889953613, "rewards/margins": 0.0001532137393951416, "rewards/rejected": -0.014200955629348755, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 74.36407794239695, "learning_rate": 7.291666666666667e-08, "logits/chosen": -1.2061525583267212, "logits/rejected": -1.1756294965744019, "logps/chosen": -29.565950393676758, "logps/rejected": -33.2646484375, "loss": 0.7034, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09937351942062378, "rewards/margins": 0.04770404100418091, "rewards/rejected": 0.05166947841644287, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 72.94427557648902, "learning_rate": 8.333333333333333e-08, "logits/chosen": -1.2344228029251099, "logits/rejected": -1.255335807800293, "logps/chosen": -26.22496223449707, "logps/rejected": -39.68927764892578, "loss": 0.7084, "rewards/accuracies": 0.375, "rewards/chosen": 0.005427069962024689, "rewards/margins": -0.014611326158046722, "rewards/rejected": 0.02003839612007141, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 69.72083135093712, "learning_rate": 9.375e-08, "logits/chosen": -1.1401718854904175, "logits/rejected": -1.1010041236877441, "logps/chosen": -33.88338851928711, "logps/rejected": -28.835594177246094, "loss": 0.7, "rewards/accuracies": 0.5625, "rewards/chosen": 0.037932395935058594, "rewards/margins": -0.005142271518707275, "rewards/rejected": 0.04307466745376587, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 71.74909213913158, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.180943250656128, "logits/rejected": -1.090497612953186, "logps/chosen": -37.923099517822266, "logps/rejected": -36.79877471923828, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": 0.031415216624736786, "rewards/margins": -0.0013954713940620422, "rewards/rejected": 0.03281068801879883, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 75.11980776081754, "learning_rate": 1.1458333333333332e-07, "logits/chosen": -0.9823209047317505, "logits/rejected": -0.9876938462257385, "logps/chosen": -27.41145896911621, "logps/rejected": -45.299991607666016, "loss": 0.7077, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0018342137336730957, "rewards/margins": 0.0696558952331543, "rewards/rejected": -0.07149010896682739, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 71.48536897103016, "learning_rate": 1.25e-07, "logits/chosen": -0.9993177652359009, "logits/rejected": -0.9222314953804016, "logps/chosen": -23.32973289489746, "logps/rejected": -32.4486083984375, "loss": 0.6838, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01836332678794861, "rewards/margins": 0.00046828389167785645, "rewards/rejected": -0.018831610679626465, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 69.73428591175114, "learning_rate": 1.3541666666666666e-07, "logits/chosen": -1.4160847663879395, "logits/rejected": -1.2769191265106201, "logps/chosen": -26.515804290771484, "logps/rejected": -36.50257110595703, "loss": 0.7069, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09053070843219757, "rewards/margins": 0.059635356068611145, "rewards/rejected": 0.030895352363586426, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 74.26548135080421, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -1.2310669422149658, "logits/rejected": -1.1061973571777344, "logps/chosen": -38.83403396606445, "logps/rejected": -57.466835021972656, "loss": 0.7022, "rewards/accuracies": 0.4375, "rewards/chosen": -0.014897465705871582, "rewards/margins": 0.015999972820281982, "rewards/rejected": -0.030897438526153564, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 70.61107518338639, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -1.245528221130371, "logits/rejected": -1.2389111518859863, "logps/chosen": -24.09255027770996, "logps/rejected": -35.16242218017578, "loss": 0.7081, "rewards/accuracies": 0.625, "rewards/chosen": 0.009011238813400269, "rewards/margins": 0.01104736328125, "rewards/rejected": -0.0020361244678497314, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 69.06624100974847, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -1.1713544130325317, "logits/rejected": -1.2104028463363647, "logps/chosen": -27.774211883544922, "logps/rejected": -32.56517791748047, "loss": 0.6959, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06307366490364075, "rewards/margins": -0.055293723940849304, "rewards/rejected": -0.007779940962791443, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 74.45322195801226, "learning_rate": 1.7708333333333334e-07, "logits/chosen": -1.2374873161315918, "logits/rejected": -1.2554068565368652, "logps/chosen": -24.268505096435547, "logps/rejected": -35.4179573059082, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001607835292816162, "rewards/margins": -0.01779848337173462, "rewards/rejected": 0.017637699842453003, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 71.43793235652528, "learning_rate": 1.875e-07, "logits/chosen": -1.1041090488433838, "logits/rejected": -1.0679348707199097, "logps/chosen": -23.000707626342773, "logps/rejected": -29.853412628173828, "loss": 0.705, "rewards/accuracies": 0.375, "rewards/chosen": -0.055961236357688904, "rewards/margins": -0.05573050677776337, "rewards/rejected": -0.0002307295799255371, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 71.50823519836709, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -1.0583523511886597, "logits/rejected": -1.0313684940338135, "logps/chosen": -21.926410675048828, "logps/rejected": -35.9990348815918, "loss": 0.7084, "rewards/accuracies": 0.3125, "rewards/chosen": -0.018382668495178223, "rewards/margins": 0.02424493432044983, "rewards/rejected": -0.04262760281562805, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 65.57787832871358, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.2984358072280884, "logits/rejected": -1.2342230081558228, "logps/chosen": -30.60858154296875, "logps/rejected": -38.88047790527344, "loss": 0.7012, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01342683658003807, "rewards/margins": 0.014368299394845963, "rewards/rejected": -0.027795135974884033, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 68.54840408145243, "learning_rate": 2.1875e-07, "logits/chosen": -1.3012466430664062, "logits/rejected": -1.3128973245620728, "logps/chosen": -26.805089950561523, "logps/rejected": -41.52635192871094, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": 0.01039344072341919, "rewards/margins": 0.011234819889068604, "rewards/rejected": -0.0008413791656494141, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 75.35032250165732, "learning_rate": 2.2916666666666663e-07, "logits/chosen": -1.1717727184295654, "logits/rejected": -1.1936640739440918, "logps/chosen": -21.9468994140625, "logps/rejected": -27.52823257446289, "loss": 0.6756, "rewards/accuracies": 0.5625, "rewards/chosen": -0.030159294605255127, "rewards/margins": 0.03967534005641937, "rewards/rejected": -0.0698346346616745, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 69.76965900033692, "learning_rate": 2.3958333333333335e-07, "logits/chosen": -1.227396845817566, "logits/rejected": -1.2514605522155762, "logps/chosen": -30.95319938659668, "logps/rejected": -33.107421875, "loss": 0.6483, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009671658277511597, "rewards/margins": 0.06378874182701111, "rewards/rejected": -0.05411708354949951, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 71.41404237507972, "learning_rate": 2.5e-07, "logits/chosen": -1.1064453125, "logits/rejected": -1.0061161518096924, "logps/chosen": -32.68113327026367, "logps/rejected": -38.1193962097168, "loss": 0.6682, "rewards/accuracies": 0.5625, "rewards/chosen": -0.034807443618774414, "rewards/margins": 0.0188644677400589, "rewards/rejected": -0.05367191135883331, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 70.49202249008462, "learning_rate": 2.604166666666667e-07, "logits/chosen": -1.1899397373199463, "logits/rejected": -1.1957398653030396, "logps/chosen": -37.75098419189453, "logps/rejected": -34.21184539794922, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": 0.003971487283706665, "rewards/margins": 0.10494436323642731, "rewards/rejected": -0.10097287595272064, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 67.14613715257252, "learning_rate": 2.708333333333333e-07, "logits/chosen": -1.3827494382858276, "logits/rejected": -1.3526558876037598, "logps/chosen": -27.053857803344727, "logps/rejected": -33.783485412597656, "loss": 0.6528, "rewards/accuracies": 0.625, "rewards/chosen": 0.015252411365509033, "rewards/margins": 0.1568407416343689, "rewards/rejected": -0.14158833026885986, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 68.86332311369058, "learning_rate": 2.8125e-07, "logits/chosen": -1.0096489191055298, "logits/rejected": -1.0554242134094238, "logps/chosen": -33.25010681152344, "logps/rejected": -35.958675384521484, "loss": 0.6573, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04330983757972717, "rewards/margins": 0.12464988231658936, "rewards/rejected": -0.16795971989631653, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 66.94051666468431, "learning_rate": 2.916666666666667e-07, "logits/chosen": -1.3319611549377441, "logits/rejected": -1.2341638803482056, "logps/chosen": -23.90947914123535, "logps/rejected": -31.997909545898438, "loss": 0.6585, "rewards/accuracies": 0.5, "rewards/chosen": -0.028478246182203293, "rewards/margins": 0.07672013342380524, "rewards/rejected": -0.10519838333129883, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 62.13361032578897, "learning_rate": 3.020833333333333e-07, "logits/chosen": -1.058496356010437, "logits/rejected": -1.061716914176941, "logps/chosen": -27.815153121948242, "logps/rejected": -35.63865661621094, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": 0.002513296902179718, "rewards/margins": 0.08875668793916702, "rewards/rejected": -0.0862433910369873, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 69.40663934648452, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.148958683013916, "logits/rejected": -1.0955352783203125, "logps/chosen": -29.258386611938477, "logps/rejected": -37.004207611083984, "loss": 0.6333, "rewards/accuracies": 0.875, "rewards/chosen": -0.018777184188365936, "rewards/margins": 0.270157128572464, "rewards/rejected": -0.28893429040908813, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 63.75111913966312, "learning_rate": 3.2291666666666666e-07, "logits/chosen": -1.1732263565063477, "logits/rejected": -1.0661126375198364, "logps/chosen": -31.076984405517578, "logps/rejected": -34.91364288330078, "loss": 0.6028, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04974186420440674, "rewards/margins": 0.24639210104942322, "rewards/rejected": -0.19665023684501648, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 63.46121996218027, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.2128429412841797, "logits/rejected": -1.2360285520553589, "logps/chosen": -36.98728942871094, "logps/rejected": -38.032920837402344, "loss": 0.6111, "rewards/accuracies": 0.75, "rewards/chosen": 0.015447601675987244, "rewards/margins": 0.3491629958152771, "rewards/rejected": -0.33371537923812866, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 64.01628740909891, "learning_rate": 3.4375e-07, "logits/chosen": -1.2774078845977783, "logits/rejected": -1.252654790878296, "logps/chosen": -30.110694885253906, "logps/rejected": -39.82551956176758, "loss": 0.5955, "rewards/accuracies": 0.625, "rewards/chosen": -0.12499848008155823, "rewards/margins": 0.44781434535980225, "rewards/rejected": -0.5728127956390381, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 61.04313764198192, "learning_rate": 3.541666666666667e-07, "logits/chosen": -1.2864689826965332, "logits/rejected": -1.1125664710998535, "logps/chosen": -29.14852523803711, "logps/rejected": -35.5062255859375, "loss": 0.5925, "rewards/accuracies": 0.875, "rewards/chosen": -0.07132556289434433, "rewards/margins": 0.29910576343536377, "rewards/rejected": -0.3704313337802887, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 61.863987476747845, "learning_rate": 3.645833333333333e-07, "logits/chosen": -1.1560570001602173, "logits/rejected": -1.1620285511016846, "logps/chosen": -21.810134887695312, "logps/rejected": -45.566375732421875, "loss": 0.5656, "rewards/accuracies": 0.875, "rewards/chosen": -0.03650933504104614, "rewards/margins": 0.7581607699394226, "rewards/rejected": -0.7946701049804688, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 59.689844232963395, "learning_rate": 3.75e-07, "logits/chosen": -1.1466398239135742, "logits/rejected": -1.1647982597351074, "logps/chosen": -24.348777770996094, "logps/rejected": -29.572818756103516, "loss": 0.5827, "rewards/accuracies": 0.5, "rewards/chosen": -0.0986182689666748, "rewards/margins": 0.30504968762397766, "rewards/rejected": -0.40366795659065247, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 66.8530097795874, "learning_rate": 3.8541666666666665e-07, "logits/chosen": -1.1411319971084595, "logits/rejected": -1.1138074398040771, "logps/chosen": -29.88798713684082, "logps/rejected": -30.512065887451172, "loss": 0.5748, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1340489685535431, "rewards/margins": 0.17574873566627502, "rewards/rejected": -0.3097977042198181, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 59.54927270244813, "learning_rate": 3.958333333333333e-07, "logits/chosen": -1.2669272422790527, "logits/rejected": -1.2261359691619873, "logps/chosen": -33.28898239135742, "logps/rejected": -52.16841506958008, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": -0.1103217601776123, "rewards/margins": 0.4893791675567627, "rewards/rejected": -0.599700927734375, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 61.22261904686975, "learning_rate": 4.0625e-07, "logits/chosen": -1.2993597984313965, "logits/rejected": -1.2113492488861084, "logps/chosen": -38.285438537597656, "logps/rejected": -51.55641555786133, "loss": 0.5969, "rewards/accuracies": 0.75, "rewards/chosen": -0.14819863438606262, "rewards/margins": 0.8797410726547241, "rewards/rejected": -1.027939796447754, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 60.12706158104359, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.1191147565841675, "logits/rejected": -0.9404773712158203, "logps/chosen": -27.851015090942383, "logps/rejected": -40.25725173950195, "loss": 0.5564, "rewards/accuracies": 0.625, "rewards/chosen": -0.17417365312576294, "rewards/margins": 0.5528932809829712, "rewards/rejected": -0.7270669341087341, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 57.92726362837684, "learning_rate": 4.270833333333333e-07, "logits/chosen": -1.0899916887283325, "logits/rejected": -0.9453008770942688, "logps/chosen": -26.283432006835938, "logps/rejected": -45.57765579223633, "loss": 0.5461, "rewards/accuracies": 0.875, "rewards/chosen": -0.13252140581607819, "rewards/margins": 0.8450538516044617, "rewards/rejected": -0.9775752425193787, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 58.169474320406394, "learning_rate": 4.375e-07, "logits/chosen": -1.1068731546401978, "logits/rejected": -1.0945335626602173, "logps/chosen": -24.04115104675293, "logps/rejected": -36.62716293334961, "loss": 0.5406, "rewards/accuracies": 0.5, "rewards/chosen": -0.10331939160823822, "rewards/margins": 0.5659677386283875, "rewards/rejected": -0.6692871451377869, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 66.66372536069926, "learning_rate": 4.479166666666667e-07, "logits/chosen": -1.1794297695159912, "logits/rejected": -1.193390965461731, "logps/chosen": -46.91223907470703, "logps/rejected": -41.650333404541016, "loss": 0.5573, "rewards/accuracies": 0.5, "rewards/chosen": -0.3094208538532257, "rewards/margins": 0.14515355229377747, "rewards/rejected": -0.4545744061470032, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 56.78835490397593, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -1.0120888948440552, "logits/rejected": -0.8681447505950928, "logps/chosen": -33.3481330871582, "logps/rejected": -48.61557388305664, "loss": 0.5516, "rewards/accuracies": 0.75, "rewards/chosen": -0.093438059091568, "rewards/margins": 0.624420166015625, "rewards/rejected": -0.7178582549095154, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 59.752166781716426, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.2927764654159546, "logits/rejected": -1.212005615234375, "logps/chosen": -27.116588592529297, "logps/rejected": -48.95513153076172, "loss": 0.5622, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08109956979751587, "rewards/margins": 1.1761130094528198, "rewards/rejected": -1.2572126388549805, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 56.0712139430381, "learning_rate": 4.791666666666667e-07, "logits/chosen": -1.310462474822998, "logits/rejected": -1.1343291997909546, "logps/chosen": -32.363792419433594, "logps/rejected": -41.77903747558594, "loss": 0.4924, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1404968947172165, "rewards/margins": 0.6709720492362976, "rewards/rejected": -0.8114689588546753, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 53.687876472368266, "learning_rate": 4.895833333333333e-07, "logits/chosen": -1.0838322639465332, "logits/rejected": -1.1518497467041016, "logps/chosen": -34.07114791870117, "logps/rejected": -38.58074188232422, "loss": 0.475, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14443516731262207, "rewards/margins": 0.33194631338119507, "rewards/rejected": -0.47638148069381714, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 57.81488552818391, "learning_rate": 5e-07, "logits/chosen": -1.3274493217468262, "logits/rejected": -1.099511981010437, "logps/chosen": -33.7588005065918, "logps/rejected": -51.24869918823242, "loss": 0.46, "rewards/accuracies": 0.875, "rewards/chosen": -0.2817200720310211, "rewards/margins": 1.4236443042755127, "rewards/rejected": -1.705364465713501, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 51.9232811376309, "learning_rate": 4.999931375995349e-07, "logits/chosen": -1.2194585800170898, "logits/rejected": -1.091806173324585, "logps/chosen": -28.378108978271484, "logps/rejected": -36.51158905029297, "loss": 0.4926, "rewards/accuracies": 0.5, "rewards/chosen": -0.31702089309692383, "rewards/margins": 0.6066832542419434, "rewards/rejected": -0.9237041473388672, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 58.79932831253633, "learning_rate": 4.999725507748798e-07, "logits/chosen": -1.4276092052459717, "logits/rejected": -1.2703173160552979, "logps/chosen": -27.375112533569336, "logps/rejected": -45.86325454711914, "loss": 0.5289, "rewards/accuracies": 0.875, "rewards/chosen": -0.21876651048660278, "rewards/margins": 1.0810855627059937, "rewards/rejected": -1.2998521327972412, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 56.405768925690126, "learning_rate": 4.99938240656235e-07, "logits/chosen": -1.1859700679779053, "logits/rejected": -1.110828161239624, "logps/chosen": -30.097301483154297, "logps/rejected": -56.47822189331055, "loss": 0.4842, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14388218522071838, "rewards/margins": 0.8802270889282227, "rewards/rejected": -1.0241093635559082, "step": 51 }, { "epoch": 0.8813559322033898, "grad_norm": 54.84466981430871, "learning_rate": 4.998902091271985e-07, "logits/chosen": -1.2907037734985352, "logits/rejected": -1.198894739151001, "logps/chosen": -23.458566665649414, "logps/rejected": -36.68768310546875, "loss": 0.4399, "rewards/accuracies": 0.875, "rewards/chosen": -0.09736257791519165, "rewards/margins": 0.8091447353363037, "rewards/rejected": -0.9065073132514954, "step": 52 }, { "epoch": 0.8983050847457628, "grad_norm": 53.59884977562605, "learning_rate": 4.998284588246634e-07, "logits/chosen": -1.2666661739349365, "logits/rejected": -1.1450353860855103, "logps/chosen": -32.4919319152832, "logps/rejected": -36.02308654785156, "loss": 0.4643, "rewards/accuracies": 0.875, "rewards/chosen": -0.2568899691104889, "rewards/margins": 0.8810305595397949, "rewards/rejected": -1.137920618057251, "step": 53 }, { "epoch": 0.9152542372881356, "grad_norm": 56.86352488023535, "learning_rate": 4.997529931386719e-07, "logits/chosen": -1.2422281503677368, "logits/rejected": -1.2324570417404175, "logps/chosen": -33.5006217956543, "logps/rejected": -30.282136917114258, "loss": 0.4844, "rewards/accuracies": 0.5, "rewards/chosen": -0.3842281401157379, "rewards/margins": 0.29243284463882446, "rewards/rejected": -0.6766610145568848, "step": 54 }, { "epoch": 0.9322033898305084, "grad_norm": 53.15925076232375, "learning_rate": 4.996638162122302e-07, "logits/chosen": -1.2868903875350952, "logits/rejected": -1.1907857656478882, "logps/chosen": -32.84450149536133, "logps/rejected": -39.07723617553711, "loss": 0.4865, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19609573483467102, "rewards/margins": 1.1344835758209229, "rewards/rejected": -1.3305792808532715, "step": 55 }, { "epoch": 0.9491525423728814, "grad_norm": 50.6107831993107, "learning_rate": 4.995609329410804e-07, "logits/chosen": -1.1869336366653442, "logits/rejected": -1.0896047353744507, "logps/chosen": -25.43665885925293, "logps/rejected": -37.78445816040039, "loss": 0.4331, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2436077892780304, "rewards/margins": 1.5263582468032837, "rewards/rejected": -1.7699658870697021, "step": 56 }, { "epoch": 0.9661016949152542, "grad_norm": 53.83410279226871, "learning_rate": 4.994443489734322e-07, "logits/chosen": -1.140820860862732, "logits/rejected": -1.0942250490188599, "logps/chosen": -28.2945499420166, "logps/rejected": -44.74162292480469, "loss": 0.4513, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28495079278945923, "rewards/margins": 1.6222182512283325, "rewards/rejected": -1.9071691036224365, "step": 57 }, { "epoch": 0.9830508474576272, "grad_norm": 57.20949918087681, "learning_rate": 4.993140707096525e-07, "logits/chosen": -1.2853411436080933, "logits/rejected": -1.1984596252441406, "logps/chosen": -35.58999252319336, "logps/rejected": -42.4873161315918, "loss": 0.4307, "rewards/accuracies": 0.875, "rewards/chosen": -0.26272836327552795, "rewards/margins": 1.1384968757629395, "rewards/rejected": -1.4012253284454346, "step": 58 }, { "epoch": 1.0, "grad_norm": 49.633369993703724, "learning_rate": 4.991701053019145e-07, "logits/chosen": -1.2317020893096924, "logits/rejected": -1.2267297506332397, "logps/chosen": -28.742332458496094, "logps/rejected": -50.28435516357422, "loss": 0.4259, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22372712194919586, "rewards/margins": 1.6622364521026611, "rewards/rejected": -1.8859635591506958, "step": 59 }, { "epoch": 1.0169491525423728, "grad_norm": 43.19271219759473, "learning_rate": 4.990124606538042e-07, "logits/chosen": -1.27030348777771, "logits/rejected": -1.2970830202102661, "logps/chosen": -22.824378967285156, "logps/rejected": -43.53241729736328, "loss": 0.3445, "rewards/accuracies": 0.875, "rewards/chosen": -0.11317068338394165, "rewards/margins": 1.718346118927002, "rewards/rejected": -1.8315167427062988, "step": 60 }, { "epoch": 1.0338983050847457, "grad_norm": 46.56532536625075, "learning_rate": 4.988411454198874e-07, "logits/chosen": -1.2344048023223877, "logits/rejected": -1.376710295677185, "logps/chosen": -30.1800537109375, "logps/rejected": -34.68271255493164, "loss": 0.4151, "rewards/accuracies": 0.625, "rewards/chosen": -0.3319806754589081, "rewards/margins": 0.3052244484424591, "rewards/rejected": -0.637205183506012, "step": 61 }, { "epoch": 1.0508474576271187, "grad_norm": 45.932270396743505, "learning_rate": 4.98656169005234e-07, "logits/chosen": -1.300619125366211, "logits/rejected": -1.1745063066482544, "logps/chosen": -32.63179016113281, "logps/rejected": -39.57646942138672, "loss": 0.3826, "rewards/accuracies": 0.8125, "rewards/chosen": -0.021081820130348206, "rewards/margins": 1.4243545532226562, "rewards/rejected": -1.4454363584518433, "step": 62 }, { "epoch": 1.0677966101694916, "grad_norm": 41.51255029654015, "learning_rate": 4.984575415649018e-07, "logits/chosen": -1.272727370262146, "logits/rejected": -1.1292237043380737, "logps/chosen": -26.08396339416504, "logps/rejected": -47.56403350830078, "loss": 0.3506, "rewards/accuracies": 0.875, "rewards/chosen": -0.11639979481697083, "rewards/margins": 2.3725996017456055, "rewards/rejected": -2.488999366760254, "step": 63 }, { "epoch": 1.0847457627118644, "grad_norm": 38.246979074883626, "learning_rate": 4.982452740033792e-07, "logits/chosen": -1.22151517868042, "logits/rejected": -1.082349181175232, "logps/chosen": -27.676637649536133, "logps/rejected": -33.685211181640625, "loss": 0.3068, "rewards/accuracies": 0.875, "rewards/chosen": 0.00457368791103363, "rewards/margins": 1.3627952337265015, "rewards/rejected": -1.358221411705017, "step": 64 }, { "epoch": 1.1016949152542372, "grad_norm": 40.134511453074204, "learning_rate": 4.980193779739863e-07, "logits/chosen": -1.1573750972747803, "logits/rejected": -1.0462363958358765, "logps/chosen": -29.544757843017578, "logps/rejected": -45.139732360839844, "loss": 0.3303, "rewards/accuracies": 0.875, "rewards/chosen": -0.12963010370731354, "rewards/margins": 1.9951367378234863, "rewards/rejected": -2.1247665882110596, "step": 65 }, { "epoch": 1.11864406779661, "grad_norm": 42.89070654695283, "learning_rate": 4.977798658782351e-07, "logits/chosen": -1.3436741828918457, "logits/rejected": -1.2738423347473145, "logps/chosen": -29.10391616821289, "logps/rejected": -42.66758346557617, "loss": 0.349, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23598036170005798, "rewards/margins": 1.5764625072479248, "rewards/rejected": -1.8124428987503052, "step": 66 }, { "epoch": 1.1355932203389831, "grad_norm": 41.22280324142371, "learning_rate": 4.975267508651491e-07, "logits/chosen": -1.202622890472412, "logits/rejected": -1.047250509262085, "logps/chosen": -27.090843200683594, "logps/rejected": -30.239742279052734, "loss": 0.3463, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1256638914346695, "rewards/margins": 1.4247705936431885, "rewards/rejected": -1.5504344701766968, "step": 67 }, { "epoch": 1.152542372881356, "grad_norm": 41.21696455893867, "learning_rate": 4.97260046830541e-07, "logits/chosen": -1.166504144668579, "logits/rejected": -0.8773643970489502, "logps/chosen": -22.584096908569336, "logps/rejected": -42.063133239746094, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": -0.07130265235900879, "rewards/margins": 1.9287859201431274, "rewards/rejected": -2.000088691711426, "step": 68 }, { "epoch": 1.1694915254237288, "grad_norm": 43.16486538306169, "learning_rate": 4.969797684162497e-07, "logits/chosen": -1.4693577289581299, "logits/rejected": -1.3569204807281494, "logps/chosen": -25.79123878479004, "logps/rejected": -37.476409912109375, "loss": 0.3706, "rewards/accuracies": 0.875, "rewards/chosen": -0.10858143866062164, "rewards/margins": 1.385354995727539, "rewards/rejected": -1.4939366579055786, "step": 69 }, { "epoch": 1.1864406779661016, "grad_norm": 42.7942644360943, "learning_rate": 4.966859310093372e-07, "logits/chosen": -1.168703317642212, "logits/rejected": -1.0707366466522217, "logps/chosen": -30.724082946777344, "logps/rejected": -41.230712890625, "loss": 0.354, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13678063452243805, "rewards/margins": 1.567689061164856, "rewards/rejected": -1.7044697999954224, "step": 70 }, { "epoch": 1.2033898305084745, "grad_norm": 39.25048654439285, "learning_rate": 4.96378550741243e-07, "logits/chosen": -1.3091105222702026, "logits/rejected": -1.2656583786010742, "logps/chosen": -30.91085433959961, "logps/rejected": -41.49336624145508, "loss": 0.307, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22276180982589722, "rewards/margins": 1.757472276687622, "rewards/rejected": -1.980234146118164, "step": 71 }, { "epoch": 1.2203389830508475, "grad_norm": 40.23010655586107, "learning_rate": 4.960576444868992e-07, "logits/chosen": -1.4617582559585571, "logits/rejected": -1.4617631435394287, "logps/chosen": -28.715673446655273, "logps/rejected": -48.552024841308594, "loss": 0.317, "rewards/accuracies": 0.875, "rewards/chosen": -0.21487601101398468, "rewards/margins": 2.4703986644744873, "rewards/rejected": -2.685274362564087, "step": 72 }, { "epoch": 1.2372881355932204, "grad_norm": 45.05758163705817, "learning_rate": 4.957232298638035e-07, "logits/chosen": -1.3103129863739014, "logits/rejected": -1.297128677368164, "logps/chosen": -29.75772476196289, "logps/rejected": -42.82856750488281, "loss": 0.3574, "rewards/accuracies": 0.9375, "rewards/chosen": -0.16225725412368774, "rewards/margins": 1.7887846231460571, "rewards/rejected": -1.9510418176651, "step": 73 }, { "epoch": 1.2542372881355932, "grad_norm": 40.337686604932905, "learning_rate": 4.953753252310525e-07, "logits/chosen": -1.351475477218628, "logits/rejected": -1.2956455945968628, "logps/chosen": -30.769058227539062, "logps/rejected": -37.68085479736328, "loss": 0.3211, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22842487692832947, "rewards/margins": 1.4851303100585938, "rewards/rejected": -1.713555097579956, "step": 74 }, { "epoch": 1.271186440677966, "grad_norm": 41.87339924313547, "learning_rate": 4.950139496883334e-07, "logits/chosen": -1.1046594381332397, "logits/rejected": -1.083458662033081, "logps/chosen": -23.34552764892578, "logps/rejected": -33.603294372558594, "loss": 0.2996, "rewards/accuracies": 0.875, "rewards/chosen": -0.22441284358501434, "rewards/margins": 1.64926016330719, "rewards/rejected": -1.8736729621887207, "step": 75 }, { "epoch": 1.288135593220339, "grad_norm": 40.59281009638228, "learning_rate": 4.94639123074876e-07, "logits/chosen": -1.5633933544158936, "logits/rejected": -1.3900222778320312, "logps/chosen": -28.575279235839844, "logps/rejected": -40.71543502807617, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": -0.23141731321811676, "rewards/margins": 1.784515619277954, "rewards/rejected": -2.015933036804199, "step": 76 }, { "epoch": 1.305084745762712, "grad_norm": 40.512541982486866, "learning_rate": 4.942508659683626e-07, "logits/chosen": -1.2337239980697632, "logits/rejected": -1.1204659938812256, "logps/chosen": -37.800514221191406, "logps/rejected": -57.468048095703125, "loss": 0.3173, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0504547655582428, "rewards/margins": 2.774332046508789, "rewards/rejected": -2.824786901473999, "step": 77 }, { "epoch": 1.3220338983050848, "grad_norm": 44.23393681805476, "learning_rate": 4.938491996837994e-07, "logits/chosen": -1.1162344217300415, "logits/rejected": -1.1677778959274292, "logps/chosen": -21.936918258666992, "logps/rejected": -36.89491653442383, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": -0.04354112595319748, "rewards/margins": 1.7271283864974976, "rewards/rejected": -1.7706694602966309, "step": 78 }, { "epoch": 1.3389830508474576, "grad_norm": 41.690947911283914, "learning_rate": 4.934341462723454e-07, "logits/chosen": -1.3248709440231323, "logits/rejected": -1.2905504703521729, "logps/chosen": -22.56363868713379, "logps/rejected": -36.43506622314453, "loss": 0.3275, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11233043670654297, "rewards/margins": 1.878725528717041, "rewards/rejected": -1.991055965423584, "step": 79 }, { "epoch": 1.3559322033898304, "grad_norm": 40.04810360539708, "learning_rate": 4.930057285201027e-07, "logits/chosen": -1.1112794876098633, "logits/rejected": -1.0777575969696045, "logps/chosen": -24.657052993774414, "logps/rejected": -39.67708969116211, "loss": 0.3229, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13391758501529694, "rewards/margins": 2.0548856258392334, "rewards/rejected": -2.18880295753479, "step": 80 }, { "epoch": 1.3728813559322033, "grad_norm": 34.996138065577895, "learning_rate": 4.925639699468645e-07, "logits/chosen": -1.2954164743423462, "logits/rejected": -1.2493962049484253, "logps/chosen": -22.976837158203125, "logps/rejected": -30.558788299560547, "loss": 0.2742, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00974225252866745, "rewards/margins": 1.3260502815246582, "rewards/rejected": -1.3357923030853271, "step": 81 }, { "epoch": 1.3898305084745763, "grad_norm": 37.19153270519795, "learning_rate": 4.921088948048246e-07, "logits/chosen": -1.0799801349639893, "logits/rejected": -1.029328465461731, "logps/chosen": -21.34770965576172, "logps/rejected": -27.523284912109375, "loss": 0.3197, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0038819462060928345, "rewards/margins": 1.294053316116333, "rewards/rejected": -1.2979353666305542, "step": 82 }, { "epoch": 1.4067796610169492, "grad_norm": 38.782824257992516, "learning_rate": 4.916405280772462e-07, "logits/chosen": -1.2136616706848145, "logits/rejected": -1.1575380563735962, "logps/chosen": -34.565452575683594, "logps/rejected": -37.70560836791992, "loss": 0.2773, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11810773611068726, "rewards/margins": 1.5449273586273193, "rewards/rejected": -1.6630350351333618, "step": 83 }, { "epoch": 1.423728813559322, "grad_norm": 46.057268803202, "learning_rate": 4.911588954770896e-07, "logits/chosen": -1.323722004890442, "logits/rejected": -1.2598729133605957, "logps/chosen": -28.280380249023438, "logps/rejected": -35.962127685546875, "loss": 0.3575, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18131475150585175, "rewards/margins": 1.3694193363189697, "rewards/rejected": -1.5507341623306274, "step": 84 }, { "epoch": 1.4406779661016949, "grad_norm": 39.078212479916175, "learning_rate": 4.906640234456011e-07, "logits/chosen": -1.254352331161499, "logits/rejected": -1.220250129699707, "logps/chosen": -22.146629333496094, "logps/rejected": -34.24107360839844, "loss": 0.3198, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16238263249397278, "rewards/margins": 2.182830810546875, "rewards/rejected": -2.3452136516571045, "step": 85 }, { "epoch": 1.457627118644068, "grad_norm": 40.017911222526, "learning_rate": 4.90155939150861e-07, "logits/chosen": -1.2662581205368042, "logits/rejected": -1.1520745754241943, "logps/chosen": -28.800233840942383, "logps/rejected": -45.41657638549805, "loss": 0.2885, "rewards/accuracies": 0.875, "rewards/chosen": -0.10078342258930206, "rewards/margins": 2.74984073638916, "rewards/rejected": -2.8506245613098145, "step": 86 }, { "epoch": 1.4745762711864407, "grad_norm": 38.422387432652926, "learning_rate": 4.896346704862927e-07, "logits/chosen": -1.1133819818496704, "logits/rejected": -1.1244077682495117, "logps/chosen": -26.18191909790039, "logps/rejected": -38.19125747680664, "loss": 0.2784, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2561756372451782, "rewards/margins": 2.0794429779052734, "rewards/rejected": -2.335618734359741, "step": 87 }, { "epoch": 1.4915254237288136, "grad_norm": 43.19306665638686, "learning_rate": 4.891002460691305e-07, "logits/chosen": -1.1532127857208252, "logits/rejected": -1.0514928102493286, "logps/chosen": -33.20591735839844, "logps/rejected": -44.78691101074219, "loss": 0.2942, "rewards/accuracies": 0.875, "rewards/chosen": -0.31594839692115784, "rewards/margins": 2.774695634841919, "rewards/rejected": -3.090643882751465, "step": 88 }, { "epoch": 1.5084745762711864, "grad_norm": 37.55592044811218, "learning_rate": 4.885526952388497e-07, "logits/chosen": -1.5443884134292603, "logits/rejected": -1.491202473640442, "logps/chosen": -29.413482666015625, "logps/rejected": -44.91068649291992, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": -0.08295662701129913, "rewards/margins": 2.9935202598571777, "rewards/rejected": -3.076476812362671, "step": 89 }, { "epoch": 1.5254237288135593, "grad_norm": 39.41480347028368, "learning_rate": 4.879920480555549e-07, "logits/chosen": -1.2505245208740234, "logits/rejected": -1.1800575256347656, "logps/chosen": -34.517356872558594, "logps/rejected": -54.461875915527344, "loss": 0.3016, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2518163323402405, "rewards/margins": 2.4639229774475098, "rewards/rejected": -2.7157392501831055, "step": 90 }, { "epoch": 1.542372881355932, "grad_norm": 41.233244462870424, "learning_rate": 4.874183352983297e-07, "logits/chosen": -1.2128483057022095, "logits/rejected": -1.1637994050979614, "logps/chosen": -26.66428565979004, "logps/rejected": -30.274539947509766, "loss": 0.3237, "rewards/accuracies": 0.8125, "rewards/chosen": 0.016297563910484314, "rewards/margins": 1.5082069635391235, "rewards/rejected": -1.4919092655181885, "step": 91 }, { "epoch": 1.559322033898305, "grad_norm": 33.074028930414165, "learning_rate": 4.868315884635478e-07, "logits/chosen": -1.3729506731033325, "logits/rejected": -1.363387107849121, "logps/chosen": -32.76537322998047, "logps/rejected": -40.88609313964844, "loss": 0.2209, "rewards/accuracies": 0.875, "rewards/chosen": -0.49252671003341675, "rewards/margins": 1.6107347011566162, "rewards/rejected": -2.1032614707946777, "step": 92 }, { "epoch": 1.576271186440678, "grad_norm": 44.45767411612402, "learning_rate": 4.862318397631433e-07, "logits/chosen": -1.290561318397522, "logits/rejected": -1.2587978839874268, "logps/chosen": -27.68136215209961, "logps/rejected": -37.53421401977539, "loss": 0.3135, "rewards/accuracies": 0.9375, "rewards/chosen": 0.045494288206100464, "rewards/margins": 2.3608152866363525, "rewards/rejected": -2.3153209686279297, "step": 93 }, { "epoch": 1.5932203389830508, "grad_norm": 38.75459636795706, "learning_rate": 4.856191221228422e-07, "logits/chosen": -1.4851233959197998, "logits/rejected": -1.3868330717086792, "logps/chosen": -25.41338348388672, "logps/rejected": -50.90999984741211, "loss": 0.3055, "rewards/accuracies": 0.9375, "rewards/chosen": 0.057565659284591675, "rewards/margins": 3.016381025314331, "rewards/rejected": -2.958815336227417, "step": 94 }, { "epoch": 1.6101694915254239, "grad_norm": 39.42511408680881, "learning_rate": 4.84993469180355e-07, "logits/chosen": -1.5986624956130981, "logits/rejected": -1.5039199590682983, "logps/chosen": -23.876976013183594, "logps/rejected": -37.30643081665039, "loss": 0.2918, "rewards/accuracies": 0.875, "rewards/chosen": 0.1558499038219452, "rewards/margins": 2.534642219543457, "rewards/rejected": -2.3787922859191895, "step": 95 }, { "epoch": 1.6271186440677967, "grad_norm": 40.62251102213997, "learning_rate": 4.843549152835302e-07, "logits/chosen": -1.4016410112380981, "logits/rejected": -1.330747127532959, "logps/chosen": -35.39458465576172, "logps/rejected": -41.057533264160156, "loss": 0.2683, "rewards/accuracies": 0.75, "rewards/chosen": -0.30797258019447327, "rewards/margins": 1.5779187679290771, "rewards/rejected": -1.8858911991119385, "step": 96 }, { "epoch": 1.6440677966101696, "grad_norm": 37.47292375573573, "learning_rate": 4.837034954884681e-07, "logits/chosen": -1.2742365598678589, "logits/rejected": -1.16330885887146, "logps/chosen": -19.86534881591797, "logps/rejected": -37.20149230957031, "loss": 0.3136, "rewards/accuracies": 1.0, "rewards/chosen": -0.16833439469337463, "rewards/margins": 2.6455376148223877, "rewards/rejected": -2.8138718605041504, "step": 97 }, { "epoch": 1.6610169491525424, "grad_norm": 34.360879838757626, "learning_rate": 4.83039245557597e-07, "logits/chosen": -1.4751869440078735, "logits/rejected": -1.3903852701187134, "logps/chosen": -27.246294021606445, "logps/rejected": -37.46033477783203, "loss": 0.2373, "rewards/accuracies": 1.0, "rewards/chosen": 0.015960171818733215, "rewards/margins": 1.7550795078277588, "rewards/rejected": -1.739119291305542, "step": 98 }, { "epoch": 1.6779661016949152, "grad_norm": 36.086860100703234, "learning_rate": 4.823622019577088e-07, "logits/chosen": -1.5154139995574951, "logits/rejected": -1.435178279876709, "logps/chosen": -29.518024444580078, "logps/rejected": -34.60513687133789, "loss": 0.2638, "rewards/accuracies": 1.0, "rewards/chosen": 0.023779883980751038, "rewards/margins": 1.6297731399536133, "rewards/rejected": -1.6059932708740234, "step": 99 }, { "epoch": 1.694915254237288, "grad_norm": 44.37484563053148, "learning_rate": 4.816724018579583e-07, "logits/chosen": -1.3004710674285889, "logits/rejected": -1.1905782222747803, "logps/chosen": -44.38223648071289, "logps/rejected": -44.088531494140625, "loss": 0.3148, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6569697856903076, "rewards/margins": 1.929584264755249, "rewards/rejected": -2.5865538120269775, "step": 100 }, { "epoch": 1.711864406779661, "grad_norm": 37.67689426318597, "learning_rate": 4.809698831278217e-07, "logits/chosen": -1.1910172700881958, "logits/rejected": -1.085993766784668, "logps/chosen": -25.45205307006836, "logps/rejected": -38.646697998046875, "loss": 0.2924, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10635387897491455, "rewards/margins": 2.1210451126098633, "rewards/rejected": -2.2273988723754883, "step": 101 }, { "epoch": 1.7288135593220337, "grad_norm": 44.53554229157193, "learning_rate": 4.802546843350177e-07, "logits/chosen": -1.3502918481826782, "logits/rejected": -1.3793635368347168, "logps/chosen": -31.245201110839844, "logps/rejected": -38.174922943115234, "loss": 0.308, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04055324196815491, "rewards/margins": 2.123584032058716, "rewards/rejected": -2.164137363433838, "step": 102 }, { "epoch": 1.7457627118644068, "grad_norm": 33.6410465461232, "learning_rate": 4.795268447433906e-07, "logits/chosen": -1.5270867347717285, "logits/rejected": -1.5600392818450928, "logps/chosen": -26.281536102294922, "logps/rejected": -40.25182342529297, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": -0.38495194911956787, "rewards/margins": 2.3371963500976562, "rewards/rejected": -2.7221484184265137, "step": 103 }, { "epoch": 1.7627118644067796, "grad_norm": 37.86791531616575, "learning_rate": 4.787864043107546e-07, "logits/chosen": -1.1703252792358398, "logits/rejected": -1.2337673902511597, "logps/chosen": -28.018447875976562, "logps/rejected": -26.48525619506836, "loss": 0.2904, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1365218162536621, "rewards/margins": 1.0608235597610474, "rewards/rejected": -1.197345495223999, "step": 104 }, { "epoch": 1.7796610169491527, "grad_norm": 39.89897092814635, "learning_rate": 4.780334036866996e-07, "logits/chosen": -1.4013991355895996, "logits/rejected": -1.3012598752975464, "logps/chosen": -30.404062271118164, "logps/rejected": -50.464717864990234, "loss": 0.2444, "rewards/accuracies": 0.9375, "rewards/chosen": -0.36456581950187683, "rewards/margins": 2.707120895385742, "rewards/rejected": -3.0716869831085205, "step": 105 }, { "epoch": 1.7966101694915255, "grad_norm": 38.70079485674391, "learning_rate": 4.772678842103605e-07, "logits/chosen": -1.396234393119812, "logits/rejected": -1.2844393253326416, "logps/chosen": -30.791566848754883, "logps/rejected": -41.80755615234375, "loss": 0.2245, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2901880741119385, "rewards/margins": 2.5994958877563477, "rewards/rejected": -2.8896842002868652, "step": 106 }, { "epoch": 1.8135593220338984, "grad_norm": 31.615929528424363, "learning_rate": 4.764898879081467e-07, "logits/chosen": -1.3441611528396606, "logits/rejected": -1.3465229272842407, "logps/chosen": -25.936952590942383, "logps/rejected": -44.00107192993164, "loss": 0.2261, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0497933030128479, "rewards/margins": 1.7863792181015015, "rewards/rejected": -1.7365858554840088, "step": 107 }, { "epoch": 1.8305084745762712, "grad_norm": 36.20936022186147, "learning_rate": 4.7569945749143586e-07, "logits/chosen": -1.3387551307678223, "logits/rejected": -1.329017996788025, "logps/chosen": -26.50401496887207, "logps/rejected": -48.64241027832031, "loss": 0.2448, "rewards/accuracies": 1.0, "rewards/chosen": -0.1088000237941742, "rewards/margins": 2.8309779167175293, "rewards/rejected": -2.9397776126861572, "step": 108 }, { "epoch": 1.847457627118644, "grad_norm": 36.766498225229036, "learning_rate": 4.748966363542285e-07, "logits/chosen": -1.2870004177093506, "logits/rejected": -1.2100244760513306, "logps/chosen": -24.399860382080078, "logps/rejected": -37.19389343261719, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 0.04130461812019348, "rewards/margins": 2.3760554790496826, "rewards/rejected": -2.3347508907318115, "step": 109 }, { "epoch": 1.8644067796610169, "grad_norm": 37.505379590427104, "learning_rate": 4.7408146857076563e-07, "logits/chosen": -1.2373127937316895, "logits/rejected": -1.1650217771530151, "logps/chosen": -39.90864562988281, "logps/rejected": -40.89439392089844, "loss": 0.2646, "rewards/accuracies": 0.75, "rewards/chosen": 0.0051874518394470215, "rewards/margins": 1.6469688415527344, "rewards/rejected": -1.6417814493179321, "step": 110 }, { "epoch": 1.8813559322033897, "grad_norm": 35.644427389294876, "learning_rate": 4.732539988931096e-07, "logits/chosen": -1.3618909120559692, "logits/rejected": -1.4531258344650269, "logps/chosen": -26.969541549682617, "logps/rejected": -42.32440185546875, "loss": 0.2208, "rewards/accuracies": 0.875, "rewards/chosen": -0.25816747546195984, "rewards/margins": 2.6419811248779297, "rewards/rejected": -2.900148391723633, "step": 111 }, { "epoch": 1.8983050847457628, "grad_norm": 39.403893355046144, "learning_rate": 4.7241427274868683e-07, "logits/chosen": -1.392714262008667, "logits/rejected": -1.2956047058105469, "logps/chosen": -25.744760513305664, "logps/rejected": -43.44940185546875, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": -0.19541674852371216, "rewards/margins": 2.4149329662323, "rewards/rejected": -2.610349655151367, "step": 112 }, { "epoch": 1.9152542372881356, "grad_norm": 42.63537438222289, "learning_rate": 4.7156233623779383e-07, "logits/chosen": -1.3376697301864624, "logits/rejected": -1.3315945863723755, "logps/chosen": -31.033275604248047, "logps/rejected": -34.59294128417969, "loss": 0.262, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01679442822933197, "rewards/margins": 1.9989566802978516, "rewards/rejected": -2.015751361846924, "step": 113 }, { "epoch": 1.9322033898305084, "grad_norm": 37.518445889823454, "learning_rate": 4.7069823613106687e-07, "logits/chosen": -1.3494609594345093, "logits/rejected": -1.4168843030929565, "logps/chosen": -37.80765914916992, "logps/rejected": -50.918888092041016, "loss": 0.2293, "rewards/accuracies": 0.875, "rewards/chosen": -0.5224637985229492, "rewards/margins": 3.094416618347168, "rewards/rejected": -3.6168806552886963, "step": 114 }, { "epoch": 1.9491525423728815, "grad_norm": 42.51900093030587, "learning_rate": 4.698220198669136e-07, "logits/chosen": -1.646604299545288, "logits/rejected": -1.517140507698059, "logps/chosen": -27.87681770324707, "logps/rejected": -42.672096252441406, "loss": 0.2965, "rewards/accuracies": 1.0, "rewards/chosen": -0.16765162348747253, "rewards/margins": 2.965381145477295, "rewards/rejected": -3.133033275604248, "step": 115 }, { "epoch": 1.9661016949152543, "grad_norm": 33.55277119465526, "learning_rate": 4.6893373554890917e-07, "logits/chosen": -1.5241327285766602, "logits/rejected": -1.2908215522766113, "logps/chosen": -33.22343444824219, "logps/rejected": -44.93342208862305, "loss": 0.226, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36639106273651123, "rewards/margins": 2.9403634071350098, "rewards/rejected": -3.3067543506622314, "step": 116 }, { "epoch": 1.9830508474576272, "grad_norm": 36.89279198796042, "learning_rate": 4.6803343194315546e-07, "logits/chosen": -1.3500347137451172, "logits/rejected": -1.3750879764556885, "logps/chosen": -33.52631759643555, "logps/rejected": -46.71184539794922, "loss": 0.2323, "rewards/accuracies": 1.0, "rewards/chosen": -0.17364656925201416, "rewards/margins": 2.6467905044555664, "rewards/rejected": -2.820436954498291, "step": 117 }, { "epoch": 2.0, "grad_norm": 29.673763942339104, "learning_rate": 4.6712115847560353e-07, "logits/chosen": -1.1543025970458984, "logits/rejected": -1.1016186475753784, "logps/chosen": -27.275094985961914, "logps/rejected": -35.34591293334961, "loss": 0.2172, "rewards/accuracies": 0.875, "rewards/chosen": 0.010025471448898315, "rewards/margins": 2.681260824203491, "rewards/rejected": -2.6712355613708496, "step": 118 }, { "epoch": 2.016949152542373, "grad_norm": 18.754455801142957, "learning_rate": 4.661969652293402e-07, "logits/chosen": -1.3498600721359253, "logits/rejected": -1.2991336584091187, "logps/chosen": -24.873241424560547, "logps/rejected": -45.2943000793457, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": -0.13535727560520172, "rewards/margins": 3.1097800731658936, "rewards/rejected": -3.2451369762420654, "step": 119 }, { "epoch": 2.0338983050847457, "grad_norm": 22.75173167375844, "learning_rate": 4.652609029418388e-07, "logits/chosen": -1.283535122871399, "logits/rejected": -1.2337732315063477, "logps/chosen": -26.472030639648438, "logps/rejected": -41.27444076538086, "loss": 0.1627, "rewards/accuracies": 1.0, "rewards/chosen": -0.14066343009471893, "rewards/margins": 2.8250813484191895, "rewards/rejected": -2.9657444953918457, "step": 120 }, { "epoch": 2.0508474576271185, "grad_norm": 20.99893381194414, "learning_rate": 4.6431302300217366e-07, "logits/chosen": -1.4252784252166748, "logits/rejected": -1.421356439590454, "logps/chosen": -31.787378311157227, "logps/rejected": -33.3957405090332, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 0.40507441759109497, "rewards/margins": 1.8697203397750854, "rewards/rejected": -1.4646459817886353, "step": 121 }, { "epoch": 2.0677966101694913, "grad_norm": 18.786273208958082, "learning_rate": 4.633533774481987e-07, "logits/chosen": -1.2753486633300781, "logits/rejected": -1.1164805889129639, "logps/chosen": -31.65105438232422, "logps/rejected": -46.21112823486328, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": -0.07824045419692993, "rewards/margins": 3.2129859924316406, "rewards/rejected": -3.291226387023926, "step": 122 }, { "epoch": 2.084745762711864, "grad_norm": 20.917039008362774, "learning_rate": 4.623820189636905e-07, "logits/chosen": -1.3903650045394897, "logits/rejected": -1.3099316358566284, "logps/chosen": -29.19454002380371, "logps/rejected": -46.168941497802734, "loss": 0.1536, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4621618092060089, "rewards/margins": 2.845735788345337, "rewards/rejected": -2.3835740089416504, "step": 123 }, { "epoch": 2.1016949152542375, "grad_norm": 20.45027445287867, "learning_rate": 4.613990008754565e-07, "logits/chosen": -1.3704748153686523, "logits/rejected": -1.2672369480133057, "logps/chosen": -34.432945251464844, "logps/rejected": -39.66848373413086, "loss": 0.1588, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1847638189792633, "rewards/margins": 2.653686761856079, "rewards/rejected": -2.4689230918884277, "step": 124 }, { "epoch": 2.1186440677966103, "grad_norm": 20.088146504282733, "learning_rate": 4.60404377150407e-07, "logits/chosen": -1.2935415506362915, "logits/rejected": -1.2802854776382446, "logps/chosen": -26.330522537231445, "logps/rejected": -41.638004302978516, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": -0.15763472020626068, "rewards/margins": 2.74660587310791, "rewards/rejected": -2.904240131378174, "step": 125 }, { "epoch": 2.135593220338983, "grad_norm": 23.38109010648461, "learning_rate": 4.593982023925925e-07, "logits/chosen": -1.1731081008911133, "logits/rejected": -1.0896246433258057, "logps/chosen": -33.23085021972656, "logps/rejected": -40.83133316040039, "loss": 0.1689, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19674523174762726, "rewards/margins": 2.6324305534362793, "rewards/rejected": -2.8291759490966797, "step": 126 }, { "epoch": 2.152542372881356, "grad_norm": 19.53451419738541, "learning_rate": 4.58380531840206e-07, "logits/chosen": -1.3832257986068726, "logits/rejected": -1.1368017196655273, "logps/chosen": -31.488880157470703, "logps/rejected": -37.851097106933594, "loss": 0.1448, "rewards/accuracies": 1.0, "rewards/chosen": 0.38223376870155334, "rewards/margins": 3.1422624588012695, "rewards/rejected": -2.76002836227417, "step": 127 }, { "epoch": 2.169491525423729, "grad_norm": 19.597219846897513, "learning_rate": 4.5735142136255045e-07, "logits/chosen": -1.3937269449234009, "logits/rejected": -1.3436622619628906, "logps/chosen": -28.44049644470215, "logps/rejected": -49.33514404296875, "loss": 0.1489, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09995569288730621, "rewards/margins": 3.583566665649414, "rewards/rejected": -3.4836111068725586, "step": 128 }, { "epoch": 2.1864406779661016, "grad_norm": 18.394803732689535, "learning_rate": 4.5631092745697164e-07, "logits/chosen": -1.1625999212265015, "logits/rejected": -1.1585655212402344, "logps/chosen": -29.666309356689453, "logps/rejected": -41.36751174926758, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 0.2516571879386902, "rewards/margins": 2.918815851211548, "rewards/rejected": -2.667158603668213, "step": 129 }, { "epoch": 2.2033898305084745, "grad_norm": 19.452927999271022, "learning_rate": 4.5525910724575645e-07, "logits/chosen": -1.298140048980713, "logits/rejected": -1.238019585609436, "logps/chosen": -30.736804962158203, "logps/rejected": -49.28110885620117, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 0.11459851264953613, "rewards/margins": 4.15794563293457, "rewards/rejected": -4.043347358703613, "step": 130 }, { "epoch": 2.2203389830508473, "grad_norm": 16.937231864925646, "learning_rate": 4.54196018472997e-07, "logits/chosen": -1.2767530679702759, "logits/rejected": -1.1214213371276855, "logps/chosen": -27.268341064453125, "logps/rejected": -52.620338439941406, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 0.010926365852355957, "rewards/margins": 4.242362976074219, "rewards/rejected": -4.231436729431152, "step": 131 }, { "epoch": 2.23728813559322, "grad_norm": 19.91080195782553, "learning_rate": 4.5312171950142033e-07, "logits/chosen": -1.443523645401001, "logits/rejected": -1.3692708015441895, "logps/chosen": -23.356483459472656, "logps/rejected": -39.884552001953125, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": 0.09224121272563934, "rewards/margins": 3.541024923324585, "rewards/rejected": -3.4487838745117188, "step": 132 }, { "epoch": 2.2542372881355934, "grad_norm": 20.224862204629513, "learning_rate": 4.520362693091845e-07, "logits/chosen": -1.3375797271728516, "logits/rejected": -1.2925443649291992, "logps/chosen": -24.31032371520996, "logps/rejected": -31.916282653808594, "loss": 0.1541, "rewards/accuracies": 0.875, "rewards/chosen": 0.05450062453746796, "rewards/margins": 1.724971055984497, "rewards/rejected": -1.6704705953598022, "step": 133 }, { "epoch": 2.2711864406779663, "grad_norm": 19.256180369703056, "learning_rate": 4.5093972748664087e-07, "logits/chosen": -1.3231240510940552, "logits/rejected": -1.2512582540512085, "logps/chosen": -35.62217330932617, "logps/rejected": -48.332237243652344, "loss": 0.1134, "rewards/accuracies": 0.9375, "rewards/chosen": -0.26731669902801514, "rewards/margins": 3.2187671661376953, "rewards/rejected": -3.486083745956421, "step": 134 }, { "epoch": 2.288135593220339, "grad_norm": 17.92618734739177, "learning_rate": 4.498321542330622e-07, "logits/chosen": -1.4986979961395264, "logits/rejected": -1.4225276708602905, "logps/chosen": -26.092084884643555, "logps/rejected": -52.08736038208008, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": -0.04193298518657684, "rewards/margins": 3.300752639770508, "rewards/rejected": -3.3426852226257324, "step": 135 }, { "epoch": 2.305084745762712, "grad_norm": 19.524384947131523, "learning_rate": 4.4871361035333833e-07, "logits/chosen": -1.302308201789856, "logits/rejected": -1.306333065032959, "logps/chosen": -25.217451095581055, "logps/rejected": -43.95090103149414, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": 0.24911952018737793, "rewards/margins": 3.123009204864502, "rewards/rejected": -2.873889684677124, "step": 136 }, { "epoch": 2.3220338983050848, "grad_norm": 21.654262830345974, "learning_rate": 4.475841572546374e-07, "logits/chosen": -1.333143711090088, "logits/rejected": -1.1934046745300293, "logps/chosen": -29.572952270507812, "logps/rejected": -42.41865539550781, "loss": 0.1608, "rewards/accuracies": 1.0, "rewards/chosen": -0.2631007432937622, "rewards/margins": 2.9843344688415527, "rewards/rejected": -3.2474350929260254, "step": 137 }, { "epoch": 2.3389830508474576, "grad_norm": 18.2902071706345, "learning_rate": 4.464438569430353e-07, "logits/chosen": -1.4010558128356934, "logits/rejected": -1.3596662282943726, "logps/chosen": -27.541452407836914, "logps/rejected": -37.1332893371582, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 0.17605237662792206, "rewards/margins": 2.4087536334991455, "rewards/rejected": -2.232701301574707, "step": 138 }, { "epoch": 2.3559322033898304, "grad_norm": 20.193211619680806, "learning_rate": 4.452927720201112e-07, "logits/chosen": -1.1419758796691895, "logits/rejected": -1.2467293739318848, "logps/chosen": -24.366790771484375, "logps/rejected": -44.22784423828125, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 0.2860787510871887, "rewards/margins": 3.2284557819366455, "rewards/rejected": -2.9423768520355225, "step": 139 }, { "epoch": 2.3728813559322033, "grad_norm": 16.846442500740498, "learning_rate": 4.441309656795106e-07, "logits/chosen": -1.3143264055252075, "logits/rejected": -1.1588001251220703, "logps/chosen": -25.230070114135742, "logps/rejected": -50.344722747802734, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 0.2081325650215149, "rewards/margins": 3.1986870765686035, "rewards/rejected": -2.990554094314575, "step": 140 }, { "epoch": 2.389830508474576, "grad_norm": 19.91817673644514, "learning_rate": 4.429585017034766e-07, "logits/chosen": -1.355256199836731, "logits/rejected": -1.3759305477142334, "logps/chosen": -30.238567352294922, "logps/rejected": -43.92167663574219, "loss": 0.1377, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17302727699279785, "rewards/margins": 3.385795831680298, "rewards/rejected": -3.558823347091675, "step": 141 }, { "epoch": 2.406779661016949, "grad_norm": 17.91719154232722, "learning_rate": 4.417754444593478e-07, "logits/chosen": -1.452804684638977, "logits/rejected": -1.394730567932129, "logps/chosen": -29.204275131225586, "logps/rejected": -45.112300872802734, "loss": 0.1072, "rewards/accuracies": 1.0, "rewards/chosen": 0.046540290117263794, "rewards/margins": 4.2701945304870605, "rewards/rejected": -4.223654270172119, "step": 142 }, { "epoch": 2.423728813559322, "grad_norm": 17.305416636387555, "learning_rate": 4.4058185889602497e-07, "logits/chosen": -1.375995397567749, "logits/rejected": -1.3006415367126465, "logps/chosen": -17.108240127563477, "logps/rejected": -35.922874450683594, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": 0.28150078654289246, "rewards/margins": 3.295111656188965, "rewards/rejected": -3.01361083984375, "step": 143 }, { "epoch": 2.440677966101695, "grad_norm": 23.303228169178016, "learning_rate": 4.39377810540405e-07, "logits/chosen": -1.4052019119262695, "logits/rejected": -1.4526053667068481, "logps/chosen": -41.58496856689453, "logps/rejected": -40.82295227050781, "loss": 0.1742, "rewards/accuracies": 0.9375, "rewards/chosen": -0.30576008558273315, "rewards/margins": 2.3252716064453125, "rewards/rejected": -2.6310315132141113, "step": 144 }, { "epoch": 2.457627118644068, "grad_norm": 19.121378802553227, "learning_rate": 4.38163365493784e-07, "logits/chosen": -1.5245730876922607, "logits/rejected": -1.401250958442688, "logps/chosen": -34.83244705200195, "logps/rejected": -64.91217041015625, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 0.20400622487068176, "rewards/margins": 4.261959552764893, "rewards/rejected": -4.057952880859375, "step": 145 }, { "epoch": 2.4745762711864407, "grad_norm": 19.034390396216985, "learning_rate": 4.3693859042822774e-07, "logits/chosen": -1.2971210479736328, "logits/rejected": -1.2576966285705566, "logps/chosen": -33.01571273803711, "logps/rejected": -45.87381362915039, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 0.19635039567947388, "rewards/margins": 3.7301583290100098, "rewards/rejected": -3.5338077545166016, "step": 146 }, { "epoch": 2.4915254237288136, "grad_norm": 18.357502635593228, "learning_rate": 4.3570355258291223e-07, "logits/chosen": -1.2477927207946777, "logits/rejected": -1.2029328346252441, "logps/chosen": -28.15603256225586, "logps/rejected": -39.89695358276367, "loss": 0.1305, "rewards/accuracies": 1.0, "rewards/chosen": 0.18680232763290405, "rewards/margins": 2.8544976711273193, "rewards/rejected": -2.6676955223083496, "step": 147 }, { "epoch": 2.5084745762711864, "grad_norm": 13.757078257538927, "learning_rate": 4.344583197604318e-07, "logits/chosen": -1.280619740486145, "logits/rejected": -1.2003180980682373, "logps/chosen": -24.585142135620117, "logps/rejected": -49.09882354736328, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": -0.009844973683357239, "rewards/margins": 3.587639331817627, "rewards/rejected": -3.5974843502044678, "step": 148 }, { "epoch": 2.5254237288135593, "grad_norm": 20.493002641349737, "learning_rate": 4.332029603230767e-07, "logits/chosen": -1.2804765701293945, "logits/rejected": -1.2299047708511353, "logps/chosen": -38.74854278564453, "logps/rejected": -40.60469055175781, "loss": 0.1209, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31216520071029663, "rewards/margins": 3.550785541534424, "rewards/rejected": -3.2386200428009033, "step": 149 }, { "epoch": 2.542372881355932, "grad_norm": 18.873521616647515, "learning_rate": 4.319375431890806e-07, "logits/chosen": -1.5655155181884766, "logits/rejected": -1.5413960218429565, "logps/chosen": -29.58742904663086, "logps/rejected": -38.81654739379883, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": -0.10849936306476593, "rewards/margins": 4.284536361694336, "rewards/rejected": -4.393035888671875, "step": 150 }, { "epoch": 2.559322033898305, "grad_norm": 19.108581899079446, "learning_rate": 4.306621378288364e-07, "logits/chosen": -1.227691650390625, "logits/rejected": -1.206729769706726, "logps/chosen": -29.225234985351562, "logps/rejected": -47.78348159790039, "loss": 0.1189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18846926093101501, "rewards/margins": 3.5759382247924805, "rewards/rejected": -3.3874690532684326, "step": 151 }, { "epoch": 2.576271186440678, "grad_norm": 17.54679094674662, "learning_rate": 4.2937681426108275e-07, "logits/chosen": -1.2980151176452637, "logits/rejected": -1.2792203426361084, "logps/chosen": -29.963348388671875, "logps/rejected": -36.31569290161133, "loss": 0.119, "rewards/accuracies": 0.875, "rewards/chosen": -0.15102972090244293, "rewards/margins": 2.2603495121002197, "rewards/rejected": -2.411379098892212, "step": 152 }, { "epoch": 2.593220338983051, "grad_norm": 20.293681732169816, "learning_rate": 4.280816430490602e-07, "logits/chosen": -1.6751010417938232, "logits/rejected": -1.579331874847412, "logps/chosen": -27.965721130371094, "logps/rejected": -41.831912994384766, "loss": 0.1398, "rewards/accuracies": 0.875, "rewards/chosen": 0.03810068964958191, "rewards/margins": 3.356261968612671, "rewards/rejected": -3.3181610107421875, "step": 153 }, { "epoch": 2.610169491525424, "grad_norm": 18.30214205628312, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -1.2230945825576782, "logits/rejected": -1.3257890939712524, "logps/chosen": -22.84085464477539, "logps/rejected": -30.21630859375, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 0.4078897535800934, "rewards/margins": 2.349480152130127, "rewards/rejected": -1.9415905475616455, "step": 154 }, { "epoch": 2.6271186440677967, "grad_norm": 16.933166318116392, "learning_rate": 4.254620426444053e-07, "logits/chosen": -1.2983089685440063, "logits/rejected": -1.2278701066970825, "logps/chosen": -28.5270938873291, "logps/rejected": -47.79298400878906, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 0.39613279700279236, "rewards/margins": 4.555056571960449, "rewards/rejected": -4.158923625946045, "step": 155 }, { "epoch": 2.6440677966101696, "grad_norm": 16.954026895333676, "learning_rate": 4.2413775726574923e-07, "logits/chosen": -1.4136017560958862, "logits/rejected": -1.308146357536316, "logps/chosen": -26.033409118652344, "logps/rejected": -45.159584045410156, "loss": 0.1098, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3105984926223755, "rewards/margins": 3.2515172958374023, "rewards/rejected": -3.5621156692504883, "step": 156 }, { "epoch": 2.6610169491525424, "grad_norm": 23.018543543838756, "learning_rate": 4.228039118628815e-07, "logits/chosen": -1.3158849477767944, "logits/rejected": -1.1991815567016602, "logps/chosen": -25.21765899658203, "logps/rejected": -40.39363479614258, "loss": 0.1629, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04916132241487503, "rewards/margins": 2.8159801959991455, "rewards/rejected": -2.7668187618255615, "step": 157 }, { "epoch": 2.6779661016949152, "grad_norm": 18.04681645919104, "learning_rate": 4.214605796628526e-07, "logits/chosen": -1.3597509860992432, "logits/rejected": -1.375614881515503, "logps/chosen": -26.287364959716797, "logps/rejected": -41.234405517578125, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 0.18970844149589539, "rewards/margins": 3.569676399230957, "rewards/rejected": -3.3799679279327393, "step": 158 }, { "epoch": 2.694915254237288, "grad_norm": 14.068268306629102, "learning_rate": 4.201078344135306e-07, "logits/chosen": -1.497442364692688, "logits/rejected": -1.4510717391967773, "logps/chosen": -27.79704475402832, "logps/rejected": -43.55573272705078, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": 0.037288397550582886, "rewards/margins": 3.6583452224731445, "rewards/rejected": -3.6210567951202393, "step": 159 }, { "epoch": 2.711864406779661, "grad_norm": 20.29404199604278, "learning_rate": 4.187457503795526e-07, "logits/chosen": -1.6163471937179565, "logits/rejected": -1.5419741868972778, "logps/chosen": -26.61087417602539, "logps/rejected": -30.924564361572266, "loss": 0.1336, "rewards/accuracies": 1.0, "rewards/chosen": 0.18512818217277527, "rewards/margins": 2.91485857963562, "rewards/rejected": -2.7297308444976807, "step": 160 }, { "epoch": 2.7288135593220337, "grad_norm": 14.433423874260647, "learning_rate": 4.173744023382474e-07, "logits/chosen": -1.6092435121536255, "logits/rejected": -1.4594898223876953, "logps/chosen": -24.235836029052734, "logps/rejected": -42.99752426147461, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": -0.11106225848197937, "rewards/margins": 3.523545742034912, "rewards/rejected": -3.6346077919006348, "step": 161 }, { "epoch": 2.7457627118644066, "grad_norm": 18.073004370156372, "learning_rate": 4.159938655755306e-07, "logits/chosen": -1.2371647357940674, "logits/rejected": -1.249834656715393, "logps/chosen": -26.12664031982422, "logps/rejected": -41.84228515625, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 0.07593405246734619, "rewards/margins": 3.008237361907959, "rewards/rejected": -2.9323031902313232, "step": 162 }, { "epoch": 2.7627118644067794, "grad_norm": 17.580682721769048, "learning_rate": 4.1460421588177094e-07, "logits/chosen": -1.3883872032165527, "logits/rejected": -1.284624695777893, "logps/chosen": -24.988061904907227, "logps/rejected": -44.79413986206055, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 0.04829689860343933, "rewards/margins": 4.283046722412109, "rewards/rejected": -4.234749794006348, "step": 163 }, { "epoch": 2.7796610169491527, "grad_norm": 14.683936946298637, "learning_rate": 4.1320552954763037e-07, "logits/chosen": -1.3138792514801025, "logits/rejected": -1.3704159259796143, "logps/chosen": -34.95574188232422, "logps/rejected": -41.10405731201172, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 0.08980777859687805, "rewards/margins": 3.34732723236084, "rewards/rejected": -3.257519483566284, "step": 164 }, { "epoch": 2.7966101694915255, "grad_norm": 19.53577404998919, "learning_rate": 4.117978833598747e-07, "logits/chosen": -1.4069619178771973, "logits/rejected": -1.3315298557281494, "logps/chosen": -38.807586669921875, "logps/rejected": -41.68088912963867, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": -0.32595694065093994, "rewards/margins": 2.6362242698669434, "rewards/rejected": -2.962181329727173, "step": 165 }, { "epoch": 2.8135593220338984, "grad_norm": 15.630747955128056, "learning_rate": 4.1038135459715885e-07, "logits/chosen": -1.3965390920639038, "logits/rejected": -1.3702296018600464, "logps/chosen": -17.21418571472168, "logps/rejected": -33.933189392089844, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 0.29006946086883545, "rewards/margins": 3.5608468055725098, "rewards/rejected": -3.270777463912964, "step": 166 }, { "epoch": 2.830508474576271, "grad_norm": 15.319247967225127, "learning_rate": 4.0895602102578373e-07, "logits/chosen": -1.2587236166000366, "logits/rejected": -1.2784702777862549, "logps/chosen": -33.745662689208984, "logps/rejected": -53.36175537109375, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": -0.5272909998893738, "rewards/margins": 3.7912838459014893, "rewards/rejected": -4.318574905395508, "step": 167 }, { "epoch": 2.847457627118644, "grad_norm": 17.91197797858757, "learning_rate": 4.075219608954278e-07, "logits/chosen": -1.220982313156128, "logits/rejected": -1.119415044784546, "logps/chosen": -25.41081428527832, "logps/rejected": -43.74126434326172, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 0.16412924230098724, "rewards/margins": 3.79860258102417, "rewards/rejected": -3.6344728469848633, "step": 168 }, { "epoch": 2.864406779661017, "grad_norm": 19.216931577152895, "learning_rate": 4.0607925293484997e-07, "logits/chosen": -1.3364936113357544, "logits/rejected": -1.3348599672317505, "logps/chosen": -28.252498626708984, "logps/rejected": -36.41025161743164, "loss": 0.1405, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07597078382968903, "rewards/margins": 2.509685516357422, "rewards/rejected": -2.5856566429138184, "step": 169 }, { "epoch": 2.8813559322033897, "grad_norm": 18.004667351375737, "learning_rate": 4.046279763475687e-07, "logits/chosen": -1.4610190391540527, "logits/rejected": -1.467834234237671, "logps/chosen": -23.43694305419922, "logps/rejected": -40.96953582763672, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": 0.00839678943157196, "rewards/margins": 3.222750186920166, "rewards/rejected": -3.214353561401367, "step": 170 }, { "epoch": 2.898305084745763, "grad_norm": 16.724266176013543, "learning_rate": 4.031682108075128e-07, "logits/chosen": -1.3930025100708008, "logits/rejected": -1.3053603172302246, "logps/chosen": -26.837438583374023, "logps/rejected": -46.042606353759766, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": -0.3490511476993561, "rewards/margins": 3.341874122619629, "rewards/rejected": -3.6909255981445312, "step": 171 }, { "epoch": 2.915254237288136, "grad_norm": 17.501626125315028, "learning_rate": 4.0170003645464835e-07, "logits/chosen": -1.4981375932693481, "logits/rejected": -1.4686487913131714, "logps/chosen": -31.091188430786133, "logps/rejected": -41.67449188232422, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": -0.08511877059936523, "rewards/margins": 3.4050216674804688, "rewards/rejected": -3.490140438079834, "step": 172 }, { "epoch": 2.9322033898305087, "grad_norm": 17.505812920779697, "learning_rate": 4.0022353389057793e-07, "logits/chosen": -1.5352020263671875, "logits/rejected": -1.4121986627578735, "logps/chosen": -29.729156494140625, "logps/rejected": -48.48210525512695, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 0.050699926912784576, "rewards/margins": 3.282029390335083, "rewards/rejected": -3.2313296794891357, "step": 173 }, { "epoch": 2.9491525423728815, "grad_norm": 15.718044631960725, "learning_rate": 3.9873878417411685e-07, "logits/chosen": -1.3283764123916626, "logits/rejected": -1.2386645078659058, "logps/chosen": -31.367496490478516, "logps/rejected": -48.3538818359375, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": -0.02908661961555481, "rewards/margins": 4.318341255187988, "rewards/rejected": -4.3474273681640625, "step": 174 }, { "epoch": 2.9661016949152543, "grad_norm": 19.307114282390735, "learning_rate": 3.97245868816842e-07, "logits/chosen": -1.61316978931427, "logits/rejected": -1.4183673858642578, "logps/chosen": -25.636213302612305, "logps/rejected": -31.219690322875977, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": 0.19882389903068542, "rewards/margins": 2.8216686248779297, "rewards/rejected": -2.622844696044922, "step": 175 }, { "epoch": 2.983050847457627, "grad_norm": 19.852640661420647, "learning_rate": 3.95744869778618e-07, "logits/chosen": -1.3724339008331299, "logits/rejected": -1.2494310140609741, "logps/chosen": -37.323822021484375, "logps/rejected": -48.69821548461914, "loss": 0.1322, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2126758098602295, "rewards/margins": 3.236673355102539, "rewards/rejected": -3.4493494033813477, "step": 176 }, { "epoch": 3.0, "grad_norm": 17.961834616955002, "learning_rate": 3.942358694630967e-07, "logits/chosen": -1.4469400644302368, "logits/rejected": -1.4965747594833374, "logps/chosen": -27.073068618774414, "logps/rejected": -47.31336212158203, "loss": 0.1379, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2840898931026459, "rewards/margins": 3.4035041332244873, "rewards/rejected": -3.687593936920166, "step": 177 }, { "epoch": 3.016949152542373, "grad_norm": 11.404351341704286, "learning_rate": 3.927189507131938e-07, "logits/chosen": -1.340846300125122, "logits/rejected": -1.3331762552261353, "logps/chosen": -29.038299560546875, "logps/rejected": -44.76627731323242, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": -0.44353172183036804, "rewards/margins": 3.546970844268799, "rewards/rejected": -3.99050235748291, "step": 178 }, { "epoch": 3.0338983050847457, "grad_norm": 13.04527280968449, "learning_rate": 3.9119419680654083e-07, "logits/chosen": -1.3644622564315796, "logits/rejected": -1.2482867240905762, "logps/chosen": -29.963117599487305, "logps/rejected": -44.21002960205078, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 0.05338460952043533, "rewards/margins": 3.796813488006592, "rewards/rejected": -3.7434287071228027, "step": 179 }, { "epoch": 3.0508474576271185, "grad_norm": 10.551592699584061, "learning_rate": 3.896616914509131e-07, "logits/chosen": -1.1854358911514282, "logits/rejected": -1.2167768478393555, "logps/chosen": -28.745718002319336, "logps/rejected": -39.30556869506836, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 0.07138124108314514, "rewards/margins": 3.6734557151794434, "rewards/rejected": -3.60207462310791, "step": 180 }, { "epoch": 3.0677966101694913, "grad_norm": 12.613695441723, "learning_rate": 3.881215187796344e-07, "logits/chosen": -1.5086756944656372, "logits/rejected": -1.4750981330871582, "logps/chosen": -24.299089431762695, "logps/rejected": -48.68684005737305, "loss": 0.088, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2250959873199463, "rewards/margins": 5.23423433303833, "rewards/rejected": -5.0091376304626465, "step": 181 }, { "epoch": 3.084745762711864, "grad_norm": 12.720295333105653, "learning_rate": 3.865737633469579e-07, "logits/chosen": -1.456554889678955, "logits/rejected": -1.4655078649520874, "logps/chosen": -34.70986557006836, "logps/rejected": -45.1373405456543, "loss": 0.0913, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5282381176948547, "rewards/margins": 3.892458438873291, "rewards/rejected": -4.420697212219238, "step": 182 }, { "epoch": 3.1016949152542375, "grad_norm": 11.42456365348978, "learning_rate": 3.8501851012342444e-07, "logits/chosen": -1.429445505142212, "logits/rejected": -1.2520623207092285, "logps/chosen": -33.7725830078125, "logps/rejected": -48.29920196533203, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -0.20377352833747864, "rewards/margins": 3.905850887298584, "rewards/rejected": -4.10962438583374, "step": 183 }, { "epoch": 3.1186440677966103, "grad_norm": 11.707964288174598, "learning_rate": 3.834558444911977e-07, "logits/chosen": -1.442047119140625, "logits/rejected": -1.2768933773040771, "logps/chosen": -31.306930541992188, "logps/rejected": -55.08317184448242, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 0.1050567626953125, "rewards/margins": 4.629131317138672, "rewards/rejected": -4.524074554443359, "step": 184 }, { "epoch": 3.135593220338983, "grad_norm": 12.341349720055733, "learning_rate": 3.818858522393763e-07, "logits/chosen": -1.4654240608215332, "logits/rejected": -1.2980097532272339, "logps/chosen": -25.214473724365234, "logps/rejected": -48.52466583251953, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": -0.06900361180305481, "rewards/margins": 3.743170738220215, "rewards/rejected": -3.8121743202209473, "step": 185 }, { "epoch": 3.152542372881356, "grad_norm": 11.013651552230316, "learning_rate": 3.8030861955928496e-07, "logits/chosen": -1.4930305480957031, "logits/rejected": -1.49806809425354, "logps/chosen": -33.961334228515625, "logps/rejected": -60.0030403137207, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": -0.1520581841468811, "rewards/margins": 4.225584983825684, "rewards/rejected": -4.377642631530762, "step": 186 }, { "epoch": 3.169491525423729, "grad_norm": 11.565802520156074, "learning_rate": 3.787242330397418e-07, "logits/chosen": -1.151625633239746, "logits/rejected": -1.229320764541626, "logps/chosen": -27.614501953125, "logps/rejected": -46.39374542236328, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": 0.1534918248653412, "rewards/margins": 3.757617235183716, "rewards/rejected": -3.6041250228881836, "step": 187 }, { "epoch": 3.1864406779661016, "grad_norm": 11.91938723600508, "learning_rate": 3.7713277966230513e-07, "logits/chosen": -1.3814018964767456, "logits/rejected": -1.3967022895812988, "logps/chosen": -39.28313446044922, "logps/rejected": -51.139671325683594, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 0.14791953563690186, "rewards/margins": 3.787137508392334, "rewards/rejected": -3.639218330383301, "step": 188 }, { "epoch": 3.2033898305084745, "grad_norm": 11.7798233218555, "learning_rate": 3.755343467964981e-07, "logits/chosen": -1.4196237325668335, "logits/rejected": -1.281465768814087, "logps/chosen": -30.973888397216797, "logps/rejected": -60.48612976074219, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": -0.04596884548664093, "rewards/margins": 5.157370567321777, "rewards/rejected": -5.203339099884033, "step": 189 }, { "epoch": 3.2203389830508473, "grad_norm": 9.848668366337494, "learning_rate": 3.739290221950123e-07, "logits/chosen": -1.4920192956924438, "logits/rejected": -1.3025527000427246, "logps/chosen": -20.232595443725586, "logps/rejected": -40.525020599365234, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 0.6132769584655762, "rewards/margins": 4.133052349090576, "rewards/rejected": -3.519775390625, "step": 190 }, { "epoch": 3.23728813559322, "grad_norm": 11.384419516356248, "learning_rate": 3.723168939888901e-07, "logits/chosen": -1.4284577369689941, "logits/rejected": -1.3038253784179688, "logps/chosen": -36.025821685791016, "logps/rejected": -47.912906646728516, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": 0.4135657548904419, "rewards/margins": 4.515513896942139, "rewards/rejected": -4.101947784423828, "step": 191 }, { "epoch": 3.2542372881355934, "grad_norm": 12.741788906869417, "learning_rate": 3.7069805068268624e-07, "logits/chosen": -1.190822958946228, "logits/rejected": -1.2386127710342407, "logps/chosen": -25.53938865661621, "logps/rejected": -41.572757720947266, "loss": 0.0968, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21493886411190033, "rewards/margins": 3.464285373687744, "rewards/rejected": -3.6792244911193848, "step": 192 }, { "epoch": 3.2711864406779663, "grad_norm": 11.543121161706487, "learning_rate": 3.6907258114960915e-07, "logits/chosen": -1.44771146774292, "logits/rejected": -1.3371340036392212, "logps/chosen": -22.689008712768555, "logps/rejected": -33.55266571044922, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 0.20511241257190704, "rewards/margins": 3.822577953338623, "rewards/rejected": -3.617465019226074, "step": 193 }, { "epoch": 3.288135593220339, "grad_norm": 12.699281503282757, "learning_rate": 3.6744057462664194e-07, "logits/chosen": -1.2974333763122559, "logits/rejected": -1.2633615732192993, "logps/chosen": -35.57712936401367, "logps/rejected": -41.26565170288086, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": -0.1042805016040802, "rewards/margins": 3.9843153953552246, "rewards/rejected": -4.088595867156982, "step": 194 }, { "epoch": 3.305084745762712, "grad_norm": 11.346679553734594, "learning_rate": 3.658021207096432e-07, "logits/chosen": -1.250510811805725, "logits/rejected": -1.2325608730316162, "logps/chosen": -27.381729125976562, "logps/rejected": -39.44231414794922, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 0.14005836844444275, "rewards/margins": 3.225729465484619, "rewards/rejected": -3.0856711864471436, "step": 195 }, { "epoch": 3.3220338983050848, "grad_norm": 11.675211775416328, "learning_rate": 3.6415730934842825e-07, "logits/chosen": -1.4330198764801025, "logits/rejected": -1.2990684509277344, "logps/chosen": -26.091453552246094, "logps/rejected": -37.71536636352539, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 0.5305294990539551, "rewards/margins": 3.7357966899871826, "rewards/rejected": -3.2052674293518066, "step": 196 }, { "epoch": 3.3389830508474576, "grad_norm": 10.595079308232455, "learning_rate": 3.625062308418311e-07, "logits/chosen": -1.4256861209869385, "logits/rejected": -1.3300725221633911, "logps/chosen": -44.14484786987305, "logps/rejected": -49.18852233886719, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": -0.23929768800735474, "rewards/margins": 3.831993818283081, "rewards/rejected": -4.071290969848633, "step": 197 }, { "epoch": 3.3559322033898304, "grad_norm": 10.589818972158676, "learning_rate": 3.6084897583274715e-07, "logits/chosen": -1.5123655796051025, "logits/rejected": -1.4504189491271973, "logps/chosen": -21.742530822753906, "logps/rejected": -43.59285354614258, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 0.08142367005348206, "rewards/margins": 4.206247329711914, "rewards/rejected": -4.124823570251465, "step": 198 }, { "epoch": 3.3728813559322033, "grad_norm": 10.064171955563651, "learning_rate": 3.591856353031566e-07, "logits/chosen": -1.436945915222168, "logits/rejected": -1.475320816040039, "logps/chosen": -22.611244201660156, "logps/rejected": -43.448341369628906, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 0.2055773138999939, "rewards/margins": 4.247138500213623, "rewards/rejected": -4.041561126708984, "step": 199 }, { "epoch": 3.389830508474576, "grad_norm": 9.40202931235145, "learning_rate": 3.5751630056913013e-07, "logits/chosen": -1.5867615938186646, "logits/rejected": -1.559362769126892, "logps/chosen": -27.019208908081055, "logps/rejected": -38.67339324951172, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 0.21613261103630066, "rewards/margins": 3.4724230766296387, "rewards/rejected": -3.2562904357910156, "step": 200 }, { "epoch": 3.406779661016949, "grad_norm": 10.1184235462117, "learning_rate": 3.558410632758153e-07, "logits/chosen": -1.4771666526794434, "logits/rejected": -1.4069215059280396, "logps/chosen": -23.256723403930664, "logps/rejected": -42.76215744018555, "loss": 0.0723, "rewards/accuracies": 1.0, "rewards/chosen": 0.2688101828098297, "rewards/margins": 4.244396686553955, "rewards/rejected": -3.9755868911743164, "step": 201 }, { "epoch": 3.423728813559322, "grad_norm": 13.152928025513987, "learning_rate": 3.5416001539240574e-07, "logits/chosen": -1.5284346342086792, "logits/rejected": -1.4866607189178467, "logps/chosen": -24.796672821044922, "logps/rejected": -51.007835388183594, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": -0.19576720893383026, "rewards/margins": 4.06268835067749, "rewards/rejected": -4.258455753326416, "step": 202 }, { "epoch": 3.440677966101695, "grad_norm": 11.395001548472651, "learning_rate": 3.5247324920709147e-07, "logits/chosen": -1.3313159942626953, "logits/rejected": -1.2498857975006104, "logps/chosen": -30.9143009185791, "logps/rejected": -41.243194580078125, "loss": 0.07, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04568520188331604, "rewards/margins": 3.3133530616760254, "rewards/rejected": -3.267667770385742, "step": 203 }, { "epoch": 3.457627118644068, "grad_norm": 9.512575018928173, "learning_rate": 3.5078085732199307e-07, "logits/chosen": -1.567063331604004, "logits/rejected": -1.4618712663650513, "logps/chosen": -25.394840240478516, "logps/rejected": -49.12550354003906, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": 0.02158541977405548, "rewards/margins": 4.889016628265381, "rewards/rejected": -4.867431163787842, "step": 204 }, { "epoch": 3.4745762711864407, "grad_norm": 11.586078932679674, "learning_rate": 3.490829326480773e-07, "logits/chosen": -1.4906607866287231, "logits/rejected": -1.3863712549209595, "logps/chosen": -31.574989318847656, "logps/rejected": -48.25616455078125, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -0.011842876672744751, "rewards/margins": 4.549272537231445, "rewards/rejected": -4.561115741729736, "step": 205 }, { "epoch": 3.4915254237288136, "grad_norm": 12.547434003386876, "learning_rate": 3.4737956840005684e-07, "logits/chosen": -1.3435784578323364, "logits/rejected": -1.3757704496383667, "logps/chosen": -25.303531646728516, "logps/rejected": -39.15208053588867, "loss": 0.086, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02988174557685852, "rewards/margins": 3.5366082191467285, "rewards/rejected": -3.5067262649536133, "step": 206 }, { "epoch": 3.5084745762711864, "grad_norm": 9.788457273992568, "learning_rate": 3.4567085809127245e-07, "logits/chosen": -1.5033990144729614, "logits/rejected": -1.4766664505004883, "logps/chosen": -26.12259864807129, "logps/rejected": -52.18391418457031, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 0.08881500363349915, "rewards/margins": 4.650575160980225, "rewards/rejected": -4.561759948730469, "step": 207 }, { "epoch": 3.5254237288135593, "grad_norm": 11.249455307964162, "learning_rate": 3.439568955285595e-07, "logits/chosen": -1.600437045097351, "logits/rejected": -1.5185781717300415, "logps/chosen": -20.018888473510742, "logps/rejected": -41.524349212646484, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": -0.08864006400108337, "rewards/margins": 3.782201051712036, "rewards/rejected": -3.8708412647247314, "step": 208 }, { "epoch": 3.542372881355932, "grad_norm": 9.990126506747924, "learning_rate": 3.4223777480709804e-07, "logits/chosen": -1.3552483320236206, "logits/rejected": -1.2629144191741943, "logps/chosen": -21.040348052978516, "logps/rejected": -41.19926071166992, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": -0.106040358543396, "rewards/margins": 4.690126419067383, "rewards/rejected": -4.796167373657227, "step": 209 }, { "epoch": 3.559322033898305, "grad_norm": 9.838092383663449, "learning_rate": 3.405135903052465e-07, "logits/chosen": -1.4207416772842407, "logits/rejected": -1.2667282819747925, "logps/chosen": -30.103904724121094, "logps/rejected": -46.88092803955078, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": -0.3432961106300354, "rewards/margins": 4.319201946258545, "rewards/rejected": -4.6624979972839355, "step": 210 }, { "epoch": 3.576271186440678, "grad_norm": 9.67907653448832, "learning_rate": 3.3878443667936136e-07, "logits/chosen": -1.2791118621826172, "logits/rejected": -1.2004616260528564, "logps/chosen": -40.10679244995117, "logps/rejected": -60.650394439697266, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -0.6684077978134155, "rewards/margins": 4.151851654052734, "rewards/rejected": -4.820259094238281, "step": 211 }, { "epoch": 3.593220338983051, "grad_norm": 9.853143082693608, "learning_rate": 3.3705040885859967e-07, "logits/chosen": -1.5361111164093018, "logits/rejected": -1.4038530588150024, "logps/chosen": -37.90250015258789, "logps/rejected": -45.059024810791016, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030466020107269287, "rewards/margins": 3.964195728302002, "rewards/rejected": -3.9672420024871826, "step": 212 }, { "epoch": 3.610169491525424, "grad_norm": 9.44047901416316, "learning_rate": 3.3531160203970805e-07, "logits/chosen": -1.4581642150878906, "logits/rejected": -1.4039422273635864, "logps/chosen": -30.59682846069336, "logps/rejected": -44.48968505859375, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": -0.1339409351348877, "rewards/margins": 4.1027374267578125, "rewards/rejected": -4.236677646636963, "step": 213 }, { "epoch": 3.6271186440677967, "grad_norm": 11.435287785470077, "learning_rate": 3.3356811168179627e-07, "logits/chosen": -1.2856285572052002, "logits/rejected": -1.177187204360962, "logps/chosen": -28.70745086669922, "logps/rejected": -39.22813415527344, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": 0.319486528635025, "rewards/margins": 4.963109016418457, "rewards/rejected": -4.643622398376465, "step": 214 }, { "epoch": 3.6440677966101696, "grad_norm": 10.041339419341044, "learning_rate": 3.318200335010967e-07, "logits/chosen": -1.720375418663025, "logits/rejected": -1.5207815170288086, "logps/chosen": -26.1734676361084, "logps/rejected": -41.08108901977539, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 0.5092735290527344, "rewards/margins": 4.717857360839844, "rewards/rejected": -4.208584308624268, "step": 215 }, { "epoch": 3.6610169491525424, "grad_norm": 10.722521061649259, "learning_rate": 3.3006746346570935e-07, "logits/chosen": -1.5194891691207886, "logits/rejected": -1.5225661993026733, "logps/chosen": -21.88874626159668, "logps/rejected": -31.313539505004883, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 0.42684128880500793, "rewards/margins": 3.6699740886688232, "rewards/rejected": -3.2431328296661377, "step": 216 }, { "epoch": 3.6779661016949152, "grad_norm": 11.239607248752012, "learning_rate": 3.2831049779033395e-07, "logits/chosen": -1.585048794746399, "logits/rejected": -1.447473406791687, "logps/chosen": -44.55853271484375, "logps/rejected": -62.56214141845703, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": -0.5582241415977478, "rewards/margins": 5.07534122467041, "rewards/rejected": -5.633565425872803, "step": 217 }, { "epoch": 3.694915254237288, "grad_norm": 11.121200361780803, "learning_rate": 3.2654923293098666e-07, "logits/chosen": -1.4354138374328613, "logits/rejected": -1.431335210800171, "logps/chosen": -29.88983917236328, "logps/rejected": -41.21036911010742, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": -0.12309768795967102, "rewards/margins": 4.238857746124268, "rewards/rejected": -4.361955642700195, "step": 218 }, { "epoch": 3.711864406779661, "grad_norm": 8.46695075496944, "learning_rate": 3.247837655797061e-07, "logits/chosen": -1.4146300554275513, "logits/rejected": -1.3639909029006958, "logps/chosen": -23.79798698425293, "logps/rejected": -42.03700256347656, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 0.14847984910011292, "rewards/margins": 4.2562971115112305, "rewards/rejected": -4.107817649841309, "step": 219 }, { "epoch": 3.7288135593220337, "grad_norm": 10.875773708222669, "learning_rate": 3.2301419265924393e-07, "logits/chosen": -1.3614307641983032, "logits/rejected": -1.263061285018921, "logps/chosen": -26.913053512573242, "logps/rejected": -38.41120910644531, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 0.12246422469615936, "rewards/margins": 3.324322462081909, "rewards/rejected": -3.2018585205078125, "step": 220 }, { "epoch": 3.7457627118644066, "grad_norm": 10.149586335689621, "learning_rate": 3.2124061131774443e-07, "logits/chosen": -1.3240911960601807, "logits/rejected": -1.3163329362869263, "logps/chosen": -25.73955535888672, "logps/rejected": -49.833221435546875, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 0.2507280111312866, "rewards/margins": 4.252786636352539, "rewards/rejected": -4.002058982849121, "step": 221 }, { "epoch": 3.7627118644067794, "grad_norm": 9.140932864263574, "learning_rate": 3.194631189234109e-07, "logits/chosen": -1.6033962965011597, "logits/rejected": -1.4346346855163574, "logps/chosen": -35.82508087158203, "logps/rejected": -44.355918884277344, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.09858936071395874, "rewards/margins": 4.44521951675415, "rewards/rejected": -4.543808937072754, "step": 222 }, { "epoch": 3.7796610169491527, "grad_norm": 9.775231597153738, "learning_rate": 3.1768181305916063e-07, "logits/chosen": -1.4034669399261475, "logits/rejected": -1.2983992099761963, "logps/chosen": -36.48939514160156, "logps/rejected": -50.544036865234375, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 0.08162027597427368, "rewards/margins": 4.144961357116699, "rewards/rejected": -4.06334114074707, "step": 223 }, { "epoch": 3.7966101694915255, "grad_norm": 11.927845816995063, "learning_rate": 3.158967915172669e-07, "logits/chosen": -1.4804027080535889, "logits/rejected": -1.349886178970337, "logps/chosen": -27.929018020629883, "logps/rejected": -37.72224426269531, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": -0.21573437750339508, "rewards/margins": 3.54447078704834, "rewards/rejected": -3.760205030441284, "step": 224 }, { "epoch": 3.8135593220338984, "grad_norm": 11.052999347550179, "learning_rate": 3.141081522939911e-07, "logits/chosen": -1.4779460430145264, "logits/rejected": -1.349549412727356, "logps/chosen": -37.28151321411133, "logps/rejected": -47.440765380859375, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": -0.3217180073261261, "rewards/margins": 4.7478485107421875, "rewards/rejected": -5.06956672668457, "step": 225 }, { "epoch": 3.830508474576271, "grad_norm": 10.970513938481727, "learning_rate": 3.1231599358420233e-07, "logits/chosen": -1.3360522985458374, "logits/rejected": -1.2304054498672485, "logps/chosen": -25.628524780273438, "logps/rejected": -38.20055389404297, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 0.2233581840991974, "rewards/margins": 4.349393844604492, "rewards/rejected": -4.126035690307617, "step": 226 }, { "epoch": 3.847457627118644, "grad_norm": 9.433851363193908, "learning_rate": 3.105204137759867e-07, "logits/chosen": -1.2719545364379883, "logits/rejected": -1.330519676208496, "logps/chosen": -34.64011001586914, "logps/rejected": -52.32704162597656, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": -0.2245815396308899, "rewards/margins": 4.7216315269470215, "rewards/rejected": -4.9462127685546875, "step": 227 }, { "epoch": 3.864406779661017, "grad_norm": 10.439526673557062, "learning_rate": 3.0872151144524594e-07, "logits/chosen": -1.6759734153747559, "logits/rejected": -1.579871654510498, "logps/chosen": -27.196908950805664, "logps/rejected": -50.7559700012207, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": -0.22928810119628906, "rewards/margins": 4.84774923324585, "rewards/rejected": -5.077037811279297, "step": 228 }, { "epoch": 3.8813559322033897, "grad_norm": 11.392292474810198, "learning_rate": 3.069193853502855e-07, "logits/chosen": -1.5869011878967285, "logits/rejected": -1.5820071697235107, "logps/chosen": -25.63404655456543, "logps/rejected": -38.69296646118164, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": -0.3039775788784027, "rewards/margins": 3.5710806846618652, "rewards/rejected": -3.875058174133301, "step": 229 }, { "epoch": 3.898305084745763, "grad_norm": 10.225877059440036, "learning_rate": 3.0511413442639297e-07, "logits/chosen": -1.434234619140625, "logits/rejected": -1.3351249694824219, "logps/chosen": -24.643152236938477, "logps/rejected": -60.13444519042969, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": -0.10867035388946533, "rewards/margins": 6.075805187225342, "rewards/rejected": -6.184475898742676, "step": 230 }, { "epoch": 3.915254237288136, "grad_norm": 9.295415437658166, "learning_rate": 3.0330585778040675e-07, "logits/chosen": -1.3108859062194824, "logits/rejected": -1.2643885612487793, "logps/chosen": -24.4619140625, "logps/rejected": -34.88125228881836, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": -0.13806024193763733, "rewards/margins": 3.5883846282958984, "rewards/rejected": -3.726444721221924, "step": 231 }, { "epoch": 3.9322033898305087, "grad_norm": 9.365373350685292, "learning_rate": 3.0149465468527457e-07, "logits/chosen": -1.4852244853973389, "logits/rejected": -1.5285433530807495, "logps/chosen": -25.74740982055664, "logps/rejected": -39.37394332885742, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.005698531866073608, "rewards/margins": 4.059448719024658, "rewards/rejected": -4.0651469230651855, "step": 232 }, { "epoch": 3.9491525423728815, "grad_norm": 8.974250718981809, "learning_rate": 2.9968062457460437e-07, "logits/chosen": -1.5756818056106567, "logits/rejected": -1.4879854917526245, "logps/chosen": -24.878393173217773, "logps/rejected": -43.14397048950195, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": -0.1284530758857727, "rewards/margins": 4.223859786987305, "rewards/rejected": -4.3523125648498535, "step": 233 }, { "epoch": 3.9661016949152543, "grad_norm": 11.054495120717768, "learning_rate": 2.978638670372047e-07, "logits/chosen": -1.4321879148483276, "logits/rejected": -1.3549392223358154, "logps/chosen": -35.11801528930664, "logps/rejected": -50.857975006103516, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": -0.5975885391235352, "rewards/margins": 4.9589338302612305, "rewards/rejected": -5.556522846221924, "step": 234 }, { "epoch": 3.983050847457627, "grad_norm": 10.55567875619197, "learning_rate": 2.9604448181161755e-07, "logits/chosen": -1.4181180000305176, "logits/rejected": -1.247128963470459, "logps/chosen": -24.004146575927734, "logps/rejected": -40.258331298828125, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": -0.09078972041606903, "rewards/margins": 3.5850982666015625, "rewards/rejected": -3.6758880615234375, "step": 235 }, { "epoch": 4.0, "grad_norm": 8.789282244689215, "learning_rate": 2.9422256878064324e-07, "logits/chosen": -1.3495515584945679, "logits/rejected": -1.3198853731155396, "logps/chosen": -35.33921432495117, "logps/rejected": -48.3183708190918, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.6972587704658508, "rewards/margins": 4.445403575897217, "rewards/rejected": -5.142662525177002, "step": 236 }, { "epoch": 4.016949152542373, "grad_norm": 6.75184146677303, "learning_rate": 2.923982279658564e-07, "logits/chosen": -1.633570909500122, "logits/rejected": -1.5905429124832153, "logps/chosen": -38.822261810302734, "logps/rejected": -48.57697296142578, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": -0.40955615043640137, "rewards/margins": 4.950255393981934, "rewards/rejected": -5.359812259674072, "step": 237 }, { "epoch": 4.033898305084746, "grad_norm": 9.349730549756508, "learning_rate": 2.90571559522115e-07, "logits/chosen": -1.0709235668182373, "logits/rejected": -1.1218361854553223, "logps/chosen": -29.26075553894043, "logps/rejected": -34.48569869995117, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": 0.08125007152557373, "rewards/margins": 3.5675737857818604, "rewards/rejected": -3.486323833465576, "step": 238 }, { "epoch": 4.0508474576271185, "grad_norm": 8.738621249147451, "learning_rate": 2.8874266373206215e-07, "logits/chosen": -1.5969672203063965, "logits/rejected": -1.5486068725585938, "logps/chosen": -32.66074752807617, "logps/rejected": -43.64617919921875, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 0.13383600115776062, "rewards/margins": 4.7373738288879395, "rewards/rejected": -4.603538513183594, "step": 239 }, { "epoch": 4.067796610169491, "grad_norm": 8.414244576812601, "learning_rate": 2.8691164100062034e-07, "logits/chosen": -1.4231804609298706, "logits/rejected": -1.2792776823043823, "logps/chosen": -31.69137954711914, "logps/rejected": -54.63775634765625, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 0.08693331480026245, "rewards/margins": 5.980134963989258, "rewards/rejected": -5.89320182800293, "step": 240 }, { "epoch": 4.084745762711864, "grad_norm": 8.9348895906443, "learning_rate": 2.8507859184947953e-07, "logits/chosen": -1.4366453886032104, "logits/rejected": -1.3272255659103394, "logps/chosen": -26.352733612060547, "logps/rejected": -46.89934158325195, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -0.2997795641422272, "rewards/margins": 3.753899574279785, "rewards/rejected": -4.053679466247559, "step": 241 }, { "epoch": 4.101694915254237, "grad_norm": 8.250352811668128, "learning_rate": 2.8324361691157853e-07, "logits/chosen": -1.374919056892395, "logits/rejected": -1.421012043952942, "logps/chosen": -31.536663055419922, "logps/rejected": -55.928749084472656, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.2735545039176941, "rewards/margins": 4.6181135177612305, "rewards/rejected": -4.891667366027832, "step": 242 }, { "epoch": 4.11864406779661, "grad_norm": 9.40465609028747, "learning_rate": 2.8140681692558034e-07, "logits/chosen": -1.7210100889205933, "logits/rejected": -1.6137436628341675, "logps/chosen": -29.081104278564453, "logps/rejected": -42.040245056152344, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 0.13167661428451538, "rewards/margins": 4.6320929527282715, "rewards/rejected": -4.500416278839111, "step": 243 }, { "epoch": 4.135593220338983, "grad_norm": 7.575529614099465, "learning_rate": 2.7956829273034146e-07, "logits/chosen": -1.3278542757034302, "logits/rejected": -1.2801333665847778, "logps/chosen": -28.36415672302246, "logps/rejected": -45.0773811340332, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": 0.0763159990310669, "rewards/margins": 4.670356750488281, "rewards/rejected": -4.594040393829346, "step": 244 }, { "epoch": 4.1525423728813555, "grad_norm": 7.841549437235225, "learning_rate": 2.7772814525937634e-07, "logits/chosen": -1.521530032157898, "logits/rejected": -1.3261444568634033, "logps/chosen": -25.693857192993164, "logps/rejected": -44.95489501953125, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 0.31940776109695435, "rewards/margins": 4.865126132965088, "rewards/rejected": -4.545718193054199, "step": 245 }, { "epoch": 4.169491525423728, "grad_norm": 7.437246530691066, "learning_rate": 2.7588647553531576e-07, "logits/chosen": -1.3834595680236816, "logits/rejected": -1.366625428199768, "logps/chosen": -27.27410125732422, "logps/rejected": -47.83835220336914, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 0.1564972996711731, "rewards/margins": 4.690642833709717, "rewards/rejected": -4.534145355224609, "step": 246 }, { "epoch": 4.186440677966102, "grad_norm": 8.374701794746429, "learning_rate": 2.7404338466436116e-07, "logits/chosen": -1.4731414318084717, "logits/rejected": -1.4213758707046509, "logps/chosen": -32.91023635864258, "logps/rejected": -48.37405014038086, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 0.12676534056663513, "rewards/margins": 5.037107944488525, "rewards/rejected": -4.910342216491699, "step": 247 }, { "epoch": 4.203389830508475, "grad_norm": 8.684915932739882, "learning_rate": 2.721989738307337e-07, "logits/chosen": -1.5974880456924438, "logits/rejected": -1.5572988986968994, "logps/chosen": -29.11380386352539, "logps/rejected": -43.61833953857422, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 0.09620954096317291, "rewards/margins": 3.6240577697753906, "rewards/rejected": -3.527848482131958, "step": 248 }, { "epoch": 4.220338983050848, "grad_norm": 8.422615969153167, "learning_rate": 2.7035334429111955e-07, "logits/chosen": -1.4860804080963135, "logits/rejected": -1.4129899740219116, "logps/chosen": -38.593204498291016, "logps/rejected": -55.535247802734375, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": -0.007733196020126343, "rewards/margins": 4.608980178833008, "rewards/rejected": -4.616713047027588, "step": 249 }, { "epoch": 4.237288135593221, "grad_norm": 8.150205311828831, "learning_rate": 2.685065973691107e-07, "logits/chosen": -1.509846806526184, "logits/rejected": -1.440779447555542, "logps/chosen": -32.75022506713867, "logps/rejected": -53.621826171875, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": -0.10550498962402344, "rewards/margins": 5.389019966125488, "rewards/rejected": -5.49452543258667, "step": 250 }, { "epoch": 4.254237288135593, "grad_norm": 8.158516596968319, "learning_rate": 2.6665883444964277e-07, "logits/chosen": -1.2622435092926025, "logits/rejected": -1.2126017808914185, "logps/chosen": -22.526697158813477, "logps/rejected": -49.31510543823242, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": -0.2732163071632385, "rewards/margins": 5.386449337005615, "rewards/rejected": -5.659666061401367, "step": 251 }, { "epoch": 4.271186440677966, "grad_norm": 8.708471545136758, "learning_rate": 2.6481015697342856e-07, "logits/chosen": -1.3532803058624268, "logits/rejected": -1.263253092765808, "logps/chosen": -18.96270751953125, "logps/rejected": -40.195404052734375, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 0.013826161623001099, "rewards/margins": 4.144740581512451, "rewards/rejected": -4.130914688110352, "step": 252 }, { "epoch": 4.288135593220339, "grad_norm": 7.692204259130146, "learning_rate": 2.629606664313896e-07, "logits/chosen": -1.4633291959762573, "logits/rejected": -1.2908146381378174, "logps/chosen": -26.60018539428711, "logps/rejected": -42.72929763793945, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": -0.05975392460823059, "rewards/margins": 3.8384809494018555, "rewards/rejected": -3.8982346057891846, "step": 253 }, { "epoch": 4.305084745762712, "grad_norm": 7.840483273382048, "learning_rate": 2.611104643590838e-07, "logits/chosen": -1.3723235130310059, "logits/rejected": -1.3505971431732178, "logps/chosen": -22.22472381591797, "logps/rejected": -48.225982666015625, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 0.1287948191165924, "rewards/margins": 4.58843994140625, "rewards/rejected": -4.4596452713012695, "step": 254 }, { "epoch": 4.322033898305085, "grad_norm": 7.007309716178494, "learning_rate": 2.592596523311317e-07, "logits/chosen": -1.6482080221176147, "logits/rejected": -1.5536173582077026, "logps/chosen": -34.07274627685547, "logps/rejected": -39.57206344604492, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 0.029447555541992188, "rewards/margins": 4.474746227264404, "rewards/rejected": -4.445298194885254, "step": 255 }, { "epoch": 4.338983050847458, "grad_norm": 8.655754602681508, "learning_rate": 2.5740833195563994e-07, "logits/chosen": -1.4583147764205933, "logits/rejected": -1.4426932334899902, "logps/chosen": -28.750776290893555, "logps/rejected": -42.55610656738281, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": -0.13989472389221191, "rewards/margins": 4.060596942901611, "rewards/rejected": -4.200491905212402, "step": 256 }, { "epoch": 4.3559322033898304, "grad_norm": 7.666532517465646, "learning_rate": 2.5555660486862293e-07, "logits/chosen": -1.3961790800094604, "logits/rejected": -1.3545511960983276, "logps/chosen": -30.55471420288086, "logps/rejected": -46.441307067871094, "loss": 0.0527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01744025945663452, "rewards/margins": 4.828250885009766, "rewards/rejected": -4.810810565948486, "step": 257 }, { "epoch": 4.372881355932203, "grad_norm": 7.38904178241979, "learning_rate": 2.5370457272842315e-07, "logits/chosen": -1.2144039869308472, "logits/rejected": -1.1987119913101196, "logps/chosen": -31.387115478515625, "logps/rejected": -44.41786193847656, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 0.37475213408470154, "rewards/margins": 4.554241180419922, "rewards/rejected": -4.179489612579346, "step": 258 }, { "epoch": 4.389830508474576, "grad_norm": 7.357227804203978, "learning_rate": 2.5185233721013053e-07, "logits/chosen": -1.564170479774475, "logits/rejected": -1.5079408884048462, "logps/chosen": -24.936302185058594, "logps/rejected": -44.029632568359375, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": -0.1059635728597641, "rewards/margins": 4.297895908355713, "rewards/rejected": -4.403859615325928, "step": 259 }, { "epoch": 4.406779661016949, "grad_norm": 9.22648891874881, "learning_rate": 2.5e-07, "logits/chosen": -1.3310877084732056, "logits/rejected": -1.3538789749145508, "logps/chosen": -24.936809539794922, "logps/rejected": -49.063575744628906, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -0.07093064486980438, "rewards/margins": 4.918918132781982, "rewards/rejected": -4.989849090576172, "step": 260 }, { "epoch": 4.423728813559322, "grad_norm": 6.836226567179652, "learning_rate": 2.4814766278986944e-07, "logits/chosen": -1.6707507371902466, "logits/rejected": -1.5290323495864868, "logps/chosen": -29.512426376342773, "logps/rejected": -54.653114318847656, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 0.0224665105342865, "rewards/margins": 5.460002899169922, "rewards/rejected": -5.437536239624023, "step": 261 }, { "epoch": 4.440677966101695, "grad_norm": 8.63432296696573, "learning_rate": 2.462954272715768e-07, "logits/chosen": -1.5188168287277222, "logits/rejected": -1.4816163778305054, "logps/chosen": -31.95643424987793, "logps/rejected": -42.647403717041016, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.3785313367843628, "rewards/margins": 4.157118320465088, "rewards/rejected": -4.535649299621582, "step": 262 }, { "epoch": 4.4576271186440675, "grad_norm": 8.141729762323788, "learning_rate": 2.4444339513137716e-07, "logits/chosen": -1.590928316116333, "logits/rejected": -1.5425117015838623, "logps/chosen": -30.981969833374023, "logps/rejected": -55.22663116455078, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.1410723626613617, "rewards/margins": 5.534295082092285, "rewards/rejected": -5.393222808837891, "step": 263 }, { "epoch": 4.47457627118644, "grad_norm": 6.553586136504455, "learning_rate": 2.4259166804436003e-07, "logits/chosen": -1.6111406087875366, "logits/rejected": -1.5660130977630615, "logps/chosen": -30.944381713867188, "logps/rejected": -48.914039611816406, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -0.1980370730161667, "rewards/margins": 5.023958683013916, "rewards/rejected": -5.2219953536987305, "step": 264 }, { "epoch": 4.491525423728813, "grad_norm": 7.7695916088798045, "learning_rate": 2.4074034766886826e-07, "logits/chosen": -1.4081194400787354, "logits/rejected": -1.4115426540374756, "logps/chosen": -26.77755355834961, "logps/rejected": -44.787628173828125, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -0.2828730642795563, "rewards/margins": 5.210730075836182, "rewards/rejected": -5.493603229522705, "step": 265 }, { "epoch": 4.508474576271187, "grad_norm": 9.138228801008614, "learning_rate": 2.3888953564091616e-07, "logits/chosen": -1.5158981084823608, "logits/rejected": -1.5783250331878662, "logps/chosen": -34.667503356933594, "logps/rejected": -53.5840950012207, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": -0.20706957578659058, "rewards/margins": 5.503992080688477, "rewards/rejected": -5.711061477661133, "step": 266 }, { "epoch": 4.52542372881356, "grad_norm": 6.507174183795552, "learning_rate": 2.3703933356861044e-07, "logits/chosen": -1.399531602859497, "logits/rejected": -1.419585108757019, "logps/chosen": -33.15267562866211, "logps/rejected": -45.78288269042969, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -0.5228755474090576, "rewards/margins": 4.130328178405762, "rewards/rejected": -4.653203964233398, "step": 267 }, { "epoch": 4.5423728813559325, "grad_norm": 7.507653099421495, "learning_rate": 2.3518984302657144e-07, "logits/chosen": -1.5033714771270752, "logits/rejected": -1.427080750465393, "logps/chosen": -24.752490997314453, "logps/rejected": -57.89167022705078, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -0.4027895927429199, "rewards/margins": 5.957125186920166, "rewards/rejected": -6.359914779663086, "step": 268 }, { "epoch": 4.559322033898305, "grad_norm": 7.12313349874887, "learning_rate": 2.333411655503572e-07, "logits/chosen": -1.4686946868896484, "logits/rejected": -1.3002254962921143, "logps/chosen": -29.05103302001953, "logps/rejected": -53.72389221191406, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036399289965629578, "rewards/margins": 5.409298419952393, "rewards/rejected": -5.412938117980957, "step": 269 }, { "epoch": 4.576271186440678, "grad_norm": 7.93618380947219, "learning_rate": 2.3149340263088927e-07, "logits/chosen": -1.7106562852859497, "logits/rejected": -1.5941462516784668, "logps/chosen": -23.763492584228516, "logps/rejected": -47.54367446899414, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.3024110496044159, "rewards/margins": 5.066253185272217, "rewards/rejected": -4.7638421058654785, "step": 270 }, { "epoch": 4.593220338983051, "grad_norm": 7.441146999396367, "learning_rate": 2.296466557088805e-07, "logits/chosen": -1.432921051979065, "logits/rejected": -1.415549635887146, "logps/chosen": -27.215051651000977, "logps/rejected": -50.206668853759766, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": -0.3797184228897095, "rewards/margins": 5.635693550109863, "rewards/rejected": -6.015412330627441, "step": 271 }, { "epoch": 4.610169491525424, "grad_norm": 7.203519404793784, "learning_rate": 2.278010261692663e-07, "logits/chosen": -1.6040518283843994, "logits/rejected": -1.5052812099456787, "logps/chosen": -27.241743087768555, "logps/rejected": -47.83567810058594, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": -0.10971636325120926, "rewards/margins": 5.601438999176025, "rewards/rejected": -5.711155891418457, "step": 272 }, { "epoch": 4.627118644067797, "grad_norm": 7.1515886159837505, "learning_rate": 2.2595661533563887e-07, "logits/chosen": -1.5058000087738037, "logits/rejected": -1.4968637228012085, "logps/chosen": -31.407421112060547, "logps/rejected": -50.27019500732422, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -0.42635661363601685, "rewards/margins": 4.159647464752197, "rewards/rejected": -4.58600378036499, "step": 273 }, { "epoch": 4.6440677966101696, "grad_norm": 7.65650118033261, "learning_rate": 2.2411352446468424e-07, "logits/chosen": -1.3374431133270264, "logits/rejected": -1.3260188102722168, "logps/chosen": -21.7060489654541, "logps/rejected": -47.063289642333984, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 0.14809060096740723, "rewards/margins": 4.805155277252197, "rewards/rejected": -4.657064437866211, "step": 274 }, { "epoch": 4.661016949152542, "grad_norm": 8.277053566352693, "learning_rate": 2.2227185474062374e-07, "logits/chosen": -1.49006986618042, "logits/rejected": -1.3998165130615234, "logps/chosen": -24.71628761291504, "logps/rejected": -47.67393493652344, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.02221532166004181, "rewards/margins": 4.3083624839782715, "rewards/rejected": -4.286147117614746, "step": 275 }, { "epoch": 4.677966101694915, "grad_norm": 7.8454820880953715, "learning_rate": 2.2043170726965857e-07, "logits/chosen": -1.4486889839172363, "logits/rejected": -1.4862079620361328, "logps/chosen": -27.138608932495117, "logps/rejected": -44.235687255859375, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 0.02510838210582733, "rewards/margins": 4.74564266204834, "rewards/rejected": -4.720534324645996, "step": 276 }, { "epoch": 4.694915254237288, "grad_norm": 7.405600784416248, "learning_rate": 2.1859318307441966e-07, "logits/chosen": -1.444726586341858, "logits/rejected": -1.4401050806045532, "logps/chosen": -32.68350601196289, "logps/rejected": -45.99779510498047, "loss": 0.0469, "rewards/accuracies": 0.9375, "rewards/chosen": 0.024221569299697876, "rewards/margins": 5.507167816162109, "rewards/rejected": -5.482946872711182, "step": 277 }, { "epoch": 4.711864406779661, "grad_norm": 8.104155066599384, "learning_rate": 2.1675638308842142e-07, "logits/chosen": -1.2842341661453247, "logits/rejected": -1.330397605895996, "logps/chosen": -27.432506561279297, "logps/rejected": -40.2528076171875, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 0.19671624898910522, "rewards/margins": 4.634299278259277, "rewards/rejected": -4.4375834465026855, "step": 278 }, { "epoch": 4.728813559322034, "grad_norm": 5.864508514550939, "learning_rate": 2.149214081505205e-07, "logits/chosen": -1.5249521732330322, "logits/rejected": -1.404898762702942, "logps/chosen": -32.8936653137207, "logps/rejected": -37.96503829956055, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 0.00440371036529541, "rewards/margins": 4.577935218811035, "rewards/rejected": -4.573531627655029, "step": 279 }, { "epoch": 4.745762711864407, "grad_norm": 7.425009165674246, "learning_rate": 2.1308835899937972e-07, "logits/chosen": -1.463742733001709, "logits/rejected": -1.4091339111328125, "logps/chosen": -28.56261444091797, "logps/rejected": -44.104896545410156, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -0.04231783747673035, "rewards/margins": 4.795005798339844, "rewards/rejected": -4.837324142456055, "step": 280 }, { "epoch": 4.762711864406779, "grad_norm": 8.25860440969276, "learning_rate": 2.112573362679379e-07, "logits/chosen": -1.4964463710784912, "logits/rejected": -1.355745553970337, "logps/chosen": -37.27642059326172, "logps/rejected": -61.14379119873047, "loss": 0.0524, "rewards/accuracies": 0.9375, "rewards/chosen": 0.36257362365722656, "rewards/margins": 5.591864109039307, "rewards/rejected": -5.22929048538208, "step": 281 }, { "epoch": 4.779661016949152, "grad_norm": 8.825180205529128, "learning_rate": 2.09428440477885e-07, "logits/chosen": -1.5772864818572998, "logits/rejected": -1.3255836963653564, "logps/chosen": -27.248455047607422, "logps/rejected": -47.95212173461914, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": -0.1756860911846161, "rewards/margins": 6.540701866149902, "rewards/rejected": -6.71638822555542, "step": 282 }, { "epoch": 4.796610169491525, "grad_norm": 6.459544380114144, "learning_rate": 2.0760177203414366e-07, "logits/chosen": -1.4836134910583496, "logits/rejected": -1.5253633260726929, "logps/chosen": -28.740909576416016, "logps/rejected": -38.53902816772461, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -0.034703388810157776, "rewards/margins": 4.7387285232543945, "rewards/rejected": -4.773431777954102, "step": 283 }, { "epoch": 4.813559322033898, "grad_norm": 8.201593648267453, "learning_rate": 2.0577743121935682e-07, "logits/chosen": -1.5238087177276611, "logits/rejected": -1.479065179824829, "logps/chosen": -23.523155212402344, "logps/rejected": -49.87913131713867, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": -0.09851184487342834, "rewards/margins": 4.452397346496582, "rewards/rejected": -4.550909519195557, "step": 284 }, { "epoch": 4.830508474576272, "grad_norm": 8.037104047323464, "learning_rate": 2.0395551818838243e-07, "logits/chosen": -1.5587154626846313, "logits/rejected": -1.473785638809204, "logps/chosen": -38.94770812988281, "logps/rejected": -56.7118034362793, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -0.5131532549858093, "rewards/margins": 5.551197052001953, "rewards/rejected": -6.06434965133667, "step": 285 }, { "epoch": 4.847457627118644, "grad_norm": 6.619262748940799, "learning_rate": 2.021361329627953e-07, "logits/chosen": -1.5830734968185425, "logits/rejected": -1.4701610803604126, "logps/chosen": -25.216768264770508, "logps/rejected": -52.16200637817383, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -0.12712815403938293, "rewards/margins": 5.188778877258301, "rewards/rejected": -5.3159074783325195, "step": 286 }, { "epoch": 4.864406779661017, "grad_norm": 7.108190229695481, "learning_rate": 2.003193754253957e-07, "logits/chosen": -1.499477744102478, "logits/rejected": -1.442081093788147, "logps/chosen": -29.464977264404297, "logps/rejected": -44.91366958618164, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": -0.14971515536308289, "rewards/margins": 4.699173927307129, "rewards/rejected": -4.848889350891113, "step": 287 }, { "epoch": 4.88135593220339, "grad_norm": 6.501653298714645, "learning_rate": 1.9850534531472544e-07, "logits/chosen": -1.384242057800293, "logits/rejected": -1.2570266723632812, "logps/chosen": -27.370962142944336, "logps/rejected": -41.63351058959961, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 0.09189403057098389, "rewards/margins": 5.2285356521606445, "rewards/rejected": -5.136641502380371, "step": 288 }, { "epoch": 4.898305084745763, "grad_norm": 6.999053945822559, "learning_rate": 1.966941422195933e-07, "logits/chosen": -1.4301297664642334, "logits/rejected": -1.3571842908859253, "logps/chosen": -30.701709747314453, "logps/rejected": -47.53770065307617, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -0.2589109241962433, "rewards/margins": 4.9735002517700195, "rewards/rejected": -5.232410907745361, "step": 289 }, { "epoch": 4.915254237288136, "grad_norm": 7.2234737195409275, "learning_rate": 1.94885865573607e-07, "logits/chosen": -1.5849264860153198, "logits/rejected": -1.5440596342086792, "logps/chosen": -22.58307647705078, "logps/rejected": -42.317161560058594, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": -0.05918128788471222, "rewards/margins": 4.245813846588135, "rewards/rejected": -4.304995536804199, "step": 290 }, { "epoch": 4.932203389830509, "grad_norm": 7.881613271557854, "learning_rate": 1.930806146497146e-07, "logits/chosen": -1.5185688734054565, "logits/rejected": -1.498981237411499, "logps/chosen": -27.832983016967773, "logps/rejected": -44.11700439453125, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 0.012274429202079773, "rewards/margins": 4.80919885635376, "rewards/rejected": -4.796924591064453, "step": 291 }, { "epoch": 4.9491525423728815, "grad_norm": 7.929413571170103, "learning_rate": 1.912784885547541e-07, "logits/chosen": -1.4354244470596313, "logits/rejected": -1.2839109897613525, "logps/chosen": -28.58751678466797, "logps/rejected": -52.627559661865234, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -0.09678968787193298, "rewards/margins": 3.94634747505188, "rewards/rejected": -4.043137550354004, "step": 292 }, { "epoch": 4.966101694915254, "grad_norm": 8.10839834515343, "learning_rate": 1.8947958622401328e-07, "logits/chosen": -1.2912473678588867, "logits/rejected": -1.3136945962905884, "logps/chosen": -28.488567352294922, "logps/rejected": -44.88406753540039, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -0.5085054636001587, "rewards/margins": 4.020839691162109, "rewards/rejected": -4.529345512390137, "step": 293 }, { "epoch": 4.983050847457627, "grad_norm": 6.68942425307024, "learning_rate": 1.876840064157976e-07, "logits/chosen": -1.4644904136657715, "logits/rejected": -1.4102816581726074, "logps/chosen": -26.936355590820312, "logps/rejected": -47.06075668334961, "loss": 0.0416, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07672211527824402, "rewards/margins": 4.622737884521484, "rewards/rejected": -4.699460029602051, "step": 294 }, { "epoch": 5.0, "grad_norm": 8.672333086068912, "learning_rate": 1.858918477060089e-07, "logits/chosen": -1.4006508588790894, "logits/rejected": -1.4321238994598389, "logps/chosen": -25.137971878051758, "logps/rejected": -42.432716369628906, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": -0.02563352882862091, "rewards/margins": 4.927007675170898, "rewards/rejected": -4.952641010284424, "step": 295 }, { "epoch": 5.016949152542373, "grad_norm": 6.442205099947888, "learning_rate": 1.8410320848273313e-07, "logits/chosen": -1.4923574924468994, "logits/rejected": -1.4842016696929932, "logps/chosen": -26.232227325439453, "logps/rejected": -48.02200698852539, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -1.0712697505950928, "rewards/margins": 5.136242866516113, "rewards/rejected": -6.207512855529785, "step": 296 }, { "epoch": 5.033898305084746, "grad_norm": 6.132421236608343, "learning_rate": 1.8231818694083938e-07, "logits/chosen": -1.4439387321472168, "logits/rejected": -1.4443602561950684, "logps/chosen": -39.26667404174805, "logps/rejected": -56.92470169067383, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -0.1077936589717865, "rewards/margins": 6.438662528991699, "rewards/rejected": -6.546456336975098, "step": 297 }, { "epoch": 5.0508474576271185, "grad_norm": 5.496092091450637, "learning_rate": 1.8053688107658905e-07, "logits/chosen": -1.2399489879608154, "logits/rejected": -1.1487674713134766, "logps/chosen": -24.482892990112305, "logps/rejected": -38.75669860839844, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 0.1850048452615738, "rewards/margins": 4.235044479370117, "rewards/rejected": -4.050039291381836, "step": 298 }, { "epoch": 5.067796610169491, "grad_norm": 6.371224857913382, "learning_rate": 1.787593886822556e-07, "logits/chosen": -1.5012649297714233, "logits/rejected": -1.5702931880950928, "logps/chosen": -26.034366607666016, "logps/rejected": -54.3376579284668, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -0.206559419631958, "rewards/margins": 5.6937360763549805, "rewards/rejected": -5.900295257568359, "step": 299 }, { "epoch": 5.084745762711864, "grad_norm": 6.572183005650043, "learning_rate": 1.7698580734075607e-07, "logits/chosen": -1.48176908493042, "logits/rejected": -1.4655022621154785, "logps/chosen": -27.89720344543457, "logps/rejected": -45.596282958984375, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.14924107491970062, "rewards/margins": 4.559385299682617, "rewards/rejected": -4.7086262702941895, "step": 300 }, { "epoch": 5.101694915254237, "grad_norm": 6.034869634249788, "learning_rate": 1.7521623442029388e-07, "logits/chosen": -1.5313202142715454, "logits/rejected": -1.564162015914917, "logps/chosen": -22.828670501708984, "logps/rejected": -48.36279296875, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 0.2727990448474884, "rewards/margins": 4.790719985961914, "rewards/rejected": -4.51792049407959, "step": 301 }, { "epoch": 5.11864406779661, "grad_norm": 6.764929177752538, "learning_rate": 1.7345076706901326e-07, "logits/chosen": -1.4571880102157593, "logits/rejected": -1.4540932178497314, "logps/chosen": -30.096073150634766, "logps/rejected": -55.99176788330078, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -0.1132214367389679, "rewards/margins": 5.543183326721191, "rewards/rejected": -5.656404972076416, "step": 302 }, { "epoch": 5.135593220338983, "grad_norm": 7.398683409083888, "learning_rate": 1.7168950220966614e-07, "logits/chosen": -1.430177927017212, "logits/rejected": -1.482927918434143, "logps/chosen": -30.467327117919922, "logps/rejected": -45.27473831176758, "loss": 0.0516, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06274604797363281, "rewards/margins": 4.2044267654418945, "rewards/rejected": -4.267173767089844, "step": 303 }, { "epoch": 5.1525423728813555, "grad_norm": 5.9915990011633555, "learning_rate": 1.6993253653429062e-07, "logits/chosen": -1.4992166757583618, "logits/rejected": -1.4459744691848755, "logps/chosen": -33.046180725097656, "logps/rejected": -52.499366760253906, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -0.5871363878250122, "rewards/margins": 5.686320781707764, "rewards/rejected": -6.2734575271606445, "step": 304 }, { "epoch": 5.169491525423728, "grad_norm": 6.387235286093505, "learning_rate": 1.681799664989033e-07, "logits/chosen": -1.4004420042037964, "logits/rejected": -1.3709527254104614, "logps/chosen": -26.409391403198242, "logps/rejected": -37.188838958740234, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 0.3655552864074707, "rewards/margins": 4.699047565460205, "rewards/rejected": -4.333492279052734, "step": 305 }, { "epoch": 5.186440677966102, "grad_norm": 5.71477855668128, "learning_rate": 1.6643188831820374e-07, "logits/chosen": -1.4717719554901123, "logits/rejected": -1.5952577590942383, "logps/chosen": -27.931270599365234, "logps/rejected": -50.50065231323242, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.6083439588546753, "rewards/margins": 5.1790900230407715, "rewards/rejected": -5.787433624267578, "step": 306 }, { "epoch": 5.203389830508475, "grad_norm": 5.369095807459635, "learning_rate": 1.6468839796029198e-07, "logits/chosen": -1.5481698513031006, "logits/rejected": -1.4016451835632324, "logps/chosen": -33.04383850097656, "logps/rejected": -58.85445022583008, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -0.07381519675254822, "rewards/margins": 5.094147205352783, "rewards/rejected": -5.167962074279785, "step": 307 }, { "epoch": 5.220338983050848, "grad_norm": 6.478677176582888, "learning_rate": 1.6294959114140033e-07, "logits/chosen": -1.6205213069915771, "logits/rejected": -1.5254000425338745, "logps/chosen": -29.354225158691406, "logps/rejected": -43.66874694824219, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -0.08936242759227753, "rewards/margins": 4.020651817321777, "rewards/rejected": -4.110013961791992, "step": 308 }, { "epoch": 5.237288135593221, "grad_norm": 6.290350105724975, "learning_rate": 1.6121556332063861e-07, "logits/chosen": -1.3170948028564453, "logits/rejected": -1.281385898590088, "logps/chosen": -37.28167724609375, "logps/rejected": -45.006961822509766, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.06686612963676453, "rewards/margins": 4.521815776824951, "rewards/rejected": -4.588682174682617, "step": 309 }, { "epoch": 5.254237288135593, "grad_norm": 7.295722008430091, "learning_rate": 1.5948640969475345e-07, "logits/chosen": -1.5421946048736572, "logits/rejected": -1.406693935394287, "logps/chosen": -26.546001434326172, "logps/rejected": -40.070228576660156, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": -0.11074566841125488, "rewards/margins": 4.666855812072754, "rewards/rejected": -4.77760124206543, "step": 310 }, { "epoch": 5.271186440677966, "grad_norm": 6.299381642433967, "learning_rate": 1.5776222519290204e-07, "logits/chosen": -1.5537474155426025, "logits/rejected": -1.501166820526123, "logps/chosen": -26.153173446655273, "logps/rejected": -50.52534103393555, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -0.541397213935852, "rewards/margins": 5.92680025100708, "rewards/rejected": -6.468197822570801, "step": 311 }, { "epoch": 5.288135593220339, "grad_norm": 6.901736790648853, "learning_rate": 1.560431044714405e-07, "logits/chosen": -1.547554850578308, "logits/rejected": -1.4092559814453125, "logps/chosen": -32.46575927734375, "logps/rejected": -52.17520523071289, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -0.18227432668209076, "rewards/margins": 5.195803642272949, "rewards/rejected": -5.378078460693359, "step": 312 }, { "epoch": 5.305084745762712, "grad_norm": 6.537170506793814, "learning_rate": 1.5432914190872756e-07, "logits/chosen": -1.394778847694397, "logits/rejected": -1.3562414646148682, "logps/chosen": -26.12644386291504, "logps/rejected": -38.51447677612305, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 0.43324902653694153, "rewards/margins": 4.649719715118408, "rewards/rejected": -4.216470718383789, "step": 313 }, { "epoch": 5.322033898305085, "grad_norm": 6.822269638404017, "learning_rate": 1.5262043159994314e-07, "logits/chosen": -1.603257179260254, "logits/rejected": -1.41001558303833, "logps/chosen": -27.069252014160156, "logps/rejected": -54.39444351196289, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 0.2375459223985672, "rewards/margins": 5.968645095825195, "rewards/rejected": -5.731099605560303, "step": 314 }, { "epoch": 5.338983050847458, "grad_norm": 6.852429555090263, "learning_rate": 1.5091706735192266e-07, "logits/chosen": -1.5043295621871948, "logits/rejected": -1.4766517877578735, "logps/chosen": -21.581401824951172, "logps/rejected": -50.377071380615234, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 0.24750420451164246, "rewards/margins": 5.493175506591797, "rewards/rejected": -5.24567174911499, "step": 315 }, { "epoch": 5.3559322033898304, "grad_norm": 6.421398270201425, "learning_rate": 1.4921914267800699e-07, "logits/chosen": -1.4200242757797241, "logits/rejected": -1.3987623453140259, "logps/chosen": -19.360551834106445, "logps/rejected": -35.37394714355469, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 0.14433270692825317, "rewards/margins": 3.7558863162994385, "rewards/rejected": -3.611553430557251, "step": 316 }, { "epoch": 5.372881355932203, "grad_norm": 6.030116263122066, "learning_rate": 1.4752675079290848e-07, "logits/chosen": -1.3269891738891602, "logits/rejected": -1.2499594688415527, "logps/chosen": -29.73949432373047, "logps/rejected": -37.97323989868164, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -0.44939878582954407, "rewards/margins": 3.9090001583099365, "rewards/rejected": -4.358399391174316, "step": 317 }, { "epoch": 5.389830508474576, "grad_norm": 6.16190997278292, "learning_rate": 1.458399846075942e-07, "logits/chosen": -1.5378894805908203, "logits/rejected": -1.5542147159576416, "logps/chosen": -33.65290832519531, "logps/rejected": -57.606727600097656, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -0.47632092237472534, "rewards/margins": 5.512063980102539, "rewards/rejected": -5.98838472366333, "step": 318 }, { "epoch": 5.406779661016949, "grad_norm": 6.084242732250159, "learning_rate": 1.441589367241846e-07, "logits/chosen": -1.3787221908569336, "logits/rejected": -1.2671631574630737, "logps/chosen": -25.762453079223633, "logps/rejected": -41.90483093261719, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 0.07579733431339264, "rewards/margins": 4.506519317626953, "rewards/rejected": -4.430721759796143, "step": 319 }, { "epoch": 5.423728813559322, "grad_norm": 6.400287756731778, "learning_rate": 1.4248369943086995e-07, "logits/chosen": -1.609413504600525, "logits/rejected": -1.440029501914978, "logps/chosen": -29.613908767700195, "logps/rejected": -45.63286590576172, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -0.1749686449766159, "rewards/margins": 4.805515766143799, "rewards/rejected": -4.980484485626221, "step": 320 }, { "epoch": 5.440677966101695, "grad_norm": 5.58673826227153, "learning_rate": 1.4081436469684337e-07, "logits/chosen": -1.4598129987716675, "logits/rejected": -1.3000261783599854, "logps/chosen": -26.955453872680664, "logps/rejected": -49.03952407836914, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -0.2353724241256714, "rewards/margins": 5.155057907104492, "rewards/rejected": -5.390429973602295, "step": 321 }, { "epoch": 5.4576271186440675, "grad_norm": 5.7806024491841885, "learning_rate": 1.3915102416725286e-07, "logits/chosen": -1.5266464948654175, "logits/rejected": -1.406123161315918, "logps/chosen": -21.365182876586914, "logps/rejected": -44.53782653808594, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 0.19776439666748047, "rewards/margins": 4.6383209228515625, "rewards/rejected": -4.440556526184082, "step": 322 }, { "epoch": 5.47457627118644, "grad_norm": 6.663910066076515, "learning_rate": 1.3749376915816885e-07, "logits/chosen": -1.5701993703842163, "logits/rejected": -1.552007794380188, "logps/chosen": -36.67780685424805, "logps/rejected": -47.98915481567383, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -0.8618345856666565, "rewards/margins": 4.562122821807861, "rewards/rejected": -5.423957347869873, "step": 323 }, { "epoch": 5.491525423728813, "grad_norm": 6.220022143578053, "learning_rate": 1.3584269065157172e-07, "logits/chosen": -1.3734331130981445, "logits/rejected": -1.3631477355957031, "logps/chosen": -36.97685623168945, "logps/rejected": -53.4107780456543, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.10500967502593994, "rewards/margins": 4.521078109741211, "rewards/rejected": -4.6260881423950195, "step": 324 }, { "epoch": 5.508474576271187, "grad_norm": 6.094670518205716, "learning_rate": 1.341978792903568e-07, "logits/chosen": -1.4123265743255615, "logits/rejected": -1.3930362462997437, "logps/chosen": -24.757308959960938, "logps/rejected": -50.27605438232422, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 0.1657731980085373, "rewards/margins": 6.393916606903076, "rewards/rejected": -6.22814416885376, "step": 325 }, { "epoch": 5.52542372881356, "grad_norm": 7.344149893164419, "learning_rate": 1.3255942537335804e-07, "logits/chosen": -1.4898028373718262, "logits/rejected": -1.529905915260315, "logps/chosen": -30.63724136352539, "logps/rejected": -47.23683547973633, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": -0.2261982262134552, "rewards/margins": 4.666712760925293, "rewards/rejected": -4.892910957336426, "step": 326 }, { "epoch": 5.5423728813559325, "grad_norm": 5.4257600584522185, "learning_rate": 1.3092741885039085e-07, "logits/chosen": -1.430129051208496, "logits/rejected": -1.2972102165222168, "logps/chosen": -29.192171096801758, "logps/rejected": -61.473453521728516, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -0.42023560404777527, "rewards/margins": 5.649378299713135, "rewards/rejected": -6.069613933563232, "step": 327 }, { "epoch": 5.559322033898305, "grad_norm": 6.750041211234325, "learning_rate": 1.2930194931731382e-07, "logits/chosen": -1.5282580852508545, "logits/rejected": -1.464250087738037, "logps/chosen": -21.08121109008789, "logps/rejected": -36.48762512207031, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": -0.21070019900798798, "rewards/margins": 4.739938735961914, "rewards/rejected": -4.950639247894287, "step": 328 }, { "epoch": 5.576271186440678, "grad_norm": 6.346382606047745, "learning_rate": 1.2768310601110993e-07, "logits/chosen": -1.544804334640503, "logits/rejected": -1.5036790370941162, "logps/chosen": -25.425878524780273, "logps/rejected": -57.694950103759766, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 0.15332584083080292, "rewards/margins": 6.2687883377075195, "rewards/rejected": -6.115462303161621, "step": 329 }, { "epoch": 5.593220338983051, "grad_norm": 5.891747930904324, "learning_rate": 1.260709778049877e-07, "logits/chosen": -1.4886832237243652, "logits/rejected": -1.5345224142074585, "logps/chosen": -24.217357635498047, "logps/rejected": -38.936588287353516, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 0.14570964872837067, "rewards/margins": 3.9201903343200684, "rewards/rejected": -3.7744803428649902, "step": 330 }, { "epoch": 5.610169491525424, "grad_norm": 6.479009923461722, "learning_rate": 1.2446565320350182e-07, "logits/chosen": -1.6119954586029053, "logits/rejected": -1.5358697175979614, "logps/chosen": -23.517757415771484, "logps/rejected": -43.86408233642578, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 0.01591832935810089, "rewards/margins": 5.240238189697266, "rewards/rejected": -5.2243194580078125, "step": 331 }, { "epoch": 5.627118644067797, "grad_norm": 5.354023576639168, "learning_rate": 1.2286722033769492e-07, "logits/chosen": -1.630919337272644, "logits/rejected": -1.538849949836731, "logps/chosen": -28.93423080444336, "logps/rejected": -53.66632080078125, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -0.17018574476242065, "rewards/margins": 5.786593437194824, "rewards/rejected": -5.9567790031433105, "step": 332 }, { "epoch": 5.6440677966101696, "grad_norm": 5.571846024722403, "learning_rate": 1.2127576696025826e-07, "logits/chosen": -1.3449130058288574, "logits/rejected": -1.3209434747695923, "logps/chosen": -28.3823299407959, "logps/rejected": -57.359275817871094, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -0.19171355664730072, "rewards/margins": 6.2648420333862305, "rewards/rejected": -6.4565558433532715, "step": 333 }, { "epoch": 5.661016949152542, "grad_norm": 6.138956177743218, "learning_rate": 1.19691380440715e-07, "logits/chosen": -1.358786702156067, "logits/rejected": -1.3909467458724976, "logps/chosen": -30.639657974243164, "logps/rejected": -48.24367141723633, "loss": 0.0411, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4908216595649719, "rewards/margins": 4.40742301940918, "rewards/rejected": -4.898244857788086, "step": 334 }, { "epoch": 5.677966101694915, "grad_norm": 4.507338496018484, "learning_rate": 1.1811414776062365e-07, "logits/chosen": -1.2691717147827148, "logits/rejected": -1.2181487083435059, "logps/chosen": -32.79166793823242, "logps/rejected": -39.962955474853516, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 0.39959853887557983, "rewards/margins": 4.66073751449585, "rewards/rejected": -4.261138916015625, "step": 335 }, { "epoch": 5.694915254237288, "grad_norm": 6.700687388710383, "learning_rate": 1.1654415550880242e-07, "logits/chosen": -1.4832508563995361, "logits/rejected": -1.4977812767028809, "logps/chosen": -25.554107666015625, "logps/rejected": -42.260276794433594, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -0.07877922058105469, "rewards/margins": 4.69774055480957, "rewards/rejected": -4.776519775390625, "step": 336 }, { "epoch": 5.711864406779661, "grad_norm": 6.852779788605717, "learning_rate": 1.1498148987657549e-07, "logits/chosen": -1.2651898860931396, "logits/rejected": -1.2874916791915894, "logps/chosen": -31.26464080810547, "logps/rejected": -56.77597427368164, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -0.4548006057739258, "rewards/margins": 6.219876289367676, "rewards/rejected": -6.67467737197876, "step": 337 }, { "epoch": 5.728813559322034, "grad_norm": 5.857910583133337, "learning_rate": 1.1342623665304207e-07, "logits/chosen": -1.5274604558944702, "logits/rejected": -1.5151126384735107, "logps/chosen": -26.850894927978516, "logps/rejected": -47.350067138671875, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -0.5475198030471802, "rewards/margins": 4.643810272216797, "rewards/rejected": -5.1913299560546875, "step": 338 }, { "epoch": 5.745762711864407, "grad_norm": 7.203472778094057, "learning_rate": 1.1187848122036562e-07, "logits/chosen": -1.6401658058166504, "logits/rejected": -1.6699111461639404, "logps/chosen": -28.927499771118164, "logps/rejected": -36.704795837402344, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.15964704751968384, "rewards/margins": 4.4911370277404785, "rewards/rejected": -4.3314900398254395, "step": 339 }, { "epoch": 5.762711864406779, "grad_norm": 7.2779221460952, "learning_rate": 1.1033830854908691e-07, "logits/chosen": -1.502543330192566, "logits/rejected": -1.4199045896530151, "logps/chosen": -23.966299057006836, "logps/rejected": -46.33234786987305, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -0.10161617398262024, "rewards/margins": 5.133113861083984, "rewards/rejected": -5.2347307205200195, "step": 340 }, { "epoch": 5.779661016949152, "grad_norm": 7.302109736865858, "learning_rate": 1.0880580319345919e-07, "logits/chosen": -1.4564778804779053, "logits/rejected": -1.5227388143539429, "logps/chosen": -33.69976806640625, "logps/rejected": -42.49582290649414, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -0.1525229811668396, "rewards/margins": 4.674422264099121, "rewards/rejected": -4.8269453048706055, "step": 341 }, { "epoch": 5.796610169491525, "grad_norm": 7.067583835233995, "learning_rate": 1.0728104928680623e-07, "logits/chosen": -1.5719774961471558, "logits/rejected": -1.502136468887329, "logps/chosen": -21.33773422241211, "logps/rejected": -42.372676849365234, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": -0.12551027536392212, "rewards/margins": 5.258235454559326, "rewards/rejected": -5.3837456703186035, "step": 342 }, { "epoch": 5.813559322033898, "grad_norm": 5.957807006157361, "learning_rate": 1.0576413053690326e-07, "logits/chosen": -1.4468128681182861, "logits/rejected": -1.3548585176467896, "logps/chosen": -24.791799545288086, "logps/rejected": -44.49803161621094, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 0.11925530433654785, "rewards/margins": 5.184385299682617, "rewards/rejected": -5.06512975692749, "step": 343 }, { "epoch": 5.830508474576272, "grad_norm": 6.401362630388005, "learning_rate": 1.0425513022138202e-07, "logits/chosen": -1.491408348083496, "logits/rejected": -1.3371365070343018, "logps/chosen": -29.460378646850586, "logps/rejected": -53.63321304321289, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -0.3325144648551941, "rewards/margins": 5.398627758026123, "rewards/rejected": -5.731142520904541, "step": 344 }, { "epoch": 5.847457627118644, "grad_norm": 5.5542840449584405, "learning_rate": 1.0275413118315798e-07, "logits/chosen": -1.3796604871749878, "logits/rejected": -1.377408504486084, "logps/chosen": -26.488426208496094, "logps/rejected": -46.708282470703125, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": 0.1414252519607544, "rewards/margins": 5.361431121826172, "rewards/rejected": -5.220005989074707, "step": 345 }, { "epoch": 5.864406779661017, "grad_norm": 6.360073336005713, "learning_rate": 1.0126121582588315e-07, "logits/chosen": -1.5218093395233154, "logits/rejected": -1.3046934604644775, "logps/chosen": -41.216285705566406, "logps/rejected": -43.98678970336914, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": -0.5423004627227783, "rewards/margins": 4.604074001312256, "rewards/rejected": -5.146374702453613, "step": 346 }, { "epoch": 5.88135593220339, "grad_norm": 5.539790259394075, "learning_rate": 9.977646610942201e-08, "logits/chosen": -1.4420253038406372, "logits/rejected": -1.4948465824127197, "logps/chosen": -37.67629623413086, "logps/rejected": -53.65730667114258, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -0.8092373609542847, "rewards/margins": 5.268472671508789, "rewards/rejected": -6.077709674835205, "step": 347 }, { "epoch": 5.898305084745763, "grad_norm": 6.554303392876224, "learning_rate": 9.829996354535172e-08, "logits/chosen": -1.5454871654510498, "logits/rejected": -1.5333365201950073, "logps/chosen": -21.350933074951172, "logps/rejected": -49.00358200073242, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": -0.24133503437042236, "rewards/margins": 4.911167144775391, "rewards/rejected": -5.152502059936523, "step": 348 }, { "epoch": 5.915254237288136, "grad_norm": 6.546704494284942, "learning_rate": 9.68317891924871e-08, "logits/chosen": -1.5749708414077759, "logits/rejected": -1.433280110359192, "logps/chosen": -34.201988220214844, "logps/rejected": -51.28511428833008, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.3138273358345032, "rewards/margins": 4.917507648468018, "rewards/rejected": -5.231335163116455, "step": 349 }, { "epoch": 5.932203389830509, "grad_norm": 6.294270520592479, "learning_rate": 9.53720236524313e-08, "logits/chosen": -1.5306050777435303, "logits/rejected": -1.442819356918335, "logps/chosen": -39.66340255737305, "logps/rejected": -47.36227798461914, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.3249303102493286, "rewards/margins": 4.536349296569824, "rewards/rejected": -4.861279487609863, "step": 350 }, { "epoch": 5.9491525423728815, "grad_norm": 6.24196436709209, "learning_rate": 9.392074706515002e-08, "logits/chosen": -1.3876663446426392, "logits/rejected": -1.3757574558258057, "logps/chosen": -29.42030143737793, "logps/rejected": -51.045021057128906, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -0.15947160124778748, "rewards/margins": 4.901638031005859, "rewards/rejected": -5.06110954284668, "step": 351 }, { "epoch": 5.966101694915254, "grad_norm": 7.215999575793012, "learning_rate": 9.247803910457225e-08, "logits/chosen": -1.5945814847946167, "logits/rejected": -1.5649042129516602, "logps/chosen": -23.595874786376953, "logps/rejected": -45.547088623046875, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -0.08822253346443176, "rewards/margins": 4.996213912963867, "rewards/rejected": -5.084437370300293, "step": 352 }, { "epoch": 5.983050847457627, "grad_norm": 6.19260019151604, "learning_rate": 9.104397897421623e-08, "logits/chosen": -1.4307911396026611, "logits/rejected": -1.3201934099197388, "logps/chosen": -26.220760345458984, "logps/rejected": -51.94208526611328, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -0.05383840203285217, "rewards/margins": 5.268675804138184, "rewards/rejected": -5.322514057159424, "step": 353 }, { "epoch": 6.0, "grad_norm": 5.881521162946629, "learning_rate": 8.961864540284119e-08, "logits/chosen": -1.46354079246521, "logits/rejected": -1.369706392288208, "logps/chosen": -21.57988739013672, "logps/rejected": -39.70952606201172, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 0.02779284119606018, "rewards/margins": 4.516615867614746, "rewards/rejected": -4.488822937011719, "step": 354 }, { "epoch": 6.016949152542373, "grad_norm": 6.0145355879659625, "learning_rate": 8.82021166401253e-08, "logits/chosen": -1.124272108078003, "logits/rejected": -1.1505348682403564, "logps/chosen": -46.4174919128418, "logps/rejected": -55.31433868408203, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": -0.7474560737609863, "rewards/margins": 5.104282379150391, "rewards/rejected": -5.851738452911377, "step": 355 }, { "epoch": 6.033898305084746, "grad_norm": 5.799187572293291, "learning_rate": 8.679447045236962e-08, "logits/chosen": -1.4048717021942139, "logits/rejected": -1.479861855506897, "logps/chosen": -21.200166702270508, "logps/rejected": -38.02268600463867, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.14473703503608704, "rewards/margins": 5.024458885192871, "rewards/rejected": -5.169196128845215, "step": 356 }, { "epoch": 6.0508474576271185, "grad_norm": 7.041713270895366, "learning_rate": 8.539578411822901e-08, "logits/chosen": -1.3607594966888428, "logits/rejected": -1.3202852010726929, "logps/chosen": -27.951833724975586, "logps/rejected": -46.871917724609375, "loss": 0.0439, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13125565648078918, "rewards/margins": 4.497035026550293, "rewards/rejected": -4.365779399871826, "step": 357 }, { "epoch": 6.067796610169491, "grad_norm": 4.947363488557164, "learning_rate": 8.400613442446947e-08, "logits/chosen": -1.571787714958191, "logits/rejected": -1.4330024719238281, "logps/chosen": -28.946853637695312, "logps/rejected": -45.973785400390625, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -0.589256763458252, "rewards/margins": 5.187601566314697, "rewards/rejected": -5.776858329772949, "step": 358 }, { "epoch": 6.084745762711864, "grad_norm": 5.199584429039793, "learning_rate": 8.262559766175253e-08, "logits/chosen": -1.4506189823150635, "logits/rejected": -1.3975802659988403, "logps/chosen": -24.713668823242188, "logps/rejected": -51.15769577026367, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -0.11659543216228485, "rewards/margins": 6.322598457336426, "rewards/rejected": -6.439194202423096, "step": 359 }, { "epoch": 6.101694915254237, "grad_norm": 5.228661804484342, "learning_rate": 8.125424962044741e-08, "logits/chosen": -1.679395079612732, "logits/rejected": -1.5920395851135254, "logps/chosen": -32.77759552001953, "logps/rejected": -51.247283935546875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -0.6732353568077087, "rewards/margins": 5.932892322540283, "rewards/rejected": -6.6061272621154785, "step": 360 }, { "epoch": 6.11864406779661, "grad_norm": 4.99263433146697, "learning_rate": 7.989216558646941e-08, "logits/chosen": -1.5995450019836426, "logits/rejected": -1.5377026796340942, "logps/chosen": -30.13187026977539, "logps/rejected": -47.109947204589844, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -0.2468140721321106, "rewards/margins": 5.169532299041748, "rewards/rejected": -5.416346073150635, "step": 361 }, { "epoch": 6.135593220338983, "grad_norm": 5.703896702738386, "learning_rate": 7.853942033714736e-08, "logits/chosen": -1.5143556594848633, "logits/rejected": -1.3968244791030884, "logps/chosen": -37.41975402832031, "logps/rejected": -56.349769592285156, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.31496256589889526, "rewards/margins": 5.113517761230469, "rewards/rejected": -5.428481101989746, "step": 362 }, { "epoch": 6.1525423728813555, "grad_norm": 5.916237621370921, "learning_rate": 7.719608813711847e-08, "logits/chosen": -1.5250320434570312, "logits/rejected": -1.475509762763977, "logps/chosen": -26.13006591796875, "logps/rejected": -35.852359771728516, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 0.13443481922149658, "rewards/margins": 4.1385908126831055, "rewards/rejected": -4.00415563583374, "step": 363 }, { "epoch": 6.169491525423728, "grad_norm": 6.510656924645236, "learning_rate": 7.586224273425081e-08, "logits/chosen": -1.3499259948730469, "logits/rejected": -1.244497299194336, "logps/chosen": -34.813541412353516, "logps/rejected": -47.603271484375, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 0.11887124180793762, "rewards/margins": 4.864650726318359, "rewards/rejected": -4.745779514312744, "step": 364 }, { "epoch": 6.186440677966102, "grad_norm": 5.630103461727817, "learning_rate": 7.45379573555947e-08, "logits/chosen": -1.4300577640533447, "logits/rejected": -1.4099435806274414, "logps/chosen": -34.18559646606445, "logps/rejected": -45.739933013916016, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.07826349139213562, "rewards/margins": 4.771849155426025, "rewards/rejected": -4.8501129150390625, "step": 365 }, { "epoch": 6.203389830508475, "grad_norm": 5.238611793677947, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.5490187406539917, "logits/rejected": -1.6131647825241089, "logps/chosen": -29.01973533630371, "logps/rejected": -48.46769332885742, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -0.043659090995788574, "rewards/margins": 4.948011875152588, "rewards/rejected": -4.991670608520508, "step": 366 }, { "epoch": 6.220338983050848, "grad_norm": 7.178172383273588, "learning_rate": 7.19183569509398e-08, "logits/chosen": -1.4348180294036865, "logits/rejected": -1.4342725276947021, "logps/chosen": -26.032867431640625, "logps/rejected": -35.465248107910156, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 0.11982511729001999, "rewards/margins": 4.798130035400391, "rewards/rejected": -4.678304672241211, "step": 367 }, { "epoch": 6.237288135593221, "grad_norm": 5.80114325638975, "learning_rate": 7.062318573891715e-08, "logits/chosen": -1.3932957649230957, "logits/rejected": -1.3466596603393555, "logps/chosen": -26.193689346313477, "logps/rejected": -43.341983795166016, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 0.26808837056159973, "rewards/margins": 5.052790641784668, "rewards/rejected": -4.784702301025391, "step": 368 }, { "epoch": 6.254237288135593, "grad_norm": 5.8991450258769875, "learning_rate": 6.933786217116364e-08, "logits/chosen": -1.5599933862686157, "logits/rejected": -1.4719116687774658, "logps/chosen": -26.387096405029297, "logps/rejected": -41.33942794799805, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 0.40677589178085327, "rewards/margins": 4.713696479797363, "rewards/rejected": -4.306921005249023, "step": 369 }, { "epoch": 6.271186440677966, "grad_norm": 5.827867650343381, "learning_rate": 6.806245681091944e-08, "logits/chosen": -1.4101459980010986, "logits/rejected": -1.3021355867385864, "logps/chosen": -27.591569900512695, "logps/rejected": -47.71153259277344, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 0.1557801067829132, "rewards/margins": 5.925251483917236, "rewards/rejected": -5.769471168518066, "step": 370 }, { "epoch": 6.288135593220339, "grad_norm": 5.390338841256569, "learning_rate": 6.679703967692321e-08, "logits/chosen": -1.584707498550415, "logits/rejected": -1.4573631286621094, "logps/chosen": -21.927671432495117, "logps/rejected": -49.91468048095703, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 0.393250048160553, "rewards/margins": 5.488755226135254, "rewards/rejected": -5.095505714416504, "step": 371 }, { "epoch": 6.305084745762712, "grad_norm": 4.571609396535961, "learning_rate": 6.554168023956816e-08, "logits/chosen": -1.4562060832977295, "logits/rejected": -1.3005712032318115, "logps/chosen": -25.927963256835938, "logps/rejected": -44.458621978759766, "loss": 0.0284, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2479822039604187, "rewards/margins": 4.508713722229004, "rewards/rejected": -4.756695747375488, "step": 372 }, { "epoch": 6.322033898305085, "grad_norm": 6.286732270647367, "learning_rate": 6.429644741708779e-08, "logits/chosen": -1.4326915740966797, "logits/rejected": -1.3545727729797363, "logps/chosen": -25.131105422973633, "logps/rejected": -40.29548263549805, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 0.04497765004634857, "rewards/margins": 4.998070240020752, "rewards/rejected": -4.953092575073242, "step": 373 }, { "epoch": 6.338983050847458, "grad_norm": 6.303722387593113, "learning_rate": 6.306140957177225e-08, "logits/chosen": -1.452634334564209, "logits/rejected": -1.427751898765564, "logps/chosen": -26.491840362548828, "logps/rejected": -47.47591781616211, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -0.015405148267745972, "rewards/margins": 5.423872947692871, "rewards/rejected": -5.439278602600098, "step": 374 }, { "epoch": 6.3559322033898304, "grad_norm": 5.766682311294022, "learning_rate": 6.183663450621607e-08, "logits/chosen": -1.506758213043213, "logits/rejected": -1.4310308694839478, "logps/chosen": -38.03307342529297, "logps/rejected": -46.39859390258789, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -0.16017396748065948, "rewards/margins": 4.915417194366455, "rewards/rejected": -5.075592041015625, "step": 375 }, { "epoch": 6.372881355932203, "grad_norm": 4.912000082420141, "learning_rate": 6.062218945959496e-08, "logits/chosen": -1.4634639024734497, "logits/rejected": -1.5328123569488525, "logps/chosen": -34.974857330322266, "logps/rejected": -43.447689056396484, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -0.06525677442550659, "rewards/margins": 5.0226850509643555, "rewards/rejected": -5.087942123413086, "step": 376 }, { "epoch": 6.389830508474576, "grad_norm": 4.923323155963838, "learning_rate": 5.9418141103975026e-08, "logits/chosen": -1.4239155054092407, "logits/rejected": -1.4835269451141357, "logps/chosen": -30.768068313598633, "logps/rejected": -60.89370346069336, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -0.3536137342453003, "rewards/margins": 6.844842910766602, "rewards/rejected": -7.198456764221191, "step": 377 }, { "epoch": 6.406779661016949, "grad_norm": 6.31358841065967, "learning_rate": 5.822455554065217e-08, "logits/chosen": -1.3272855281829834, "logits/rejected": -1.2906978130340576, "logps/chosen": -25.13174819946289, "logps/rejected": -41.51945877075195, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 0.2586425840854645, "rewards/margins": 5.224736213684082, "rewards/rejected": -4.966093063354492, "step": 378 }, { "epoch": 6.423728813559322, "grad_norm": 5.9217345486120365, "learning_rate": 5.704149829652341e-08, "logits/chosen": -1.5300796031951904, "logits/rejected": -1.4160737991333008, "logps/chosen": -29.946929931640625, "logps/rejected": -49.227298736572266, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.1305888444185257, "rewards/margins": 5.067564487457275, "rewards/rejected": -5.198153495788574, "step": 379 }, { "epoch": 6.440677966101695, "grad_norm": 5.394022162647855, "learning_rate": 5.586903432048942e-08, "logits/chosen": -1.5190213918685913, "logits/rejected": -1.4037725925445557, "logps/chosen": -31.318918228149414, "logps/rejected": -49.82917404174805, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -0.9247037768363953, "rewards/margins": 5.2036919593811035, "rewards/rejected": -6.128396034240723, "step": 380 }, { "epoch": 6.4576271186440675, "grad_norm": 5.307172264281174, "learning_rate": 5.470722797988883e-08, "logits/chosen": -1.3760260343551636, "logits/rejected": -1.3698216676712036, "logps/chosen": -24.693878173828125, "logps/rejected": -37.211387634277344, "loss": 0.0353, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05154569447040558, "rewards/margins": 4.664517879486084, "rewards/rejected": -4.612972736358643, "step": 381 }, { "epoch": 6.47457627118644, "grad_norm": 5.039925613354933, "learning_rate": 5.355614305696468e-08, "logits/chosen": -1.56493079662323, "logits/rejected": -1.5143284797668457, "logps/chosen": -27.657060623168945, "logps/rejected": -43.6398811340332, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -0.09710407257080078, "rewards/margins": 4.8609700202941895, "rewards/rejected": -4.958074569702148, "step": 382 }, { "epoch": 6.491525423728813, "grad_norm": 6.304781751843407, "learning_rate": 5.241584274536259e-08, "logits/chosen": -1.3160762786865234, "logits/rejected": -1.426667332649231, "logps/chosen": -31.549245834350586, "logps/rejected": -49.771148681640625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -0.3802323043346405, "rewards/margins": 5.496129035949707, "rewards/rejected": -5.876360893249512, "step": 383 }, { "epoch": 6.508474576271187, "grad_norm": 5.063738993506685, "learning_rate": 5.1286389646661654e-08, "logits/chosen": -1.396496057510376, "logits/rejected": -1.3829492330551147, "logps/chosen": -28.830703735351562, "logps/rejected": -47.824317932128906, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -0.2612743079662323, "rewards/margins": 4.95106315612793, "rewards/rejected": -5.212337017059326, "step": 384 }, { "epoch": 6.52542372881356, "grad_norm": 5.637690264352778, "learning_rate": 5.0167845766937806e-08, "logits/chosen": -1.4855574369430542, "logits/rejected": -1.3984534740447998, "logps/chosen": -27.114734649658203, "logps/rejected": -43.69512939453125, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": -0.21004855632781982, "rewards/margins": 4.845144748687744, "rewards/rejected": -5.055192947387695, "step": 385 }, { "epoch": 6.5423728813559325, "grad_norm": 6.025392312759281, "learning_rate": 4.906027251335917e-08, "logits/chosen": -1.4550882577896118, "logits/rejected": -1.4542597532272339, "logps/chosen": -23.857240676879883, "logps/rejected": -47.817237854003906, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.31768307089805603, "rewards/margins": 4.948706150054932, "rewards/rejected": -5.266389846801758, "step": 386 }, { "epoch": 6.559322033898305, "grad_norm": 5.587911152077424, "learning_rate": 4.7963730690815467e-08, "logits/chosen": -1.4407036304473877, "logits/rejected": -1.3747568130493164, "logps/chosen": -19.481971740722656, "logps/rejected": -42.736568450927734, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.19361047446727753, "rewards/margins": 5.198715686798096, "rewards/rejected": -5.392325401306152, "step": 387 }, { "epoch": 6.576271186440678, "grad_norm": 6.586187305125031, "learning_rate": 4.687828049857967e-08, "logits/chosen": -1.3945553302764893, "logits/rejected": -1.322791337966919, "logps/chosen": -29.22484016418457, "logps/rejected": -40.85685348510742, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.17647796869277954, "rewards/margins": 4.56741189956665, "rewards/rejected": -4.390933990478516, "step": 388 }, { "epoch": 6.593220338983051, "grad_norm": 6.218480481228633, "learning_rate": 4.580398152700304e-08, "logits/chosen": -1.6018644571304321, "logits/rejected": -1.4887254238128662, "logps/chosen": -25.33257484436035, "logps/rejected": -45.91154479980469, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.20426103472709656, "rewards/margins": 5.0216498374938965, "rewards/rejected": -5.225910186767578, "step": 389 }, { "epoch": 6.610169491525424, "grad_norm": 6.569813291973876, "learning_rate": 4.47408927542435e-08, "logits/chosen": -1.3770880699157715, "logits/rejected": -1.286376714706421, "logps/chosen": -23.488061904907227, "logps/rejected": -38.757720947265625, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -0.2643284499645233, "rewards/margins": 3.8822245597839355, "rewards/rejected": -4.146553039550781, "step": 390 }, { "epoch": 6.627118644067797, "grad_norm": 5.39352903238084, "learning_rate": 4.368907254302837e-08, "logits/chosen": -1.5808112621307373, "logits/rejected": -1.5275750160217285, "logps/chosen": -20.95781707763672, "logps/rejected": -38.6187744140625, "loss": 0.0366, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05846305191516876, "rewards/margins": 4.244011878967285, "rewards/rejected": -4.302474498748779, "step": 391 }, { "epoch": 6.6440677966101696, "grad_norm": 5.323944978561868, "learning_rate": 4.264857863744956e-08, "logits/chosen": -1.5523847341537476, "logits/rejected": -1.39801824092865, "logps/chosen": -22.804271697998047, "logps/rejected": -38.31208419799805, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 0.4400111436843872, "rewards/margins": 5.238194465637207, "rewards/rejected": -4.798183917999268, "step": 392 }, { "epoch": 6.661016949152542, "grad_norm": 7.086981471555236, "learning_rate": 4.161946815979403e-08, "logits/chosen": -1.4906023740768433, "logits/rejected": -1.4377923011779785, "logps/chosen": -37.56535720825195, "logps/rejected": -52.383487701416016, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": -0.3086543679237366, "rewards/margins": 5.0754618644714355, "rewards/rejected": -5.384116172790527, "step": 393 }, { "epoch": 6.677966101694915, "grad_norm": 5.2671704490811075, "learning_rate": 4.0601797607407505e-08, "logits/chosen": -1.4550049304962158, "logits/rejected": -1.3966783285140991, "logps/chosen": -22.132400512695312, "logps/rejected": -41.57554626464844, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -0.2699892520904541, "rewards/margins": 4.588739395141602, "rewards/rejected": -4.858728408813477, "step": 394 }, { "epoch": 6.694915254237288, "grad_norm": 5.32453491166638, "learning_rate": 3.9595622849593e-08, "logits/chosen": -1.512937307357788, "logits/rejected": -1.3621115684509277, "logps/chosen": -28.885116577148438, "logps/rejected": -50.2530517578125, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": -0.5175858736038208, "rewards/margins": 5.390725135803223, "rewards/rejected": -5.908310890197754, "step": 395 }, { "epoch": 6.711864406779661, "grad_norm": 5.792264114569628, "learning_rate": 3.8600999124543455e-08, "logits/chosen": -1.5662391185760498, "logits/rejected": -1.42247474193573, "logps/chosen": -25.263622283935547, "logps/rejected": -45.11002731323242, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 0.27481862902641296, "rewards/margins": 5.28562593460083, "rewards/rejected": -5.010807037353516, "step": 396 }, { "epoch": 6.728813559322034, "grad_norm": 5.997208149184034, "learning_rate": 3.7617981036309533e-08, "logits/chosen": -1.5715502500534058, "logits/rejected": -1.6154096126556396, "logps/chosen": -23.581790924072266, "logps/rejected": -45.5838623046875, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": -0.18038499355316162, "rewards/margins": 4.884888172149658, "rewards/rejected": -5.065273284912109, "step": 397 }, { "epoch": 6.745762711864407, "grad_norm": 5.245999983483119, "learning_rate": 3.664662255180134e-08, "logits/chosen": -1.4324719905853271, "logits/rejected": -1.329777717590332, "logps/chosen": -26.615516662597656, "logps/rejected": -41.86264419555664, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 0.05792872607707977, "rewards/margins": 4.163052082061768, "rewards/rejected": -4.105123519897461, "step": 398 }, { "epoch": 6.762711864406779, "grad_norm": 4.518676670270046, "learning_rate": 3.5686976997826245e-08, "logits/chosen": -1.6349799633026123, "logits/rejected": -1.6590909957885742, "logps/chosen": -41.426727294921875, "logps/rejected": -52.470558166503906, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": -0.4126446843147278, "rewards/margins": 5.763060569763184, "rewards/rejected": -6.175705432891846, "step": 399 }, { "epoch": 6.779661016949152, "grad_norm": 5.721240252343364, "learning_rate": 3.473909705816111e-08, "logits/chosen": -1.4431383609771729, "logits/rejected": -1.3899328708648682, "logps/chosen": -36.3721923828125, "logps/rejected": -52.30086898803711, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -1.106335163116455, "rewards/margins": 5.6671295166015625, "rewards/rejected": -6.773464679718018, "step": 400 }, { "epoch": 6.796610169491525, "grad_norm": 5.164348258065598, "learning_rate": 3.3803034770659824e-08, "logits/chosen": -1.6824212074279785, "logits/rejected": -1.5971221923828125, "logps/chosen": -37.007022857666016, "logps/rejected": -74.48016357421875, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -0.6765580177307129, "rewards/margins": 7.421613693237305, "rewards/rejected": -8.09817123413086, "step": 401 }, { "epoch": 6.813559322033898, "grad_norm": 4.808650804090459, "learning_rate": 3.287884152439646e-08, "logits/chosen": -1.3810476064682007, "logits/rejected": -1.4028950929641724, "logps/chosen": -29.925891876220703, "logps/rejected": -49.08871841430664, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 0.16381314396858215, "rewards/margins": 6.113283157348633, "rewards/rejected": -5.949470520019531, "step": 402 }, { "epoch": 6.830508474576272, "grad_norm": 6.236701718893695, "learning_rate": 3.19665680568445e-08, "logits/chosen": -1.5909409523010254, "logits/rejected": -1.5455948114395142, "logps/chosen": -33.62629699707031, "logps/rejected": -39.20241165161133, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -0.20260879397392273, "rewards/margins": 4.192948341369629, "rewards/rejected": -4.395556926727295, "step": 403 }, { "epoch": 6.847457627118644, "grad_norm": 4.563195691909389, "learning_rate": 3.106626445109081e-08, "logits/chosen": -1.4258350133895874, "logits/rejected": -1.471103549003601, "logps/chosen": -32.27821731567383, "logps/rejected": -55.883445739746094, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -0.37013059854507446, "rewards/margins": 5.775850296020508, "rewards/rejected": -6.1459808349609375, "step": 404 }, { "epoch": 6.864406779661017, "grad_norm": 6.319828827539619, "learning_rate": 3.017798013308645e-08, "logits/chosen": -1.6110063791275024, "logits/rejected": -1.5970886945724487, "logps/chosen": -33.942317962646484, "logps/rejected": -42.78126525878906, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 0.08474293351173401, "rewards/margins": 4.63005256652832, "rewards/rejected": -4.545310020446777, "step": 405 }, { "epoch": 6.88135593220339, "grad_norm": 6.956985946783924, "learning_rate": 2.9301763868933153e-08, "logits/chosen": -1.4388988018035889, "logits/rejected": -1.4342849254608154, "logps/chosen": -23.345102310180664, "logps/rejected": -40.68294906616211, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.11982996761798859, "rewards/margins": 4.858503818511963, "rewards/rejected": -4.978332996368408, "step": 406 }, { "epoch": 6.898305084745763, "grad_norm": 5.4291102026527716, "learning_rate": 2.843766376220616e-08, "logits/chosen": -1.7093875408172607, "logits/rejected": -1.6210181713104248, "logps/chosen": -29.290470123291016, "logps/rejected": -50.43269729614258, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.7307620048522949, "rewards/margins": 5.429152488708496, "rewards/rejected": -6.159914016723633, "step": 407 }, { "epoch": 6.915254237288136, "grad_norm": 5.393300173055395, "learning_rate": 2.7585727251313195e-08, "logits/chosen": -1.3670405149459839, "logits/rejected": -1.1494945287704468, "logps/chosen": -38.649288177490234, "logps/rejected": -55.98222351074219, "loss": 0.0388, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4593397378921509, "rewards/margins": 5.185694694519043, "rewards/rejected": -5.645034313201904, "step": 408 }, { "epoch": 6.932203389830509, "grad_norm": 4.837513841498393, "learning_rate": 2.6746001106890377e-08, "logits/chosen": -1.656294584274292, "logits/rejected": -1.5977482795715332, "logps/chosen": -29.096288681030273, "logps/rejected": -46.76082229614258, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -0.21669504046440125, "rewards/margins": 5.154686450958252, "rewards/rejected": -5.371381759643555, "step": 409 }, { "epoch": 6.9491525423728815, "grad_norm": 5.248560918966433, "learning_rate": 2.5918531429234364e-08, "logits/chosen": -1.592848300933838, "logits/rejected": -1.540168285369873, "logps/chosen": -31.085020065307617, "logps/rejected": -53.143741607666016, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": -0.6363227963447571, "rewards/margins": 5.588076591491699, "rewards/rejected": -6.224399089813232, "step": 410 }, { "epoch": 6.966101694915254, "grad_norm": 5.2195974344109715, "learning_rate": 2.5103363645771536e-08, "logits/chosen": -1.65935218334198, "logits/rejected": -1.6805386543273926, "logps/chosen": -34.833702087402344, "logps/rejected": -48.904422760009766, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.1815856397151947, "rewards/margins": 5.786221981048584, "rewards/rejected": -5.967808246612549, "step": 411 }, { "epoch": 6.983050847457627, "grad_norm": 5.470555151396108, "learning_rate": 2.4300542508564114e-08, "logits/chosen": -1.4322288036346436, "logits/rejected": -1.3301326036453247, "logps/chosen": -28.12655258178711, "logps/rejected": -46.921104431152344, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -0.31704282760620117, "rewards/margins": 4.567241191864014, "rewards/rejected": -4.884284019470215, "step": 412 }, { "epoch": 7.0, "grad_norm": 5.578953824776314, "learning_rate": 2.3510112091853357e-08, "logits/chosen": -1.276556134223938, "logits/rejected": -1.2297403812408447, "logps/chosen": -21.443805694580078, "logps/rejected": -48.807159423828125, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 0.017134033143520355, "rewards/margins": 5.347229957580566, "rewards/rejected": -5.330096244812012, "step": 413 }, { "epoch": 7.016949152542373, "grad_norm": 6.999197685418954, "learning_rate": 2.27321157896396e-08, "logits/chosen": -1.5116349458694458, "logits/rejected": -1.3849687576293945, "logps/chosen": -27.718652725219727, "logps/rejected": -47.36982345581055, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 0.1311918944120407, "rewards/margins": 5.171109676361084, "rewards/rejected": -5.039917469024658, "step": 414 }, { "epoch": 7.033898305084746, "grad_norm": 4.883886921689459, "learning_rate": 2.1966596313300362e-08, "logits/chosen": -1.6032100915908813, "logits/rejected": -1.518362045288086, "logps/chosen": -29.143009185791016, "logps/rejected": -40.435001373291016, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": -0.24970372021198273, "rewards/margins": 4.1580705642700195, "rewards/rejected": -4.407774448394775, "step": 415 }, { "epoch": 7.0508474576271185, "grad_norm": 5.855476388920514, "learning_rate": 2.1213595689245384e-08, "logits/chosen": -1.350822925567627, "logits/rejected": -1.3723689317703247, "logps/chosen": -23.788740158081055, "logps/rejected": -40.06031799316406, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -0.003228999674320221, "rewards/margins": 4.06948184967041, "rewards/rejected": -4.0727105140686035, "step": 416 }, { "epoch": 7.067796610169491, "grad_norm": 5.942075006867736, "learning_rate": 2.0473155256609363e-08, "logits/chosen": -1.6869771480560303, "logits/rejected": -1.6048226356506348, "logps/chosen": -27.607175827026367, "logps/rejected": -46.171112060546875, "loss": 0.0478, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15510573983192444, "rewards/margins": 4.710604667663574, "rewards/rejected": -4.865710258483887, "step": 417 }, { "epoch": 7.084745762711864, "grad_norm": 5.9322015599453595, "learning_rate": 1.9745315664982277e-08, "logits/chosen": -1.5960508584976196, "logits/rejected": -1.4311569929122925, "logps/chosen": -20.32171630859375, "logps/rejected": -41.58867645263672, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 0.00745101273059845, "rewards/margins": 5.94549560546875, "rewards/rejected": -5.938044548034668, "step": 418 }, { "epoch": 7.101694915254237, "grad_norm": 5.918275937210799, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -1.699689269065857, "logits/rejected": -1.5470737218856812, "logps/chosen": -28.80027198791504, "logps/rejected": -46.22833251953125, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -0.4729538857936859, "rewards/margins": 4.527037143707275, "rewards/rejected": -4.999991416931152, "step": 419 }, { "epoch": 7.11864406779661, "grad_norm": 6.2740518918594015, "learning_rate": 1.8327598142041656e-08, "logits/chosen": -1.4966247081756592, "logits/rejected": -1.5006301403045654, "logps/chosen": -39.597660064697266, "logps/rejected": -59.670684814453125, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 0.10089424252510071, "rewards/margins": 5.958191871643066, "rewards/rejected": -5.857297420501709, "step": 420 }, { "epoch": 7.135593220338983, "grad_norm": 5.848230208495223, "learning_rate": 1.7637798042291125e-08, "logits/chosen": -1.4466344118118286, "logits/rejected": -1.4158880710601807, "logps/chosen": -34.30101013183594, "logps/rejected": -41.198646545410156, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -0.5382071733474731, "rewards/margins": 4.64283561706543, "rewards/rejected": -5.1810431480407715, "step": 421 }, { "epoch": 7.1525423728813555, "grad_norm": 4.83673555496754, "learning_rate": 1.696075444240305e-08, "logits/chosen": -1.481490135192871, "logits/rejected": -1.3944220542907715, "logps/chosen": -23.669601440429688, "logps/rejected": -44.325191497802734, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -0.12855800986289978, "rewards/margins": 4.879583835601807, "rewards/rejected": -5.008142471313477, "step": 422 }, { "epoch": 7.169491525423728, "grad_norm": 5.610847549034329, "learning_rate": 1.6296504511531834e-08, "logits/chosen": -1.418038249015808, "logits/rejected": -1.3723573684692383, "logps/chosen": -27.353422164916992, "logps/rejected": -49.684974670410156, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -0.7425976395606995, "rewards/margins": 4.4584550857543945, "rewards/rejected": -5.201053142547607, "step": 423 }, { "epoch": 7.186440677966102, "grad_norm": 5.7865923023012975, "learning_rate": 1.5645084716469776e-08, "logits/chosen": -1.6516090631484985, "logits/rejected": -1.5412788391113281, "logps/chosen": -33.868499755859375, "logps/rejected": -49.333709716796875, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -0.2233295440673828, "rewards/margins": 5.964905261993408, "rewards/rejected": -6.188235282897949, "step": 424 }, { "epoch": 7.203389830508475, "grad_norm": 5.701505367137342, "learning_rate": 1.5006530819644923e-08, "logits/chosen": -1.416776418685913, "logits/rejected": -1.5832912921905518, "logps/chosen": -32.08591842651367, "logps/rejected": -47.3320198059082, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -0.342715859413147, "rewards/margins": 5.240063667297363, "rewards/rejected": -5.582779884338379, "step": 425 }, { "epoch": 7.220338983050848, "grad_norm": 4.437542789195646, "learning_rate": 1.4380877877157832e-08, "logits/chosen": -1.4847077131271362, "logits/rejected": -1.4457132816314697, "logps/chosen": -32.51253128051758, "logps/rejected": -53.59151077270508, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -0.6102309226989746, "rewards/margins": 5.707812786102295, "rewards/rejected": -6.3180437088012695, "step": 426 }, { "epoch": 7.237288135593221, "grad_norm": 5.065631167693018, "learning_rate": 1.3768160236856674e-08, "logits/chosen": -1.4158098697662354, "logits/rejected": -1.4317882061004639, "logps/chosen": -29.830453872680664, "logps/rejected": -54.25461196899414, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -0.038821518421173096, "rewards/margins": 5.01283597946167, "rewards/rejected": -5.051657676696777, "step": 427 }, { "epoch": 7.254237288135593, "grad_norm": 5.220392921602965, "learning_rate": 1.316841153645215e-08, "logits/chosen": -1.560931921005249, "logits/rejected": -1.5064420700073242, "logps/chosen": -30.01360321044922, "logps/rejected": -48.26612854003906, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -0.3863828778266907, "rewards/margins": 5.301400184631348, "rewards/rejected": -5.687783241271973, "step": 428 }, { "epoch": 7.271186440677966, "grad_norm": 5.688311376504444, "learning_rate": 1.2581664701670296e-08, "logits/chosen": -1.52787446975708, "logits/rejected": -1.4021316766738892, "logps/chosen": -28.248119354248047, "logps/rejected": -41.31006622314453, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -0.3207528591156006, "rewards/margins": 5.154188632965088, "rewards/rejected": -5.474941253662109, "step": 429 }, { "epoch": 7.288135593220339, "grad_norm": 5.800049450685864, "learning_rate": 1.2007951944445121e-08, "logits/chosen": -1.4032049179077148, "logits/rejected": -1.3456979990005493, "logps/chosen": -24.016460418701172, "logps/rejected": -41.04709243774414, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.04568904638290405, "rewards/margins": 3.968392848968506, "rewards/rejected": -4.0140814781188965, "step": 430 }, { "epoch": 7.305084745762712, "grad_norm": 6.591249552173131, "learning_rate": 1.144730476115019e-08, "logits/chosen": -1.4663312435150146, "logits/rejected": -1.437060832977295, "logps/chosen": -25.586219787597656, "logps/rejected": -59.44462585449219, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -0.418854296207428, "rewards/margins": 6.551999092102051, "rewards/rejected": -6.970853328704834, "step": 431 }, { "epoch": 7.322033898305085, "grad_norm": 5.1436479741963765, "learning_rate": 1.0899753930869394e-08, "logits/chosen": -1.479379415512085, "logits/rejected": -1.4856846332550049, "logps/chosen": -24.105491638183594, "logps/rejected": -42.0874137878418, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -0.016550958156585693, "rewards/margins": 4.623503684997559, "rewards/rejected": -4.640054702758789, "step": 432 }, { "epoch": 7.338983050847458, "grad_norm": 6.13135090791532, "learning_rate": 1.036532951370736e-08, "logits/chosen": -1.5114121437072754, "logits/rejected": -1.487284541130066, "logps/chosen": -30.636520385742188, "logps/rejected": -55.582489013671875, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 0.0013150721788406372, "rewards/margins": 6.1965837478637695, "rewards/rejected": -6.1952691078186035, "step": 433 }, { "epoch": 7.3559322033898304, "grad_norm": 5.332642447483903, "learning_rate": 9.844060849138997e-09, "logits/chosen": -1.599272608757019, "logits/rejected": -1.610948085784912, "logps/chosen": -24.916534423828125, "logps/rejected": -39.19096755981445, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -0.01982739567756653, "rewards/margins": 4.651766777038574, "rewards/rejected": -4.671594142913818, "step": 434 }, { "epoch": 7.372881355932203, "grad_norm": 6.134889313671512, "learning_rate": 9.335976554398912e-09, "logits/chosen": -1.540908932685852, "logits/rejected": -1.5109443664550781, "logps/chosen": -32.279666900634766, "logps/rejected": -38.13484191894531, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.6547858715057373, "rewards/margins": 3.962502956390381, "rewards/rejected": -4.617289066314697, "step": 435 }, { "epoch": 7.389830508474576, "grad_norm": 5.165828974962686, "learning_rate": 8.841104522910342e-09, "logits/chosen": -1.5303874015808105, "logits/rejected": -1.4300156831741333, "logps/chosen": -33.03150939941406, "logps/rejected": -49.22145080566406, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -0.05898144841194153, "rewards/margins": 5.506203651428223, "rewards/rejected": -5.565184593200684, "step": 436 }, { "epoch": 7.406779661016949, "grad_norm": 5.59136345566115, "learning_rate": 8.359471922753714e-09, "logits/chosen": -1.5242373943328857, "logits/rejected": -1.3456315994262695, "logps/chosen": -29.918407440185547, "logps/rejected": -53.36083984375, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 0.07045301795005798, "rewards/margins": 5.991230487823486, "rewards/rejected": -5.920777320861816, "step": 437 }, { "epoch": 7.423728813559322, "grad_norm": 5.700922848308937, "learning_rate": 7.891105195175356e-09, "logits/chosen": -1.421931505203247, "logits/rejected": -1.4313148260116577, "logps/chosen": -30.72463607788086, "logps/rejected": -39.33094787597656, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.35566258430480957, "rewards/margins": 4.320624351501465, "rewards/rejected": -4.676286697387695, "step": 438 }, { "epoch": 7.440677966101695, "grad_norm": 4.895447924779327, "learning_rate": 7.4360300531355894e-09, "logits/chosen": -1.2597088813781738, "logits/rejected": -1.1974666118621826, "logps/chosen": -34.747318267822266, "logps/rejected": -60.813480377197266, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": -0.4029719829559326, "rewards/margins": 5.782626628875732, "rewards/rejected": -6.185598373413086, "step": 439 }, { "epoch": 7.4576271186440675, "grad_norm": 4.753436790014126, "learning_rate": 6.994271479897313e-09, "logits/chosen": -1.2228598594665527, "logits/rejected": -1.2148244380950928, "logps/chosen": -23.159412384033203, "logps/rejected": -39.971435546875, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 0.20260876417160034, "rewards/margins": 4.695430278778076, "rewards/rejected": -4.49282169342041, "step": 440 }, { "epoch": 7.47457627118644, "grad_norm": 5.465778826325062, "learning_rate": 6.565853727654502e-09, "logits/chosen": -1.6256568431854248, "logits/rejected": -1.6748077869415283, "logps/chosen": -35.98029327392578, "logps/rejected": -51.9576530456543, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.7835733890533447, "rewards/margins": 5.101205825805664, "rewards/rejected": -5.884779453277588, "step": 441 }, { "epoch": 7.491525423728813, "grad_norm": 5.19068823671553, "learning_rate": 6.150800316200605e-09, "logits/chosen": -1.5622632503509521, "logits/rejected": -1.6051169633865356, "logps/chosen": -26.639972686767578, "logps/rejected": -39.97083282470703, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 0.18523569405078888, "rewards/margins": 5.357626914978027, "rewards/rejected": -5.172390937805176, "step": 442 }, { "epoch": 7.508474576271187, "grad_norm": 5.290094026909458, "learning_rate": 5.7491340316373485e-09, "logits/chosen": -1.3767602443695068, "logits/rejected": -1.2797472476959229, "logps/chosen": -26.007776260375977, "logps/rejected": -53.1204719543457, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -0.03208550810813904, "rewards/margins": 6.148884296417236, "rewards/rejected": -6.18096923828125, "step": 443 }, { "epoch": 7.52542372881356, "grad_norm": 5.384685543036387, "learning_rate": 5.360876925123992e-09, "logits/chosen": -1.660988450050354, "logits/rejected": -1.495483160018921, "logps/chosen": -38.71052551269531, "logps/rejected": -63.9692497253418, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -0.767485499382019, "rewards/margins": 6.337566375732422, "rewards/rejected": -7.105052947998047, "step": 444 }, { "epoch": 7.5423728813559325, "grad_norm": 6.204818072148143, "learning_rate": 4.9860503116665176e-09, "logits/chosen": -1.7063543796539307, "logits/rejected": -1.692810297012329, "logps/chosen": -25.562942504882812, "logps/rejected": -48.51639175415039, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.01574818789958954, "rewards/margins": 4.9537529945373535, "rewards/rejected": -4.969500541687012, "step": 445 }, { "epoch": 7.559322033898305, "grad_norm": 5.655663554463141, "learning_rate": 4.624674768947484e-09, "logits/chosen": -1.598649024963379, "logits/rejected": -1.4650629758834839, "logps/chosen": -27.018341064453125, "logps/rejected": -46.08781051635742, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027409642934799194, "rewards/margins": 4.6786322593688965, "rewards/rejected": -4.681373596191406, "step": 446 }, { "epoch": 7.576271186440678, "grad_norm": 5.701136400159739, "learning_rate": 4.2767701361964835e-09, "logits/chosen": -1.3610193729400635, "logits/rejected": -1.3563426733016968, "logps/chosen": -35.585716247558594, "logps/rejected": -50.32651901245117, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -0.6664023995399475, "rewards/margins": 4.827932834625244, "rewards/rejected": -5.494334697723389, "step": 447 }, { "epoch": 7.593220338983051, "grad_norm": 6.209794879281624, "learning_rate": 3.942355513100792e-09, "logits/chosen": -1.4733314514160156, "logits/rejected": -1.4402073621749878, "logps/chosen": -28.033966064453125, "logps/rejected": -54.08881378173828, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -0.519957423210144, "rewards/margins": 5.857639312744141, "rewards/rejected": -6.377596855163574, "step": 448 }, { "epoch": 7.610169491525424, "grad_norm": 4.997367172232428, "learning_rate": 3.6214492587569313e-09, "logits/chosen": -1.5576658248901367, "logits/rejected": -1.6168861389160156, "logps/chosen": -32.894317626953125, "logps/rejected": -41.69158172607422, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": -0.22915619611740112, "rewards/margins": 4.693375110626221, "rewards/rejected": -4.922531604766846, "step": 449 }, { "epoch": 7.627118644067797, "grad_norm": 5.4083679319206555, "learning_rate": 3.314068990662805e-09, "logits/chosen": -1.619524359703064, "logits/rejected": -1.469055414199829, "logps/chosen": -25.54304313659668, "logps/rejected": -38.69428634643555, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 0.059964776039123535, "rewards/margins": 5.082298278808594, "rewards/rejected": -5.02233362197876, "step": 450 }, { "epoch": 7.6440677966101696, "grad_norm": 5.217030218721459, "learning_rate": 3.0202315837502545e-09, "logits/chosen": -1.4242594242095947, "logits/rejected": -1.439193844795227, "logps/chosen": -31.73979949951172, "logps/rejected": -39.235633850097656, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -0.5601508617401123, "rewards/margins": 4.23047399520874, "rewards/rejected": -4.790624618530273, "step": 451 }, { "epoch": 7.661016949152542, "grad_norm": 5.102186940922818, "learning_rate": 2.7399531694589917e-09, "logits/chosen": -1.5301525592803955, "logits/rejected": -1.5297596454620361, "logps/chosen": -27.26993179321289, "logps/rejected": -46.745147705078125, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -0.15520310401916504, "rewards/margins": 5.617114543914795, "rewards/rejected": -5.772317409515381, "step": 452 }, { "epoch": 7.677966101694915, "grad_norm": 5.403786442321042, "learning_rate": 2.473249134850808e-09, "logits/chosen": -1.3538662195205688, "logits/rejected": -1.3246500492095947, "logps/chosen": -22.38913345336914, "logps/rejected": -45.14683532714844, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 0.044996634125709534, "rewards/margins": 4.89539098739624, "rewards/rejected": -4.850394248962402, "step": 453 }, { "epoch": 7.694915254237288, "grad_norm": 6.421766143852716, "learning_rate": 2.220134121764833e-09, "logits/chosen": -1.4915080070495605, "logits/rejected": -1.452484130859375, "logps/chosen": -16.578571319580078, "logps/rejected": -37.22169876098633, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.8915931582450867, "rewards/margins": 5.497744560241699, "rewards/rejected": -4.606151580810547, "step": 454 }, { "epoch": 7.711864406779661, "grad_norm": 5.924650241904462, "learning_rate": 1.9806220260137065e-09, "logits/chosen": -1.5789787769317627, "logits/rejected": -1.4192255735397339, "logps/chosen": -30.962202072143555, "logps/rejected": -46.20201873779297, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 0.06508506834506989, "rewards/margins": 5.4069318771362305, "rewards/rejected": -5.341846466064453, "step": 455 }, { "epoch": 7.728813559322034, "grad_norm": 5.712693057718049, "learning_rate": 1.7547259966207705e-09, "logits/chosen": -1.5617210865020752, "logits/rejected": -1.489423394203186, "logps/chosen": -30.215980529785156, "logps/rejected": -46.79343795776367, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -0.2140485942363739, "rewards/margins": 6.268076419830322, "rewards/rejected": -6.482125282287598, "step": 456 }, { "epoch": 7.745762711864407, "grad_norm": 4.765004207763158, "learning_rate": 1.5424584350981485e-09, "logits/chosen": -1.5075204372406006, "logits/rejected": -1.446047067642212, "logps/chosen": -25.82567596435547, "logps/rejected": -43.98649597167969, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": -0.2662041187286377, "rewards/margins": 5.002760887145996, "rewards/rejected": -5.268965244293213, "step": 457 }, { "epoch": 7.762711864406779, "grad_norm": 4.613094215982073, "learning_rate": 1.343830994765982e-09, "logits/chosen": -1.5371060371398926, "logits/rejected": -1.4451144933700562, "logps/chosen": -26.201602935791016, "logps/rejected": -57.06352996826172, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -0.1458522379398346, "rewards/margins": 6.390477657318115, "rewards/rejected": -6.53632926940918, "step": 458 }, { "epoch": 7.779661016949152, "grad_norm": 4.4872892882814535, "learning_rate": 1.1588545801125837e-09, "logits/chosen": -1.7496167421340942, "logits/rejected": -1.6176550388336182, "logps/chosen": -36.218074798583984, "logps/rejected": -55.342689514160156, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -0.4042947292327881, "rewards/margins": 5.6281538009643555, "rewards/rejected": -6.032447814941406, "step": 459 }, { "epoch": 7.796610169491525, "grad_norm": 5.575240209393775, "learning_rate": 9.87539346195776e-10, "logits/chosen": -1.4099677801132202, "logits/rejected": -1.2582119703292847, "logps/chosen": -29.023414611816406, "logps/rejected": -42.5074577331543, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.27348968386650085, "rewards/margins": 4.847697734832764, "rewards/rejected": -5.121187210083008, "step": 460 }, { "epoch": 7.813559322033898, "grad_norm": 5.199396752692828, "learning_rate": 8.298946980855315e-10, "logits/chosen": -1.472285509109497, "logits/rejected": -1.330193281173706, "logps/chosen": -29.86899185180664, "logps/rejected": -41.80485153198242, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -0.15588414669036865, "rewards/margins": 5.636477470397949, "rewards/rejected": -5.792361259460449, "step": 461 }, { "epoch": 7.830508474576272, "grad_norm": 4.521518393613341, "learning_rate": 6.8592929034747e-10, "logits/chosen": -1.4151493310928345, "logits/rejected": -1.4073522090911865, "logps/chosen": -27.720748901367188, "logps/rejected": -53.330909729003906, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -0.1728513091802597, "rewards/margins": 5.166114807128906, "rewards/rejected": -5.338965892791748, "step": 462 }, { "epoch": 7.847457627118644, "grad_norm": 4.567347653583189, "learning_rate": 5.556510265678771e-10, "logits/chosen": -1.570703387260437, "logits/rejected": -1.5427438020706177, "logps/chosen": -24.184301376342773, "logps/rejected": -44.638912200927734, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -0.29552435874938965, "rewards/margins": 5.448472499847412, "rewards/rejected": -5.743997573852539, "step": 463 }, { "epoch": 7.864406779661017, "grad_norm": 5.544082314499855, "learning_rate": 4.390670589196621e-10, "logits/chosen": -1.6661534309387207, "logits/rejected": -1.5269910097122192, "logps/chosen": -24.99061393737793, "logps/rejected": -45.773014068603516, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -0.4490078091621399, "rewards/margins": 5.556340217590332, "rewards/rejected": -6.005348205566406, "step": 464 }, { "epoch": 7.88135593220339, "grad_norm": 4.867190786041254, "learning_rate": 3.3618378776981147e-10, "logits/chosen": -1.6171151399612427, "logits/rejected": -1.5128624439239502, "logps/chosen": -28.165693283081055, "logps/rejected": -41.43475341796875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 0.2984722852706909, "rewards/margins": 4.441769599914551, "rewards/rejected": -4.14329719543457, "step": 465 }, { "epoch": 7.898305084745763, "grad_norm": 5.660749716669039, "learning_rate": 2.4700686132803075e-10, "logits/chosen": -1.5283398628234863, "logits/rejected": -1.484012484550476, "logps/chosen": -29.490188598632812, "logps/rejected": -45.15831756591797, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 0.029849261045455933, "rewards/margins": 5.0221357345581055, "rewards/rejected": -4.992286682128906, "step": 466 }, { "epoch": 7.915254237288136, "grad_norm": 6.365487263229808, "learning_rate": 1.715411753365481e-10, "logits/chosen": -1.649171233177185, "logits/rejected": -1.701267957687378, "logps/chosen": -25.891307830810547, "logps/rejected": -47.70183181762695, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -0.6622889041900635, "rewards/margins": 5.317337989807129, "rewards/rejected": -5.9796271324157715, "step": 467 }, { "epoch": 7.932203389830509, "grad_norm": 4.885763801093592, "learning_rate": 1.0979087280141297e-10, "logits/chosen": -1.2668333053588867, "logits/rejected": -1.3214877843856812, "logps/chosen": -21.41905403137207, "logps/rejected": -38.722625732421875, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -0.182024747133255, "rewards/margins": 4.632743835449219, "rewards/rejected": -4.814768314361572, "step": 468 }, { "epoch": 7.9491525423728815, "grad_norm": 5.3854848332900085, "learning_rate": 6.175934376509429e-11, "logits/chosen": -1.5046411752700806, "logits/rejected": -1.5680880546569824, "logps/chosen": -28.198741912841797, "logps/rejected": -59.644596099853516, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 0.0053573548793792725, "rewards/margins": 6.178599834442139, "rewards/rejected": -6.173242568969727, "step": 469 }, { "epoch": 7.966101694915254, "grad_norm": 4.75232754040335, "learning_rate": 2.7449225120268482e-11, "logits/chosen": -1.4780237674713135, "logits/rejected": -1.427555799484253, "logps/chosen": -27.112102508544922, "logps/rejected": -52.26079177856445, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -0.19119581580162048, "rewards/margins": 6.167375087738037, "rewards/rejected": -6.3585710525512695, "step": 470 }, { "epoch": 7.983050847457627, "grad_norm": 5.233750895631027, "learning_rate": 6.862400465157403e-12, "logits/chosen": -1.5693848133087158, "logits/rejected": -1.5205610990524292, "logps/chosen": -33.29484558105469, "logps/rejected": -41.09092712402344, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -0.30308201909065247, "rewards/margins": 4.717187881469727, "rewards/rejected": -5.020270347595215, "step": 471 }, { "epoch": 8.0, "grad_norm": 6.397324083693935, "learning_rate": 0.0, "logits/chosen": -1.639676570892334, "logits/rejected": -1.6202343702316284, "logps/chosen": -33.1722526550293, "logps/rejected": -40.651100158691406, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -0.057152003049850464, "rewards/margins": 4.614887237548828, "rewards/rejected": -4.672039031982422, "step": 472 }, { "epoch": 8.0, "step": 472, "total_flos": 0.0, "train_loss": 0.15622181608736263, "train_runtime": 4721.8854, "train_samples_per_second": 12.788, "train_steps_per_second": 0.1 } ], "logging_steps": 1, "max_steps": 472, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }