diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7110 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 1, + "global_step": 472, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01694915254237288, + "grad_norm": 74.35959798227883, + "learning_rate": 1.0416666666666666e-08, + "logits/chosen": -1.0022015571594238, + "logits/rejected": -1.0571039915084839, + "logps/chosen": -26.953861236572266, + "logps/rejected": -41.69861602783203, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.03389830508474576, + "grad_norm": 71.81906743432833, + "learning_rate": 2.083333333333333e-08, + "logits/chosen": -0.9866722822189331, + "logits/rejected": -1.1117209196090698, + "logps/chosen": -33.70802307128906, + "logps/rejected": -37.13496398925781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.05084745762711865, + "grad_norm": 75.40773138219605, + "learning_rate": 3.125e-08, + "logits/chosen": -1.2932835817337036, + "logits/rejected": -1.2812855243682861, + "logps/chosen": -30.66956329345703, + "logps/rejected": -49.71609878540039, + "loss": 0.7141, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.015645474195480347, + "rewards/margins": -0.0929969847202301, + "rewards/rejected": 0.07735151052474976, + "step": 3 + }, + { + "epoch": 0.06779661016949153, + "grad_norm": 74.47397381287325, + "learning_rate": 4.166666666666666e-08, + "logits/chosen": -1.0415635108947754, + "logits/rejected": -1.0745270252227783, + "logps/chosen": -26.642629623413086, + "logps/rejected": -38.44277572631836, + "loss": 0.7026, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.048020511865615845, + "rewards/margins": -0.018185943365097046, + "rewards/rejected": 0.06620645523071289, + "step": 4 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 76.30048141954929, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -1.2889574766159058, + "logits/rejected": -1.219961404800415, + "logps/chosen": -31.46270751953125, + "logps/rejected": -28.84333610534668, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04683685302734375, + "rewards/margins": -0.016847282648086548, + "rewards/rejected": -0.029989570379257202, + "step": 5 + }, + { + "epoch": 0.1016949152542373, + "grad_norm": 71.45564125608341, + "learning_rate": 6.25e-08, + "logits/chosen": -1.2035868167877197, + "logits/rejected": -1.1329045295715332, + "logps/chosen": -35.237884521484375, + "logps/rejected": -38.39533996582031, + "loss": 0.6914, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.014047741889953613, + "rewards/margins": 0.0001532137393951416, + "rewards/rejected": -0.014200955629348755, + "step": 6 + }, + { + "epoch": 0.11864406779661017, + "grad_norm": 74.36407794239695, + "learning_rate": 7.291666666666667e-08, + "logits/chosen": -1.2061525583267212, + "logits/rejected": -1.1756294965744019, + "logps/chosen": -29.565950393676758, + "logps/rejected": -33.2646484375, + "loss": 0.7034, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09937351942062378, + "rewards/margins": 0.04770404100418091, + "rewards/rejected": 0.05166947841644287, + "step": 7 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 72.94427557648902, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": -1.2344228029251099, + "logits/rejected": -1.255335807800293, + "logps/chosen": -26.22496223449707, + "logps/rejected": -39.68927764892578, + "loss": 0.7084, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.005427069962024689, + "rewards/margins": -0.014611326158046722, + "rewards/rejected": 0.02003839612007141, + "step": 8 + }, + { + "epoch": 0.15254237288135594, + "grad_norm": 69.72083135093712, + "learning_rate": 9.375e-08, + "logits/chosen": -1.1401718854904175, + "logits/rejected": -1.1010041236877441, + "logps/chosen": -33.88338851928711, + "logps/rejected": -28.835594177246094, + "loss": 0.7, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.037932395935058594, + "rewards/margins": -0.005142271518707275, + "rewards/rejected": 0.04307466745376587, + "step": 9 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 71.74909213913158, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -1.180943250656128, + "logits/rejected": -1.090497612953186, + "logps/chosen": -37.923099517822266, + "logps/rejected": -36.79877471923828, + "loss": 0.6975, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031415216624736786, + "rewards/margins": -0.0013954713940620422, + "rewards/rejected": 0.03281068801879883, + "step": 10 + }, + { + "epoch": 0.1864406779661017, + "grad_norm": 75.11980776081754, + "learning_rate": 1.1458333333333332e-07, + "logits/chosen": -0.9823209047317505, + "logits/rejected": -0.9876938462257385, + "logps/chosen": -27.41145896911621, + "logps/rejected": -45.299991607666016, + "loss": 0.7077, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0018342137336730957, + "rewards/margins": 0.0696558952331543, + "rewards/rejected": -0.07149010896682739, + "step": 11 + }, + { + "epoch": 0.2033898305084746, + "grad_norm": 71.48536897103016, + "learning_rate": 1.25e-07, + "logits/chosen": -0.9993177652359009, + "logits/rejected": -0.9222314953804016, + "logps/chosen": -23.32973289489746, + "logps/rejected": -32.4486083984375, + "loss": 0.6838, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01836332678794861, + "rewards/margins": 0.00046828389167785645, + "rewards/rejected": -0.018831610679626465, + "step": 12 + }, + { + "epoch": 0.22033898305084745, + "grad_norm": 69.73428591175114, + "learning_rate": 1.3541666666666666e-07, + "logits/chosen": -1.4160847663879395, + "logits/rejected": -1.2769191265106201, + "logps/chosen": -26.515804290771484, + "logps/rejected": -36.50257110595703, + "loss": 0.7069, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09053070843219757, + "rewards/margins": 0.059635356068611145, + "rewards/rejected": 0.030895352363586426, + "step": 13 + }, + { + "epoch": 0.23728813559322035, + "grad_norm": 74.26548135080421, + "learning_rate": 1.4583333333333335e-07, + "logits/chosen": -1.2310669422149658, + "logits/rejected": -1.1061973571777344, + "logps/chosen": -38.83403396606445, + "logps/rejected": -57.466835021972656, + "loss": 0.7022, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.014897465705871582, + "rewards/margins": 0.015999972820281982, + "rewards/rejected": -0.030897438526153564, + "step": 14 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 70.61107518338639, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -1.245528221130371, + "logits/rejected": -1.2389111518859863, + "logps/chosen": -24.09255027770996, + "logps/rejected": -35.16242218017578, + "loss": 0.7081, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009011238813400269, + "rewards/margins": 0.01104736328125, + "rewards/rejected": -0.0020361244678497314, + "step": 15 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 69.06624100974847, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -1.1713544130325317, + "logits/rejected": -1.2104028463363647, + "logps/chosen": -27.774211883544922, + "logps/rejected": -32.56517791748047, + "loss": 0.6959, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06307366490364075, + "rewards/margins": -0.055293723940849304, + "rewards/rejected": -0.007779940962791443, + "step": 16 + }, + { + "epoch": 0.288135593220339, + "grad_norm": 74.45322195801226, + "learning_rate": 1.7708333333333334e-07, + "logits/chosen": -1.2374873161315918, + "logits/rejected": -1.2554068565368652, + "logps/chosen": -24.268505096435547, + "logps/rejected": -35.4179573059082, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0001607835292816162, + "rewards/margins": -0.01779848337173462, + "rewards/rejected": 0.017637699842453003, + "step": 17 + }, + { + "epoch": 0.3050847457627119, + "grad_norm": 71.43793235652528, + "learning_rate": 1.875e-07, + "logits/chosen": -1.1041090488433838, + "logits/rejected": -1.0679348707199097, + "logps/chosen": -23.000707626342773, + "logps/rejected": -29.853412628173828, + "loss": 0.705, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.055961236357688904, + "rewards/margins": -0.05573050677776337, + "rewards/rejected": -0.0002307295799255371, + "step": 18 + }, + { + "epoch": 0.3220338983050847, + "grad_norm": 71.50823519836709, + "learning_rate": 1.9791666666666664e-07, + "logits/chosen": -1.0583523511886597, + "logits/rejected": -1.0313684940338135, + "logps/chosen": -21.926410675048828, + "logps/rejected": -35.9990348815918, + "loss": 0.7084, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.018382668495178223, + "rewards/margins": 0.02424493432044983, + "rewards/rejected": -0.04262760281562805, + "step": 19 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 65.57787832871358, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -1.2984358072280884, + "logits/rejected": -1.2342230081558228, + "logps/chosen": -30.60858154296875, + "logps/rejected": -38.88047790527344, + "loss": 0.7012, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01342683658003807, + "rewards/margins": 0.014368299394845963, + "rewards/rejected": -0.027795135974884033, + "step": 20 + }, + { + "epoch": 0.3559322033898305, + "grad_norm": 68.54840408145243, + "learning_rate": 2.1875e-07, + "logits/chosen": -1.3012466430664062, + "logits/rejected": -1.3128973245620728, + "logps/chosen": -26.805089950561523, + "logps/rejected": -41.52635192871094, + "loss": 0.6853, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01039344072341919, + "rewards/margins": 0.011234819889068604, + "rewards/rejected": -0.0008413791656494141, + "step": 21 + }, + { + "epoch": 0.3728813559322034, + "grad_norm": 75.35032250165732, + "learning_rate": 2.2916666666666663e-07, + "logits/chosen": -1.1717727184295654, + "logits/rejected": -1.1936640739440918, + "logps/chosen": -21.9468994140625, + "logps/rejected": -27.52823257446289, + "loss": 0.6756, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.030159294605255127, + "rewards/margins": 0.03967534005641937, + "rewards/rejected": -0.0698346346616745, + "step": 22 + }, + { + "epoch": 0.3898305084745763, + "grad_norm": 69.76965900033692, + "learning_rate": 2.3958333333333335e-07, + "logits/chosen": -1.227396845817566, + "logits/rejected": -1.2514605522155762, + "logps/chosen": -30.95319938659668, + "logps/rejected": -33.107421875, + "loss": 0.6483, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.009671658277511597, + "rewards/margins": 0.06378874182701111, + "rewards/rejected": -0.05411708354949951, + "step": 23 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 71.41404237507972, + "learning_rate": 2.5e-07, + "logits/chosen": -1.1064453125, + "logits/rejected": -1.0061161518096924, + "logps/chosen": -32.68113327026367, + "logps/rejected": -38.1193962097168, + "loss": 0.6682, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.034807443618774414, + "rewards/margins": 0.0188644677400589, + "rewards/rejected": -0.05367191135883331, + "step": 24 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 70.49202249008462, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": -1.1899397373199463, + "logits/rejected": -1.1957398653030396, + "logps/chosen": -37.75098419189453, + "logps/rejected": -34.21184539794922, + "loss": 0.6635, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003971487283706665, + "rewards/margins": 0.10494436323642731, + "rewards/rejected": -0.10097287595272064, + "step": 25 + }, + { + "epoch": 0.4406779661016949, + "grad_norm": 67.14613715257252, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -1.3827494382858276, + "logits/rejected": -1.3526558876037598, + "logps/chosen": -27.053857803344727, + "logps/rejected": -33.783485412597656, + "loss": 0.6528, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015252411365509033, + "rewards/margins": 0.1568407416343689, + "rewards/rejected": -0.14158833026885986, + "step": 26 + }, + { + "epoch": 0.4576271186440678, + "grad_norm": 68.86332311369058, + "learning_rate": 2.8125e-07, + "logits/chosen": -1.0096489191055298, + "logits/rejected": -1.0554242134094238, + "logps/chosen": -33.25010681152344, + "logps/rejected": -35.958675384521484, + "loss": 0.6573, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04330983757972717, + "rewards/margins": 0.12464988231658936, + "rewards/rejected": -0.16795971989631653, + "step": 27 + }, + { + "epoch": 0.4745762711864407, + "grad_norm": 66.94051666468431, + "learning_rate": 2.916666666666667e-07, + "logits/chosen": -1.3319611549377441, + "logits/rejected": -1.2341638803482056, + "logps/chosen": -23.90947914123535, + "logps/rejected": -31.997909545898438, + "loss": 0.6585, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028478246182203293, + "rewards/margins": 0.07672013342380524, + "rewards/rejected": -0.10519838333129883, + "step": 28 + }, + { + "epoch": 0.4915254237288136, + "grad_norm": 62.13361032578897, + "learning_rate": 3.020833333333333e-07, + "logits/chosen": -1.058496356010437, + "logits/rejected": -1.061716914176941, + "logps/chosen": -27.815153121948242, + "logps/rejected": -35.63865661621094, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.002513296902179718, + "rewards/margins": 0.08875668793916702, + "rewards/rejected": -0.0862433910369873, + "step": 29 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 69.40663934648452, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -1.148958683013916, + "logits/rejected": -1.0955352783203125, + "logps/chosen": -29.258386611938477, + "logps/rejected": -37.004207611083984, + "loss": 0.6333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018777184188365936, + "rewards/margins": 0.270157128572464, + "rewards/rejected": -0.28893429040908813, + "step": 30 + }, + { + "epoch": 0.5254237288135594, + "grad_norm": 63.75111913966312, + "learning_rate": 3.2291666666666666e-07, + "logits/chosen": -1.1732263565063477, + "logits/rejected": -1.0661126375198364, + "logps/chosen": -31.076984405517578, + "logps/rejected": -34.91364288330078, + "loss": 0.6028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04974186420440674, + "rewards/margins": 0.24639210104942322, + "rewards/rejected": -0.19665023684501648, + "step": 31 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 63.46121996218027, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.2128429412841797, + "logits/rejected": -1.2360285520553589, + "logps/chosen": -36.98728942871094, + "logps/rejected": -38.032920837402344, + "loss": 0.6111, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015447601675987244, + "rewards/margins": 0.3491629958152771, + "rewards/rejected": -0.33371537923812866, + "step": 32 + }, + { + "epoch": 0.559322033898305, + "grad_norm": 64.01628740909891, + "learning_rate": 3.4375e-07, + "logits/chosen": -1.2774078845977783, + "logits/rejected": -1.252654790878296, + "logps/chosen": -30.110694885253906, + "logps/rejected": -39.82551956176758, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12499848008155823, + "rewards/margins": 0.44781434535980225, + "rewards/rejected": -0.5728127956390381, + "step": 33 + }, + { + "epoch": 0.576271186440678, + "grad_norm": 61.04313764198192, + "learning_rate": 3.541666666666667e-07, + "logits/chosen": -1.2864689826965332, + "logits/rejected": -1.1125664710998535, + "logps/chosen": -29.14852523803711, + "logps/rejected": -35.5062255859375, + "loss": 0.5925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07132556289434433, + "rewards/margins": 0.29910576343536377, + "rewards/rejected": -0.3704313337802887, + "step": 34 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 61.863987476747845, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": -1.1560570001602173, + "logits/rejected": -1.1620285511016846, + "logps/chosen": -21.810134887695312, + "logps/rejected": -45.566375732421875, + "loss": 0.5656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03650933504104614, + "rewards/margins": 0.7581607699394226, + "rewards/rejected": -0.7946701049804688, + "step": 35 + }, + { + "epoch": 0.6101694915254238, + "grad_norm": 59.689844232963395, + "learning_rate": 3.75e-07, + "logits/chosen": -1.1466398239135742, + "logits/rejected": -1.1647982597351074, + "logps/chosen": -24.348777770996094, + "logps/rejected": -29.572818756103516, + "loss": 0.5827, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0986182689666748, + "rewards/margins": 0.30504968762397766, + "rewards/rejected": -0.40366795659065247, + "step": 36 + }, + { + "epoch": 0.6271186440677966, + "grad_norm": 66.8530097795874, + "learning_rate": 3.8541666666666665e-07, + "logits/chosen": -1.1411319971084595, + "logits/rejected": -1.1138074398040771, + "logps/chosen": -29.88798713684082, + "logps/rejected": -30.512065887451172, + "loss": 0.5748, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1340489685535431, + "rewards/margins": 0.17574873566627502, + "rewards/rejected": -0.3097977042198181, + "step": 37 + }, + { + "epoch": 0.6440677966101694, + "grad_norm": 59.54927270244813, + "learning_rate": 3.958333333333333e-07, + "logits/chosen": -1.2669272422790527, + "logits/rejected": -1.2261359691619873, + "logps/chosen": -33.28898239135742, + "logps/rejected": -52.16841506958008, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1103217601776123, + "rewards/margins": 0.4893791675567627, + "rewards/rejected": -0.599700927734375, + "step": 38 + }, + { + "epoch": 0.6610169491525424, + "grad_norm": 61.22261904686975, + "learning_rate": 4.0625e-07, + "logits/chosen": -1.2993597984313965, + "logits/rejected": -1.2113492488861084, + "logps/chosen": -38.285438537597656, + "logps/rejected": -51.55641555786133, + "loss": 0.5969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14819863438606262, + "rewards/margins": 0.8797410726547241, + "rewards/rejected": -1.027939796447754, + "step": 39 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 60.12706158104359, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -1.1191147565841675, + "logits/rejected": -0.9404773712158203, + "logps/chosen": -27.851015090942383, + "logps/rejected": -40.25725173950195, + "loss": 0.5564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17417365312576294, + "rewards/margins": 0.5528932809829712, + "rewards/rejected": -0.7270669341087341, + "step": 40 + }, + { + "epoch": 0.6949152542372882, + "grad_norm": 57.92726362837684, + "learning_rate": 4.270833333333333e-07, + "logits/chosen": -1.0899916887283325, + "logits/rejected": -0.9453008770942688, + "logps/chosen": -26.283432006835938, + "logps/rejected": -45.57765579223633, + "loss": 0.5461, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13252140581607819, + "rewards/margins": 0.8450538516044617, + "rewards/rejected": -0.9775752425193787, + "step": 41 + }, + { + "epoch": 0.711864406779661, + "grad_norm": 58.169474320406394, + "learning_rate": 4.375e-07, + "logits/chosen": -1.1068731546401978, + "logits/rejected": -1.0945335626602173, + "logps/chosen": -24.04115104675293, + "logps/rejected": -36.62716293334961, + "loss": 0.5406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10331939160823822, + "rewards/margins": 0.5659677386283875, + "rewards/rejected": -0.6692871451377869, + "step": 42 + }, + { + "epoch": 0.7288135593220338, + "grad_norm": 66.66372536069926, + "learning_rate": 4.479166666666667e-07, + "logits/chosen": -1.1794297695159912, + "logits/rejected": -1.193390965461731, + "logps/chosen": -46.91223907470703, + "logps/rejected": -41.650333404541016, + "loss": 0.5573, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3094208538532257, + "rewards/margins": 0.14515355229377747, + "rewards/rejected": -0.4545744061470032, + "step": 43 + }, + { + "epoch": 0.7457627118644068, + "grad_norm": 56.78835490397593, + "learning_rate": 4.5833333333333327e-07, + "logits/chosen": -1.0120888948440552, + "logits/rejected": -0.8681447505950928, + "logps/chosen": -33.3481330871582, + "logps/rejected": -48.61557388305664, + "loss": 0.5516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.093438059091568, + "rewards/margins": 0.624420166015625, + "rewards/rejected": -0.7178582549095154, + "step": 44 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 59.752166781716426, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": -1.2927764654159546, + "logits/rejected": -1.212005615234375, + "logps/chosen": -27.116588592529297, + "logps/rejected": -48.95513153076172, + "loss": 0.5622, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08109956979751587, + "rewards/margins": 1.1761130094528198, + "rewards/rejected": -1.2572126388549805, + "step": 45 + }, + { + "epoch": 0.7796610169491526, + "grad_norm": 56.0712139430381, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": -1.310462474822998, + "logits/rejected": -1.1343291997909546, + "logps/chosen": -32.363792419433594, + "logps/rejected": -41.77903747558594, + "loss": 0.4924, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1404968947172165, + "rewards/margins": 0.6709720492362976, + "rewards/rejected": -0.8114689588546753, + "step": 46 + }, + { + "epoch": 0.7966101694915254, + "grad_norm": 53.687876472368266, + "learning_rate": 4.895833333333333e-07, + "logits/chosen": -1.0838322639465332, + "logits/rejected": -1.1518497467041016, + "logps/chosen": -34.07114791870117, + "logps/rejected": -38.58074188232422, + "loss": 0.475, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14443516731262207, + "rewards/margins": 0.33194631338119507, + "rewards/rejected": -0.47638148069381714, + "step": 47 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 57.81488552818391, + "learning_rate": 5e-07, + "logits/chosen": -1.3274493217468262, + "logits/rejected": -1.099511981010437, + "logps/chosen": -33.7588005065918, + "logps/rejected": -51.24869918823242, + "loss": 0.46, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2817200720310211, + "rewards/margins": 1.4236443042755127, + "rewards/rejected": -1.705364465713501, + "step": 48 + }, + { + "epoch": 0.8305084745762712, + "grad_norm": 51.9232811376309, + "learning_rate": 4.999931375995349e-07, + "logits/chosen": -1.2194585800170898, + "logits/rejected": -1.091806173324585, + "logps/chosen": -28.378108978271484, + "logps/rejected": -36.51158905029297, + "loss": 0.4926, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31702089309692383, + "rewards/margins": 0.6066832542419434, + "rewards/rejected": -0.9237041473388672, + "step": 49 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 58.79932831253633, + "learning_rate": 4.999725507748798e-07, + "logits/chosen": -1.4276092052459717, + "logits/rejected": -1.2703173160552979, + "logps/chosen": -27.375112533569336, + "logps/rejected": -45.86325454711914, + "loss": 0.5289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21876651048660278, + "rewards/margins": 1.0810855627059937, + "rewards/rejected": -1.2998521327972412, + "step": 50 + }, + { + "epoch": 0.864406779661017, + "grad_norm": 56.405768925690126, + "learning_rate": 4.99938240656235e-07, + "logits/chosen": -1.1859700679779053, + "logits/rejected": -1.110828161239624, + "logps/chosen": -30.097301483154297, + "logps/rejected": -56.47822189331055, + "loss": 0.4842, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14388218522071838, + "rewards/margins": 0.8802270889282227, + "rewards/rejected": -1.0241093635559082, + "step": 51 + }, + { + "epoch": 0.8813559322033898, + "grad_norm": 54.84466981430871, + "learning_rate": 4.998902091271985e-07, + "logits/chosen": -1.2907037734985352, + "logits/rejected": -1.198894739151001, + "logps/chosen": -23.458566665649414, + "logps/rejected": -36.68768310546875, + "loss": 0.4399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09736257791519165, + "rewards/margins": 0.8091447353363037, + "rewards/rejected": -0.9065073132514954, + "step": 52 + }, + { + "epoch": 0.8983050847457628, + "grad_norm": 53.59884977562605, + "learning_rate": 4.998284588246634e-07, + "logits/chosen": -1.2666661739349365, + "logits/rejected": -1.1450353860855103, + "logps/chosen": -32.4919319152832, + "logps/rejected": -36.02308654785156, + "loss": 0.4643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2568899691104889, + "rewards/margins": 0.8810305595397949, + "rewards/rejected": -1.137920618057251, + "step": 53 + }, + { + "epoch": 0.9152542372881356, + "grad_norm": 56.86352488023535, + "learning_rate": 4.997529931386719e-07, + "logits/chosen": -1.2422281503677368, + "logits/rejected": -1.2324570417404175, + "logps/chosen": -33.5006217956543, + "logps/rejected": -30.282136917114258, + "loss": 0.4844, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3842281401157379, + "rewards/margins": 0.29243284463882446, + "rewards/rejected": -0.6766610145568848, + "step": 54 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 53.15925076232375, + "learning_rate": 4.996638162122302e-07, + "logits/chosen": -1.2868903875350952, + "logits/rejected": -1.1907857656478882, + "logps/chosen": -32.84450149536133, + "logps/rejected": -39.07723617553711, + "loss": 0.4865, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19609573483467102, + "rewards/margins": 1.1344835758209229, + "rewards/rejected": -1.3305792808532715, + "step": 55 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 50.6107831993107, + "learning_rate": 4.995609329410804e-07, + "logits/chosen": -1.1869336366653442, + "logits/rejected": -1.0896047353744507, + "logps/chosen": -25.43665885925293, + "logps/rejected": -37.78445816040039, + "loss": 0.4331, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2436077892780304, + "rewards/margins": 1.5263582468032837, + "rewards/rejected": -1.7699658870697021, + "step": 56 + }, + { + "epoch": 0.9661016949152542, + "grad_norm": 53.83410279226871, + "learning_rate": 4.994443489734322e-07, + "logits/chosen": -1.140820860862732, + "logits/rejected": -1.0942250490188599, + "logps/chosen": -28.2945499420166, + "logps/rejected": -44.74162292480469, + "loss": 0.4513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.28495079278945923, + "rewards/margins": 1.6222182512283325, + "rewards/rejected": -1.9071691036224365, + "step": 57 + }, + { + "epoch": 0.9830508474576272, + "grad_norm": 57.20949918087681, + "learning_rate": 4.993140707096525e-07, + "logits/chosen": -1.2853411436080933, + "logits/rejected": -1.1984596252441406, + "logps/chosen": -35.58999252319336, + "logps/rejected": -42.4873161315918, + "loss": 0.4307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26272836327552795, + "rewards/margins": 1.1384968757629395, + "rewards/rejected": -1.4012253284454346, + "step": 58 + }, + { + "epoch": 1.0, + "grad_norm": 49.633369993703724, + "learning_rate": 4.991701053019145e-07, + "logits/chosen": -1.2317020893096924, + "logits/rejected": -1.2267297506332397, + "logps/chosen": -28.742332458496094, + "logps/rejected": -50.28435516357422, + "loss": 0.4259, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22372712194919586, + "rewards/margins": 1.6622364521026611, + "rewards/rejected": -1.8859635591506958, + "step": 59 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 43.19271219759473, + "learning_rate": 4.990124606538042e-07, + "logits/chosen": -1.27030348777771, + "logits/rejected": -1.2970830202102661, + "logps/chosen": -22.824378967285156, + "logps/rejected": -43.53241729736328, + "loss": 0.3445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11317068338394165, + "rewards/margins": 1.718346118927002, + "rewards/rejected": -1.8315167427062988, + "step": 60 + }, + { + "epoch": 1.0338983050847457, + "grad_norm": 46.56532536625075, + "learning_rate": 4.988411454198874e-07, + "logits/chosen": -1.2344048023223877, + "logits/rejected": -1.376710295677185, + "logps/chosen": -30.1800537109375, + "logps/rejected": -34.68271255493164, + "loss": 0.4151, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3319806754589081, + "rewards/margins": 0.3052244484424591, + "rewards/rejected": -0.637205183506012, + "step": 61 + }, + { + "epoch": 1.0508474576271187, + "grad_norm": 45.932270396743505, + "learning_rate": 4.98656169005234e-07, + "logits/chosen": -1.300619125366211, + "logits/rejected": -1.1745063066482544, + "logps/chosen": -32.63179016113281, + "logps/rejected": -39.57646942138672, + "loss": 0.3826, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.021081820130348206, + "rewards/margins": 1.4243545532226562, + "rewards/rejected": -1.4454363584518433, + "step": 62 + }, + { + "epoch": 1.0677966101694916, + "grad_norm": 41.51255029654015, + "learning_rate": 4.984575415649018e-07, + "logits/chosen": -1.272727370262146, + "logits/rejected": -1.1292237043380737, + "logps/chosen": -26.08396339416504, + "logps/rejected": -47.56403350830078, + "loss": 0.3506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11639979481697083, + "rewards/margins": 2.3725996017456055, + "rewards/rejected": -2.488999366760254, + "step": 63 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 38.246979074883626, + "learning_rate": 4.982452740033792e-07, + "logits/chosen": -1.22151517868042, + "logits/rejected": -1.082349181175232, + "logps/chosen": -27.676637649536133, + "logps/rejected": -33.685211181640625, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00457368791103363, + "rewards/margins": 1.3627952337265015, + "rewards/rejected": -1.358221411705017, + "step": 64 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 40.134511453074204, + "learning_rate": 4.980193779739863e-07, + "logits/chosen": -1.1573750972747803, + "logits/rejected": -1.0462363958358765, + "logps/chosen": -29.544757843017578, + "logps/rejected": -45.139732360839844, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12963010370731354, + "rewards/margins": 1.9951367378234863, + "rewards/rejected": -2.1247665882110596, + "step": 65 + }, + { + "epoch": 1.11864406779661, + "grad_norm": 42.89070654695283, + "learning_rate": 4.977798658782351e-07, + "logits/chosen": -1.3436741828918457, + "logits/rejected": -1.2738423347473145, + "logps/chosen": -29.10391616821289, + "logps/rejected": -42.66758346557617, + "loss": 0.349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23598036170005798, + "rewards/margins": 1.5764625072479248, + "rewards/rejected": -1.8124428987503052, + "step": 66 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 41.22280324142371, + "learning_rate": 4.975267508651491e-07, + "logits/chosen": -1.202622890472412, + "logits/rejected": -1.047250509262085, + "logps/chosen": -27.090843200683594, + "logps/rejected": -30.239742279052734, + "loss": 0.3463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1256638914346695, + "rewards/margins": 1.4247705936431885, + "rewards/rejected": -1.5504344701766968, + "step": 67 + }, + { + "epoch": 1.152542372881356, + "grad_norm": 41.21696455893867, + "learning_rate": 4.97260046830541e-07, + "logits/chosen": -1.166504144668579, + "logits/rejected": -0.8773643970489502, + "logps/chosen": -22.584096908569336, + "logps/rejected": -42.063133239746094, + "loss": 0.3428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07130265235900879, + "rewards/margins": 1.9287859201431274, + "rewards/rejected": -2.000088691711426, + "step": 68 + }, + { + "epoch": 1.1694915254237288, + "grad_norm": 43.16486538306169, + "learning_rate": 4.969797684162497e-07, + "logits/chosen": -1.4693577289581299, + "logits/rejected": -1.3569204807281494, + "logps/chosen": -25.79123878479004, + "logps/rejected": -37.476409912109375, + "loss": 0.3706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10858143866062164, + "rewards/margins": 1.385354995727539, + "rewards/rejected": -1.4939366579055786, + "step": 69 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 42.7942644360943, + "learning_rate": 4.966859310093372e-07, + "logits/chosen": -1.168703317642212, + "logits/rejected": -1.0707366466522217, + "logps/chosen": -30.724082946777344, + "logps/rejected": -41.230712890625, + "loss": 0.354, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.13678063452243805, + "rewards/margins": 1.567689061164856, + "rewards/rejected": -1.7044697999954224, + "step": 70 + }, + { + "epoch": 1.2033898305084745, + "grad_norm": 39.25048654439285, + "learning_rate": 4.96378550741243e-07, + "logits/chosen": -1.3091105222702026, + "logits/rejected": -1.2656583786010742, + "logps/chosen": -30.91085433959961, + "logps/rejected": -41.49336624145508, + "loss": 0.307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22276180982589722, + "rewards/margins": 1.757472276687622, + "rewards/rejected": -1.980234146118164, + "step": 71 + }, + { + "epoch": 1.2203389830508475, + "grad_norm": 40.23010655586107, + "learning_rate": 4.960576444868992e-07, + "logits/chosen": -1.4617582559585571, + "logits/rejected": -1.4617631435394287, + "logps/chosen": -28.715673446655273, + "logps/rejected": -48.552024841308594, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21487601101398468, + "rewards/margins": 2.4703986644744873, + "rewards/rejected": -2.685274362564087, + "step": 72 + }, + { + "epoch": 1.2372881355932204, + "grad_norm": 45.05758163705817, + "learning_rate": 4.957232298638035e-07, + "logits/chosen": -1.3103129863739014, + "logits/rejected": -1.297128677368164, + "logps/chosen": -29.75772476196289, + "logps/rejected": -42.82856750488281, + "loss": 0.3574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16225725412368774, + "rewards/margins": 1.7887846231460571, + "rewards/rejected": -1.9510418176651, + "step": 73 + }, + { + "epoch": 1.2542372881355932, + "grad_norm": 40.337686604932905, + "learning_rate": 4.953753252310525e-07, + "logits/chosen": -1.351475477218628, + "logits/rejected": -1.2956455945968628, + "logps/chosen": -30.769058227539062, + "logps/rejected": -37.68085479736328, + "loss": 0.3211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22842487692832947, + "rewards/margins": 1.4851303100585938, + "rewards/rejected": -1.713555097579956, + "step": 74 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 41.87339924313547, + "learning_rate": 4.950139496883334e-07, + "logits/chosen": -1.1046594381332397, + "logits/rejected": -1.083458662033081, + "logps/chosen": -23.34552764892578, + "logps/rejected": -33.603294372558594, + "loss": 0.2996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22441284358501434, + "rewards/margins": 1.64926016330719, + "rewards/rejected": -1.8736729621887207, + "step": 75 + }, + { + "epoch": 1.288135593220339, + "grad_norm": 40.59281009638228, + "learning_rate": 4.94639123074876e-07, + "logits/chosen": -1.5633933544158936, + "logits/rejected": -1.3900222778320312, + "logps/chosen": -28.575279235839844, + "logps/rejected": -40.71543502807617, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23141731321811676, + "rewards/margins": 1.784515619277954, + "rewards/rejected": -2.015933036804199, + "step": 76 + }, + { + "epoch": 1.305084745762712, + "grad_norm": 40.512541982486866, + "learning_rate": 4.942508659683626e-07, + "logits/chosen": -1.2337239980697632, + "logits/rejected": -1.1204659938812256, + "logps/chosen": -37.800514221191406, + "logps/rejected": -57.468048095703125, + "loss": 0.3173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0504547655582428, + "rewards/margins": 2.774332046508789, + "rewards/rejected": -2.824786901473999, + "step": 77 + }, + { + "epoch": 1.3220338983050848, + "grad_norm": 44.23393681805476, + "learning_rate": 4.938491996837994e-07, + "logits/chosen": -1.1162344217300415, + "logits/rejected": -1.1677778959274292, + "logps/chosen": -21.936918258666992, + "logps/rejected": -36.89491653442383, + "loss": 0.3389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04354112595319748, + "rewards/margins": 1.7271283864974976, + "rewards/rejected": -1.7706694602966309, + "step": 78 + }, + { + "epoch": 1.3389830508474576, + "grad_norm": 41.690947911283914, + "learning_rate": 4.934341462723454e-07, + "logits/chosen": -1.3248709440231323, + "logits/rejected": -1.2905504703521729, + "logps/chosen": -22.56363868713379, + "logps/rejected": -36.43506622314453, + "loss": 0.3275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11233043670654297, + "rewards/margins": 1.878725528717041, + "rewards/rejected": -1.991055965423584, + "step": 79 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 40.04810360539708, + "learning_rate": 4.930057285201027e-07, + "logits/chosen": -1.1112794876098633, + "logits/rejected": -1.0777575969696045, + "logps/chosen": -24.657052993774414, + "logps/rejected": -39.67708969116211, + "loss": 0.3229, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.13391758501529694, + "rewards/margins": 2.0548856258392334, + "rewards/rejected": -2.18880295753479, + "step": 80 + }, + { + "epoch": 1.3728813559322033, + "grad_norm": 34.996138065577895, + "learning_rate": 4.925639699468645e-07, + "logits/chosen": -1.2954164743423462, + "logits/rejected": -1.2493962049484253, + "logps/chosen": -22.976837158203125, + "logps/rejected": -30.558788299560547, + "loss": 0.2742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00974225252866745, + "rewards/margins": 1.3260502815246582, + "rewards/rejected": -1.3357923030853271, + "step": 81 + }, + { + "epoch": 1.3898305084745763, + "grad_norm": 37.19153270519795, + "learning_rate": 4.921088948048246e-07, + "logits/chosen": -1.0799801349639893, + "logits/rejected": -1.029328465461731, + "logps/chosen": -21.34770965576172, + "logps/rejected": -27.523284912109375, + "loss": 0.3197, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0038819462060928345, + "rewards/margins": 1.294053316116333, + "rewards/rejected": -1.2979353666305542, + "step": 82 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 38.782824257992516, + "learning_rate": 4.916405280772462e-07, + "logits/chosen": -1.2136616706848145, + "logits/rejected": -1.1575380563735962, + "logps/chosen": -34.565452575683594, + "logps/rejected": -37.70560836791992, + "loss": 0.2773, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11810773611068726, + "rewards/margins": 1.5449273586273193, + "rewards/rejected": -1.6630350351333618, + "step": 83 + }, + { + "epoch": 1.423728813559322, + "grad_norm": 46.057268803202, + "learning_rate": 4.911588954770896e-07, + "logits/chosen": -1.323722004890442, + "logits/rejected": -1.2598729133605957, + "logps/chosen": -28.280380249023438, + "logps/rejected": -35.962127685546875, + "loss": 0.3575, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18131475150585175, + "rewards/margins": 1.3694193363189697, + "rewards/rejected": -1.5507341623306274, + "step": 84 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 39.078212479916175, + "learning_rate": 4.906640234456011e-07, + "logits/chosen": -1.254352331161499, + "logits/rejected": -1.220250129699707, + "logps/chosen": -22.146629333496094, + "logps/rejected": -34.24107360839844, + "loss": 0.3198, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.16238263249397278, + "rewards/margins": 2.182830810546875, + "rewards/rejected": -2.3452136516571045, + "step": 85 + }, + { + "epoch": 1.457627118644068, + "grad_norm": 40.017911222526, + "learning_rate": 4.90155939150861e-07, + "logits/chosen": -1.2662581205368042, + "logits/rejected": -1.1520745754241943, + "logps/chosen": -28.800233840942383, + "logps/rejected": -45.41657638549805, + "loss": 0.2885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10078342258930206, + "rewards/margins": 2.74984073638916, + "rewards/rejected": -2.8506245613098145, + "step": 86 + }, + { + "epoch": 1.4745762711864407, + "grad_norm": 38.422387432652926, + "learning_rate": 4.896346704862927e-07, + "logits/chosen": -1.1133819818496704, + "logits/rejected": -1.1244077682495117, + "logps/chosen": -26.18191909790039, + "logps/rejected": -38.19125747680664, + "loss": 0.2784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2561756372451782, + "rewards/margins": 2.0794429779052734, + "rewards/rejected": -2.335618734359741, + "step": 87 + }, + { + "epoch": 1.4915254237288136, + "grad_norm": 43.19306665638686, + "learning_rate": 4.891002460691305e-07, + "logits/chosen": -1.1532127857208252, + "logits/rejected": -1.0514928102493286, + "logps/chosen": -33.20591735839844, + "logps/rejected": -44.78691101074219, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31594839692115784, + "rewards/margins": 2.774695634841919, + "rewards/rejected": -3.090643882751465, + "step": 88 + }, + { + "epoch": 1.5084745762711864, + "grad_norm": 37.55592044811218, + "learning_rate": 4.885526952388497e-07, + "logits/chosen": -1.5443884134292603, + "logits/rejected": -1.491202473640442, + "logps/chosen": -29.413482666015625, + "logps/rejected": -44.91068649291992, + "loss": 0.2738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08295662701129913, + "rewards/margins": 2.9935202598571777, + "rewards/rejected": -3.076476812362671, + "step": 89 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 39.41480347028368, + "learning_rate": 4.879920480555549e-07, + "logits/chosen": -1.2505245208740234, + "logits/rejected": -1.1800575256347656, + "logps/chosen": -34.517356872558594, + "logps/rejected": -54.461875915527344, + "loss": 0.3016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2518163323402405, + "rewards/margins": 2.4639229774475098, + "rewards/rejected": -2.7157392501831055, + "step": 90 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 41.233244462870424, + "learning_rate": 4.874183352983297e-07, + "logits/chosen": -1.2128483057022095, + "logits/rejected": -1.1637994050979614, + "logps/chosen": -26.66428565979004, + "logps/rejected": -30.274539947509766, + "loss": 0.3237, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.016297563910484314, + "rewards/margins": 1.5082069635391235, + "rewards/rejected": -1.4919092655181885, + "step": 91 + }, + { + "epoch": 1.559322033898305, + "grad_norm": 33.074028930414165, + "learning_rate": 4.868315884635478e-07, + "logits/chosen": -1.3729506731033325, + "logits/rejected": -1.363387107849121, + "logps/chosen": -32.76537322998047, + "logps/rejected": -40.88609313964844, + "loss": 0.2209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49252671003341675, + "rewards/margins": 1.6107347011566162, + "rewards/rejected": -2.1032614707946777, + "step": 92 + }, + { + "epoch": 1.576271186440678, + "grad_norm": 44.45767411612402, + "learning_rate": 4.862318397631433e-07, + "logits/chosen": -1.290561318397522, + "logits/rejected": -1.2587978839874268, + "logps/chosen": -27.68136215209961, + "logps/rejected": -37.53421401977539, + "loss": 0.3135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.045494288206100464, + "rewards/margins": 2.3608152866363525, + "rewards/rejected": -2.3153209686279297, + "step": 93 + }, + { + "epoch": 1.5932203389830508, + "grad_norm": 38.75459636795706, + "learning_rate": 4.856191221228422e-07, + "logits/chosen": -1.4851233959197998, + "logits/rejected": -1.3868330717086792, + "logps/chosen": -25.41338348388672, + "logps/rejected": -50.90999984741211, + "loss": 0.3055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.057565659284591675, + "rewards/margins": 3.016381025314331, + "rewards/rejected": -2.958815336227417, + "step": 94 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 39.42511408680881, + "learning_rate": 4.84993469180355e-07, + "logits/chosen": -1.5986624956130981, + "logits/rejected": -1.5039199590682983, + "logps/chosen": -23.876976013183594, + "logps/rejected": -37.30643081665039, + "loss": 0.2918, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1558499038219452, + "rewards/margins": 2.534642219543457, + "rewards/rejected": -2.3787922859191895, + "step": 95 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 40.62251102213997, + "learning_rate": 4.843549152835302e-07, + "logits/chosen": -1.4016410112380981, + "logits/rejected": -1.330747127532959, + "logps/chosen": -35.39458465576172, + "logps/rejected": -41.057533264160156, + "loss": 0.2683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30797258019447327, + "rewards/margins": 1.5779187679290771, + "rewards/rejected": -1.8858911991119385, + "step": 96 + }, + { + "epoch": 1.6440677966101696, + "grad_norm": 37.47292375573573, + "learning_rate": 4.837034954884681e-07, + "logits/chosen": -1.2742365598678589, + "logits/rejected": -1.16330885887146, + "logps/chosen": -19.86534881591797, + "logps/rejected": -37.20149230957031, + "loss": 0.3136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16833439469337463, + "rewards/margins": 2.6455376148223877, + "rewards/rejected": -2.8138718605041504, + "step": 97 + }, + { + "epoch": 1.6610169491525424, + "grad_norm": 34.360879838757626, + "learning_rate": 4.83039245557597e-07, + "logits/chosen": -1.4751869440078735, + "logits/rejected": -1.3903852701187134, + "logps/chosen": -27.246294021606445, + "logps/rejected": -37.46033477783203, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015960171818733215, + "rewards/margins": 1.7550795078277588, + "rewards/rejected": -1.739119291305542, + "step": 98 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 36.086860100703234, + "learning_rate": 4.823622019577088e-07, + "logits/chosen": -1.5154139995574951, + "logits/rejected": -1.435178279876709, + "logps/chosen": -29.518024444580078, + "logps/rejected": -34.60513687133789, + "loss": 0.2638, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023779883980751038, + "rewards/margins": 1.6297731399536133, + "rewards/rejected": -1.6059932708740234, + "step": 99 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 44.37484563053148, + "learning_rate": 4.816724018579583e-07, + "logits/chosen": -1.3004710674285889, + "logits/rejected": -1.1905782222747803, + "logps/chosen": -44.38223648071289, + "logps/rejected": -44.088531494140625, + "loss": 0.3148, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6569697856903076, + "rewards/margins": 1.929584264755249, + "rewards/rejected": -2.5865538120269775, + "step": 100 + }, + { + "epoch": 1.711864406779661, + "grad_norm": 37.67689426318597, + "learning_rate": 4.809698831278217e-07, + "logits/chosen": -1.1910172700881958, + "logits/rejected": -1.085993766784668, + "logps/chosen": -25.45205307006836, + "logps/rejected": -38.646697998046875, + "loss": 0.2924, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10635387897491455, + "rewards/margins": 2.1210451126098633, + "rewards/rejected": -2.2273988723754883, + "step": 101 + }, + { + "epoch": 1.7288135593220337, + "grad_norm": 44.53554229157193, + "learning_rate": 4.802546843350177e-07, + "logits/chosen": -1.3502918481826782, + "logits/rejected": -1.3793635368347168, + "logps/chosen": -31.245201110839844, + "logps/rejected": -38.174922943115234, + "loss": 0.308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04055324196815491, + "rewards/margins": 2.123584032058716, + "rewards/rejected": -2.164137363433838, + "step": 102 + }, + { + "epoch": 1.7457627118644068, + "grad_norm": 33.6410465461232, + "learning_rate": 4.795268447433906e-07, + "logits/chosen": -1.5270867347717285, + "logits/rejected": -1.5600392818450928, + "logps/chosen": -26.281536102294922, + "logps/rejected": -40.25182342529297, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38495194911956787, + "rewards/margins": 2.3371963500976562, + "rewards/rejected": -2.7221484184265137, + "step": 103 + }, + { + "epoch": 1.7627118644067796, + "grad_norm": 37.86791531616575, + "learning_rate": 4.787864043107546e-07, + "logits/chosen": -1.1703252792358398, + "logits/rejected": -1.2337673902511597, + "logps/chosen": -28.018447875976562, + "logps/rejected": -26.48525619506836, + "loss": 0.2904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1365218162536621, + "rewards/margins": 1.0608235597610474, + "rewards/rejected": -1.197345495223999, + "step": 104 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 39.89897092814635, + "learning_rate": 4.780334036866996e-07, + "logits/chosen": -1.4013991355895996, + "logits/rejected": -1.3012598752975464, + "logps/chosen": -30.404062271118164, + "logps/rejected": -50.464717864990234, + "loss": 0.2444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36456581950187683, + "rewards/margins": 2.707120895385742, + "rewards/rejected": -3.0716869831085205, + "step": 105 + }, + { + "epoch": 1.7966101694915255, + "grad_norm": 38.70079485674391, + "learning_rate": 4.772678842103605e-07, + "logits/chosen": -1.396234393119812, + "logits/rejected": -1.2844393253326416, + "logps/chosen": -30.791566848754883, + "logps/rejected": -41.80755615234375, + "loss": 0.2245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2901880741119385, + "rewards/margins": 2.5994958877563477, + "rewards/rejected": -2.8896842002868652, + "step": 106 + }, + { + "epoch": 1.8135593220338984, + "grad_norm": 31.615929528424363, + "learning_rate": 4.764898879081467e-07, + "logits/chosen": -1.3441611528396606, + "logits/rejected": -1.3465229272842407, + "logps/chosen": -25.936952590942383, + "logps/rejected": -44.00107192993164, + "loss": 0.2261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0497933030128479, + "rewards/margins": 1.7863792181015015, + "rewards/rejected": -1.7365858554840088, + "step": 107 + }, + { + "epoch": 1.8305084745762712, + "grad_norm": 36.20936022186147, + "learning_rate": 4.7569945749143586e-07, + "logits/chosen": -1.3387551307678223, + "logits/rejected": -1.329017996788025, + "logps/chosen": -26.50401496887207, + "logps/rejected": -48.64241027832031, + "loss": 0.2448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1088000237941742, + "rewards/margins": 2.8309779167175293, + "rewards/rejected": -2.9397776126861572, + "step": 108 + }, + { + "epoch": 1.847457627118644, + "grad_norm": 36.766498225229036, + "learning_rate": 4.748966363542285e-07, + "logits/chosen": -1.2870004177093506, + "logits/rejected": -1.2100244760513306, + "logps/chosen": -24.399860382080078, + "logps/rejected": -37.19389343261719, + "loss": 0.2463, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04130461812019348, + "rewards/margins": 2.3760554790496826, + "rewards/rejected": -2.3347508907318115, + "step": 109 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 37.505379590427104, + "learning_rate": 4.7408146857076563e-07, + "logits/chosen": -1.2373127937316895, + "logits/rejected": -1.1650217771530151, + "logps/chosen": -39.90864562988281, + "logps/rejected": -40.89439392089844, + "loss": 0.2646, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0051874518394470215, + "rewards/margins": 1.6469688415527344, + "rewards/rejected": -1.6417814493179321, + "step": 110 + }, + { + "epoch": 1.8813559322033897, + "grad_norm": 35.644427389294876, + "learning_rate": 4.732539988931096e-07, + "logits/chosen": -1.3618909120559692, + "logits/rejected": -1.4531258344650269, + "logps/chosen": -26.969541549682617, + "logps/rejected": -42.32440185546875, + "loss": 0.2208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25816747546195984, + "rewards/margins": 2.6419811248779297, + "rewards/rejected": -2.900148391723633, + "step": 111 + }, + { + "epoch": 1.8983050847457628, + "grad_norm": 39.403893355046144, + "learning_rate": 4.7241427274868683e-07, + "logits/chosen": -1.392714262008667, + "logits/rejected": -1.2956047058105469, + "logps/chosen": -25.744760513305664, + "logps/rejected": -43.44940185546875, + "loss": 0.2692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19541674852371216, + "rewards/margins": 2.4149329662323, + "rewards/rejected": -2.610349655151367, + "step": 112 + }, + { + "epoch": 1.9152542372881356, + "grad_norm": 42.63537438222289, + "learning_rate": 4.7156233623779383e-07, + "logits/chosen": -1.3376697301864624, + "logits/rejected": -1.3315945863723755, + "logps/chosen": -31.033275604248047, + "logps/rejected": -34.59294128417969, + "loss": 0.262, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.01679442822933197, + "rewards/margins": 1.9989566802978516, + "rewards/rejected": -2.015751361846924, + "step": 113 + }, + { + "epoch": 1.9322033898305084, + "grad_norm": 37.518445889823454, + "learning_rate": 4.7069823613106687e-07, + "logits/chosen": -1.3494609594345093, + "logits/rejected": -1.4168843030929565, + "logps/chosen": -37.80765914916992, + "logps/rejected": -50.918888092041016, + "loss": 0.2293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5224637985229492, + "rewards/margins": 3.094416618347168, + "rewards/rejected": -3.6168806552886963, + "step": 114 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 42.51900093030587, + "learning_rate": 4.698220198669136e-07, + "logits/chosen": -1.646604299545288, + "logits/rejected": -1.517140507698059, + "logps/chosen": -27.87681770324707, + "logps/rejected": -42.672096252441406, + "loss": 0.2965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16765162348747253, + "rewards/margins": 2.965381145477295, + "rewards/rejected": -3.133033275604248, + "step": 115 + }, + { + "epoch": 1.9661016949152543, + "grad_norm": 33.55277119465526, + "learning_rate": 4.6893373554890917e-07, + "logits/chosen": -1.5241327285766602, + "logits/rejected": -1.2908215522766113, + "logps/chosen": -33.22343444824219, + "logps/rejected": -44.93342208862305, + "loss": 0.226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36639106273651123, + "rewards/margins": 2.9403634071350098, + "rewards/rejected": -3.3067543506622314, + "step": 116 + }, + { + "epoch": 1.9830508474576272, + "grad_norm": 36.89279198796042, + "learning_rate": 4.6803343194315546e-07, + "logits/chosen": -1.3500347137451172, + "logits/rejected": -1.3750879764556885, + "logps/chosen": -33.52631759643555, + "logps/rejected": -46.71184539794922, + "loss": 0.2323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17364656925201416, + "rewards/margins": 2.6467905044555664, + "rewards/rejected": -2.820436954498291, + "step": 117 + }, + { + "epoch": 2.0, + "grad_norm": 29.673763942339104, + "learning_rate": 4.6712115847560353e-07, + "logits/chosen": -1.1543025970458984, + "logits/rejected": -1.1016186475753784, + "logps/chosen": -27.275094985961914, + "logps/rejected": -35.34591293334961, + "loss": 0.2172, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010025471448898315, + "rewards/margins": 2.681260824203491, + "rewards/rejected": -2.6712355613708496, + "step": 118 + }, + { + "epoch": 2.016949152542373, + "grad_norm": 18.754455801142957, + "learning_rate": 4.661969652293402e-07, + "logits/chosen": -1.3498600721359253, + "logits/rejected": -1.2991336584091187, + "logps/chosen": -24.873241424560547, + "logps/rejected": -45.2943000793457, + "loss": 0.1226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13535727560520172, + "rewards/margins": 3.1097800731658936, + "rewards/rejected": -3.2451369762420654, + "step": 119 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 22.75173167375844, + "learning_rate": 4.652609029418388e-07, + "logits/chosen": -1.283535122871399, + "logits/rejected": -1.2337732315063477, + "logps/chosen": -26.472030639648438, + "logps/rejected": -41.27444076538086, + "loss": 0.1627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14066343009471893, + "rewards/margins": 2.8250813484191895, + "rewards/rejected": -2.9657444953918457, + "step": 120 + }, + { + "epoch": 2.0508474576271185, + "grad_norm": 20.99893381194414, + "learning_rate": 4.6431302300217366e-07, + "logits/chosen": -1.4252784252166748, + "logits/rejected": -1.421356439590454, + "logps/chosen": -31.787378311157227, + "logps/rejected": -33.3957405090332, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40507441759109497, + "rewards/margins": 1.8697203397750854, + "rewards/rejected": -1.4646459817886353, + "step": 121 + }, + { + "epoch": 2.0677966101694913, + "grad_norm": 18.786273208958082, + "learning_rate": 4.633533774481987e-07, + "logits/chosen": -1.2753486633300781, + "logits/rejected": -1.1164805889129639, + "logps/chosen": -31.65105438232422, + "logps/rejected": -46.21112823486328, + "loss": 0.1353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07824045419692993, + "rewards/margins": 3.2129859924316406, + "rewards/rejected": -3.291226387023926, + "step": 122 + }, + { + "epoch": 2.084745762711864, + "grad_norm": 20.917039008362774, + "learning_rate": 4.623820189636905e-07, + "logits/chosen": -1.3903650045394897, + "logits/rejected": -1.3099316358566284, + "logps/chosen": -29.19454002380371, + "logps/rejected": -46.168941497802734, + "loss": 0.1536, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4621618092060089, + "rewards/margins": 2.845735788345337, + "rewards/rejected": -2.3835740089416504, + "step": 123 + }, + { + "epoch": 2.1016949152542375, + "grad_norm": 20.45027445287867, + "learning_rate": 4.613990008754565e-07, + "logits/chosen": -1.3704748153686523, + "logits/rejected": -1.2672369480133057, + "logps/chosen": -34.432945251464844, + "logps/rejected": -39.66848373413086, + "loss": 0.1588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1847638189792633, + "rewards/margins": 2.653686761856079, + "rewards/rejected": -2.4689230918884277, + "step": 124 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 20.088146504282733, + "learning_rate": 4.60404377150407e-07, + "logits/chosen": -1.2935415506362915, + "logits/rejected": -1.2802854776382446, + "logps/chosen": -26.330522537231445, + "logps/rejected": -41.638004302978516, + "loss": 0.1543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15763472020626068, + "rewards/margins": 2.74660587310791, + "rewards/rejected": -2.904240131378174, + "step": 125 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 23.38109010648461, + "learning_rate": 4.593982023925925e-07, + "logits/chosen": -1.1731081008911133, + "logits/rejected": -1.0896246433258057, + "logps/chosen": -33.23085021972656, + "logps/rejected": -40.83133316040039, + "loss": 0.1689, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19674523174762726, + "rewards/margins": 2.6324305534362793, + "rewards/rejected": -2.8291759490966797, + "step": 126 + }, + { + "epoch": 2.152542372881356, + "grad_norm": 19.53451419738541, + "learning_rate": 4.58380531840206e-07, + "logits/chosen": -1.3832257986068726, + "logits/rejected": -1.1368017196655273, + "logps/chosen": -31.488880157470703, + "logps/rejected": -37.851097106933594, + "loss": 0.1448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38223376870155334, + "rewards/margins": 3.1422624588012695, + "rewards/rejected": -2.76002836227417, + "step": 127 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 19.597219846897513, + "learning_rate": 4.5735142136255045e-07, + "logits/chosen": -1.3937269449234009, + "logits/rejected": -1.3436622619628906, + "logps/chosen": -28.44049644470215, + "logps/rejected": -49.33514404296875, + "loss": 0.1489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09995569288730621, + "rewards/margins": 3.583566665649414, + "rewards/rejected": -3.4836111068725586, + "step": 128 + }, + { + "epoch": 2.1864406779661016, + "grad_norm": 18.394803732689535, + "learning_rate": 4.5631092745697164e-07, + "logits/chosen": -1.1625999212265015, + "logits/rejected": -1.1585655212402344, + "logps/chosen": -29.666309356689453, + "logps/rejected": -41.36751174926758, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2516571879386902, + "rewards/margins": 2.918815851211548, + "rewards/rejected": -2.667158603668213, + "step": 129 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 19.452927999271022, + "learning_rate": 4.5525910724575645e-07, + "logits/chosen": -1.298140048980713, + "logits/rejected": -1.238019585609436, + "logps/chosen": -30.736804962158203, + "logps/rejected": -49.28110885620117, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11459851264953613, + "rewards/margins": 4.15794563293457, + "rewards/rejected": -4.043347358703613, + "step": 130 + }, + { + "epoch": 2.2203389830508473, + "grad_norm": 16.937231864925646, + "learning_rate": 4.54196018472997e-07, + "logits/chosen": -1.2767530679702759, + "logits/rejected": -1.1214213371276855, + "logps/chosen": -27.268341064453125, + "logps/rejected": -52.620338439941406, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010926365852355957, + "rewards/margins": 4.242362976074219, + "rewards/rejected": -4.231436729431152, + "step": 131 + }, + { + "epoch": 2.23728813559322, + "grad_norm": 19.91080195782553, + "learning_rate": 4.5312171950142033e-07, + "logits/chosen": -1.443523645401001, + "logits/rejected": -1.3692708015441895, + "logps/chosen": -23.356483459472656, + "logps/rejected": -39.884552001953125, + "loss": 0.137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09224121272563934, + "rewards/margins": 3.541024923324585, + "rewards/rejected": -3.4487838745117188, + "step": 132 + }, + { + "epoch": 2.2542372881355934, + "grad_norm": 20.224862204629513, + "learning_rate": 4.520362693091845e-07, + "logits/chosen": -1.3375797271728516, + "logits/rejected": -1.2925443649291992, + "logps/chosen": -24.31032371520996, + "logps/rejected": -31.916282653808594, + "loss": 0.1541, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05450062453746796, + "rewards/margins": 1.724971055984497, + "rewards/rejected": -1.6704705953598022, + "step": 133 + }, + { + "epoch": 2.2711864406779663, + "grad_norm": 19.256180369703056, + "learning_rate": 4.5093972748664087e-07, + "logits/chosen": -1.3231240510940552, + "logits/rejected": -1.2512582540512085, + "logps/chosen": -35.62217330932617, + "logps/rejected": -48.332237243652344, + "loss": 0.1134, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26731669902801514, + "rewards/margins": 3.2187671661376953, + "rewards/rejected": -3.486083745956421, + "step": 134 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 17.92618734739177, + "learning_rate": 4.498321542330622e-07, + "logits/chosen": -1.4986979961395264, + "logits/rejected": -1.4225276708602905, + "logps/chosen": -26.092084884643555, + "logps/rejected": -52.08736038208008, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04193298518657684, + "rewards/margins": 3.300752639770508, + "rewards/rejected": -3.3426852226257324, + "step": 135 + }, + { + "epoch": 2.305084745762712, + "grad_norm": 19.524384947131523, + "learning_rate": 4.4871361035333833e-07, + "logits/chosen": -1.302308201789856, + "logits/rejected": -1.306333065032959, + "logps/chosen": -25.217451095581055, + "logps/rejected": -43.95090103149414, + "loss": 0.1352, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24911952018737793, + "rewards/margins": 3.123009204864502, + "rewards/rejected": -2.873889684677124, + "step": 136 + }, + { + "epoch": 2.3220338983050848, + "grad_norm": 21.654262830345974, + "learning_rate": 4.475841572546374e-07, + "logits/chosen": -1.333143711090088, + "logits/rejected": -1.1934046745300293, + "logps/chosen": -29.572952270507812, + "logps/rejected": -42.41865539550781, + "loss": 0.1608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2631007432937622, + "rewards/margins": 2.9843344688415527, + "rewards/rejected": -3.2474350929260254, + "step": 137 + }, + { + "epoch": 2.3389830508474576, + "grad_norm": 18.2902071706345, + "learning_rate": 4.464438569430353e-07, + "logits/chosen": -1.4010558128356934, + "logits/rejected": -1.3596662282943726, + "logps/chosen": -27.541452407836914, + "logps/rejected": -37.1332893371582, + "loss": 0.1122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17605237662792206, + "rewards/margins": 2.4087536334991455, + "rewards/rejected": -2.232701301574707, + "step": 138 + }, + { + "epoch": 2.3559322033898304, + "grad_norm": 20.193211619680806, + "learning_rate": 4.452927720201112e-07, + "logits/chosen": -1.1419758796691895, + "logits/rejected": -1.2467293739318848, + "logps/chosen": -24.366790771484375, + "logps/rejected": -44.22784423828125, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2860787510871887, + "rewards/margins": 3.2284557819366455, + "rewards/rejected": -2.9423768520355225, + "step": 139 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 16.846442500740498, + "learning_rate": 4.441309656795106e-07, + "logits/chosen": -1.3143264055252075, + "logits/rejected": -1.1588001251220703, + "logps/chosen": -25.230070114135742, + "logps/rejected": -50.344722747802734, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2081325650215149, + "rewards/margins": 3.1986870765686035, + "rewards/rejected": -2.990554094314575, + "step": 140 + }, + { + "epoch": 2.389830508474576, + "grad_norm": 19.91817673644514, + "learning_rate": 4.429585017034766e-07, + "logits/chosen": -1.355256199836731, + "logits/rejected": -1.3759305477142334, + "logps/chosen": -30.238567352294922, + "logps/rejected": -43.92167663574219, + "loss": 0.1377, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17302727699279785, + "rewards/margins": 3.385795831680298, + "rewards/rejected": -3.558823347091675, + "step": 141 + }, + { + "epoch": 2.406779661016949, + "grad_norm": 17.91719154232722, + "learning_rate": 4.417754444593478e-07, + "logits/chosen": -1.452804684638977, + "logits/rejected": -1.394730567932129, + "logps/chosen": -29.204275131225586, + "logps/rejected": -45.112300872802734, + "loss": 0.1072, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046540290117263794, + "rewards/margins": 4.2701945304870605, + "rewards/rejected": -4.223654270172119, + "step": 142 + }, + { + "epoch": 2.423728813559322, + "grad_norm": 17.305416636387555, + "learning_rate": 4.4058185889602497e-07, + "logits/chosen": -1.375995397567749, + "logits/rejected": -1.3006415367126465, + "logps/chosen": -17.108240127563477, + "logps/rejected": -35.922874450683594, + "loss": 0.1445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28150078654289246, + "rewards/margins": 3.295111656188965, + "rewards/rejected": -3.01361083984375, + "step": 143 + }, + { + "epoch": 2.440677966101695, + "grad_norm": 23.303228169178016, + "learning_rate": 4.39377810540405e-07, + "logits/chosen": -1.4052019119262695, + "logits/rejected": -1.4526053667068481, + "logps/chosen": -41.58496856689453, + "logps/rejected": -40.82295227050781, + "loss": 0.1742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30576008558273315, + "rewards/margins": 2.3252716064453125, + "rewards/rejected": -2.6310315132141113, + "step": 144 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 19.121378802553227, + "learning_rate": 4.38163365493784e-07, + "logits/chosen": -1.5245730876922607, + "logits/rejected": -1.401250958442688, + "logps/chosen": -34.83244705200195, + "logps/rejected": -64.91217041015625, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20400622487068176, + "rewards/margins": 4.261959552764893, + "rewards/rejected": -4.057952880859375, + "step": 145 + }, + { + "epoch": 2.4745762711864407, + "grad_norm": 19.034390396216985, + "learning_rate": 4.3693859042822774e-07, + "logits/chosen": -1.2971210479736328, + "logits/rejected": -1.2576966285705566, + "logps/chosen": -33.01571273803711, + "logps/rejected": -45.87381362915039, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19635039567947388, + "rewards/margins": 3.7301583290100098, + "rewards/rejected": -3.5338077545166016, + "step": 146 + }, + { + "epoch": 2.4915254237288136, + "grad_norm": 18.357502635593228, + "learning_rate": 4.3570355258291223e-07, + "logits/chosen": -1.2477927207946777, + "logits/rejected": -1.2029328346252441, + "logps/chosen": -28.15603256225586, + "logps/rejected": -39.89695358276367, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18680232763290405, + "rewards/margins": 2.8544976711273193, + "rewards/rejected": -2.6676955223083496, + "step": 147 + }, + { + "epoch": 2.5084745762711864, + "grad_norm": 13.757078257538927, + "learning_rate": 4.344583197604318e-07, + "logits/chosen": -1.280619740486145, + "logits/rejected": -1.2003180980682373, + "logps/chosen": -24.585142135620117, + "logps/rejected": -49.09882354736328, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009844973683357239, + "rewards/margins": 3.587639331817627, + "rewards/rejected": -3.5974843502044678, + "step": 148 + }, + { + "epoch": 2.5254237288135593, + "grad_norm": 20.493002641349737, + "learning_rate": 4.332029603230767e-07, + "logits/chosen": -1.2804765701293945, + "logits/rejected": -1.2299047708511353, + "logps/chosen": -38.74854278564453, + "logps/rejected": -40.60469055175781, + "loss": 0.1209, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31216520071029663, + "rewards/margins": 3.550785541534424, + "rewards/rejected": -3.2386200428009033, + "step": 149 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 18.873521616647515, + "learning_rate": 4.319375431890806e-07, + "logits/chosen": -1.5655155181884766, + "logits/rejected": -1.5413960218429565, + "logps/chosen": -29.58742904663086, + "logps/rejected": -38.81654739379883, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10849936306476593, + "rewards/margins": 4.284536361694336, + "rewards/rejected": -4.393035888671875, + "step": 150 + }, + { + "epoch": 2.559322033898305, + "grad_norm": 19.108581899079446, + "learning_rate": 4.306621378288364e-07, + "logits/chosen": -1.227691650390625, + "logits/rejected": -1.206729769706726, + "logps/chosen": -29.225234985351562, + "logps/rejected": -47.78348159790039, + "loss": 0.1189, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18846926093101501, + "rewards/margins": 3.5759382247924805, + "rewards/rejected": -3.3874690532684326, + "step": 151 + }, + { + "epoch": 2.576271186440678, + "grad_norm": 17.54679094674662, + "learning_rate": 4.2937681426108275e-07, + "logits/chosen": -1.2980151176452637, + "logits/rejected": -1.2792203426361084, + "logps/chosen": -29.963348388671875, + "logps/rejected": -36.31569290161133, + "loss": 0.119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15102972090244293, + "rewards/margins": 2.2603495121002197, + "rewards/rejected": -2.411379098892212, + "step": 152 + }, + { + "epoch": 2.593220338983051, + "grad_norm": 20.293681732169816, + "learning_rate": 4.280816430490602e-07, + "logits/chosen": -1.6751010417938232, + "logits/rejected": -1.579331874847412, + "logps/chosen": -27.965721130371094, + "logps/rejected": -41.831912994384766, + "loss": 0.1398, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03810068964958191, + "rewards/margins": 3.356261968612671, + "rewards/rejected": -3.3181610107421875, + "step": 153 + }, + { + "epoch": 2.610169491525424, + "grad_norm": 18.30214205628312, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -1.2230945825576782, + "logits/rejected": -1.3257890939712524, + "logps/chosen": -22.84085464477539, + "logps/rejected": -30.21630859375, + "loss": 0.1296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4078897535800934, + "rewards/margins": 2.349480152130127, + "rewards/rejected": -1.9415905475616455, + "step": 154 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 16.933166318116392, + "learning_rate": 4.254620426444053e-07, + "logits/chosen": -1.2983089685440063, + "logits/rejected": -1.2278701066970825, + "logps/chosen": -28.5270938873291, + "logps/rejected": -47.79298400878906, + "loss": 0.1053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39613279700279236, + "rewards/margins": 4.555056571960449, + "rewards/rejected": -4.158923625946045, + "step": 155 + }, + { + "epoch": 2.6440677966101696, + "grad_norm": 16.954026895333676, + "learning_rate": 4.2413775726574923e-07, + "logits/chosen": -1.4136017560958862, + "logits/rejected": -1.308146357536316, + "logps/chosen": -26.033409118652344, + "logps/rejected": -45.159584045410156, + "loss": 0.1098, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3105984926223755, + "rewards/margins": 3.2515172958374023, + "rewards/rejected": -3.5621156692504883, + "step": 156 + }, + { + "epoch": 2.6610169491525424, + "grad_norm": 23.018543543838756, + "learning_rate": 4.228039118628815e-07, + "logits/chosen": -1.3158849477767944, + "logits/rejected": -1.1991815567016602, + "logps/chosen": -25.21765899658203, + "logps/rejected": -40.39363479614258, + "loss": 0.1629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04916132241487503, + "rewards/margins": 2.8159801959991455, + "rewards/rejected": -2.7668187618255615, + "step": 157 + }, + { + "epoch": 2.6779661016949152, + "grad_norm": 18.04681645919104, + "learning_rate": 4.214605796628526e-07, + "logits/chosen": -1.3597509860992432, + "logits/rejected": -1.375614881515503, + "logps/chosen": -26.287364959716797, + "logps/rejected": -41.234405517578125, + "loss": 0.1155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18970844149589539, + "rewards/margins": 3.569676399230957, + "rewards/rejected": -3.3799679279327393, + "step": 158 + }, + { + "epoch": 2.694915254237288, + "grad_norm": 14.068268306629102, + "learning_rate": 4.201078344135306e-07, + "logits/chosen": -1.497442364692688, + "logits/rejected": -1.4510717391967773, + "logps/chosen": -27.79704475402832, + "logps/rejected": -43.55573272705078, + "loss": 0.101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.037288397550582886, + "rewards/margins": 3.6583452224731445, + "rewards/rejected": -3.6210567951202393, + "step": 159 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 20.29404199604278, + "learning_rate": 4.187457503795526e-07, + "logits/chosen": -1.6163471937179565, + "logits/rejected": -1.5419741868972778, + "logps/chosen": -26.61087417602539, + "logps/rejected": -30.924564361572266, + "loss": 0.1336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18512818217277527, + "rewards/margins": 2.91485857963562, + "rewards/rejected": -2.7297308444976807, + "step": 160 + }, + { + "epoch": 2.7288135593220337, + "grad_norm": 14.433423874260647, + "learning_rate": 4.173744023382474e-07, + "logits/chosen": -1.6092435121536255, + "logits/rejected": -1.4594898223876953, + "logps/chosen": -24.235836029052734, + "logps/rejected": -42.99752426147461, + "loss": 0.0838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11106225848197937, + "rewards/margins": 3.523545742034912, + "rewards/rejected": -3.6346077919006348, + "step": 161 + }, + { + "epoch": 2.7457627118644066, + "grad_norm": 18.073004370156372, + "learning_rate": 4.159938655755306e-07, + "logits/chosen": -1.2371647357940674, + "logits/rejected": -1.249834656715393, + "logps/chosen": -26.12664031982422, + "logps/rejected": -41.84228515625, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07593405246734619, + "rewards/margins": 3.008237361907959, + "rewards/rejected": -2.9323031902313232, + "step": 162 + }, + { + "epoch": 2.7627118644067794, + "grad_norm": 17.580682721769048, + "learning_rate": 4.1460421588177094e-07, + "logits/chosen": -1.3883872032165527, + "logits/rejected": -1.284624695777893, + "logps/chosen": -24.988061904907227, + "logps/rejected": -44.79413986206055, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04829689860343933, + "rewards/margins": 4.283046722412109, + "rewards/rejected": -4.234749794006348, + "step": 163 + }, + { + "epoch": 2.7796610169491527, + "grad_norm": 14.683936946298637, + "learning_rate": 4.1320552954763037e-07, + "logits/chosen": -1.3138792514801025, + "logits/rejected": -1.3704159259796143, + "logps/chosen": -34.95574188232422, + "logps/rejected": -41.10405731201172, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08980777859687805, + "rewards/margins": 3.34732723236084, + "rewards/rejected": -3.257519483566284, + "step": 164 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 19.53577404998919, + "learning_rate": 4.117978833598747e-07, + "logits/chosen": -1.4069619178771973, + "logits/rejected": -1.3315298557281494, + "logps/chosen": -38.807586669921875, + "logps/rejected": -41.68088912963867, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32595694065093994, + "rewards/margins": 2.6362242698669434, + "rewards/rejected": -2.962181329727173, + "step": 165 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 15.630747955128056, + "learning_rate": 4.1038135459715885e-07, + "logits/chosen": -1.3965390920639038, + "logits/rejected": -1.3702296018600464, + "logps/chosen": -17.21418571472168, + "logps/rejected": -33.933189392089844, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29006946086883545, + "rewards/margins": 3.5608468055725098, + "rewards/rejected": -3.270777463912964, + "step": 166 + }, + { + "epoch": 2.830508474576271, + "grad_norm": 15.319247967225127, + "learning_rate": 4.0895602102578373e-07, + "logits/chosen": -1.2587236166000366, + "logits/rejected": -1.2784702777862549, + "logps/chosen": -33.745662689208984, + "logps/rejected": -53.36175537109375, + "loss": 0.0943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5272909998893738, + "rewards/margins": 3.7912838459014893, + "rewards/rejected": -4.318574905395508, + "step": 167 + }, + { + "epoch": 2.847457627118644, + "grad_norm": 17.91197797858757, + "learning_rate": 4.075219608954278e-07, + "logits/chosen": -1.220982313156128, + "logits/rejected": -1.119415044784546, + "logps/chosen": -25.41081428527832, + "logps/rejected": -43.74126434326172, + "loss": 0.1181, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16412924230098724, + "rewards/margins": 3.79860258102417, + "rewards/rejected": -3.6344728469848633, + "step": 168 + }, + { + "epoch": 2.864406779661017, + "grad_norm": 19.216931577152895, + "learning_rate": 4.0607925293484997e-07, + "logits/chosen": -1.3364936113357544, + "logits/rejected": -1.3348599672317505, + "logps/chosen": -28.252498626708984, + "logps/rejected": -36.41025161743164, + "loss": 0.1405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07597078382968903, + "rewards/margins": 2.509685516357422, + "rewards/rejected": -2.5856566429138184, + "step": 169 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 18.004667351375737, + "learning_rate": 4.046279763475687e-07, + "logits/chosen": -1.4610190391540527, + "logits/rejected": -1.467834234237671, + "logps/chosen": -23.43694305419922, + "logps/rejected": -40.96953582763672, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00839678943157196, + "rewards/margins": 3.222750186920166, + "rewards/rejected": -3.214353561401367, + "step": 170 + }, + { + "epoch": 2.898305084745763, + "grad_norm": 16.724266176013543, + "learning_rate": 4.031682108075128e-07, + "logits/chosen": -1.3930025100708008, + "logits/rejected": -1.3053603172302246, + "logps/chosen": -26.837438583374023, + "logps/rejected": -46.042606353759766, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3490511476993561, + "rewards/margins": 3.341874122619629, + "rewards/rejected": -3.6909255981445312, + "step": 171 + }, + { + "epoch": 2.915254237288136, + "grad_norm": 17.501626125315028, + "learning_rate": 4.0170003645464835e-07, + "logits/chosen": -1.4981375932693481, + "logits/rejected": -1.4686487913131714, + "logps/chosen": -31.091188430786133, + "logps/rejected": -41.67449188232422, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08511877059936523, + "rewards/margins": 3.4050216674804688, + "rewards/rejected": -3.490140438079834, + "step": 172 + }, + { + "epoch": 2.9322033898305087, + "grad_norm": 17.505812920779697, + "learning_rate": 4.0022353389057793e-07, + "logits/chosen": -1.5352020263671875, + "logits/rejected": -1.4121986627578735, + "logps/chosen": -29.729156494140625, + "logps/rejected": -48.48210525512695, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050699926912784576, + "rewards/margins": 3.282029390335083, + "rewards/rejected": -3.2313296794891357, + "step": 173 + }, + { + "epoch": 2.9491525423728815, + "grad_norm": 15.718044631960725, + "learning_rate": 3.9873878417411685e-07, + "logits/chosen": -1.3283764123916626, + "logits/rejected": -1.2386645078659058, + "logps/chosen": -31.367496490478516, + "logps/rejected": -48.3538818359375, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02908661961555481, + "rewards/margins": 4.318341255187988, + "rewards/rejected": -4.3474273681640625, + "step": 174 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 19.307114282390735, + "learning_rate": 3.97245868816842e-07, + "logits/chosen": -1.61316978931427, + "logits/rejected": -1.4183673858642578, + "logps/chosen": -25.636213302612305, + "logps/rejected": -31.219690322875977, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19882389903068542, + "rewards/margins": 2.8216686248779297, + "rewards/rejected": -2.622844696044922, + "step": 175 + }, + { + "epoch": 2.983050847457627, + "grad_norm": 19.852640661420647, + "learning_rate": 3.95744869778618e-07, + "logits/chosen": -1.3724339008331299, + "logits/rejected": -1.2494310140609741, + "logps/chosen": -37.323822021484375, + "logps/rejected": -48.69821548461914, + "loss": 0.1322, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2126758098602295, + "rewards/margins": 3.236673355102539, + "rewards/rejected": -3.4493494033813477, + "step": 176 + }, + { + "epoch": 3.0, + "grad_norm": 17.961834616955002, + "learning_rate": 3.942358694630967e-07, + "logits/chosen": -1.4469400644302368, + "logits/rejected": -1.4965747594833374, + "logps/chosen": -27.073068618774414, + "logps/rejected": -47.31336212158203, + "loss": 0.1379, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2840898931026459, + "rewards/margins": 3.4035041332244873, + "rewards/rejected": -3.687593936920166, + "step": 177 + }, + { + "epoch": 3.016949152542373, + "grad_norm": 11.404351341704286, + "learning_rate": 3.927189507131938e-07, + "logits/chosen": -1.340846300125122, + "logits/rejected": -1.3331762552261353, + "logps/chosen": -29.038299560546875, + "logps/rejected": -44.76627731323242, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44353172183036804, + "rewards/margins": 3.546970844268799, + "rewards/rejected": -3.99050235748291, + "step": 178 + }, + { + "epoch": 3.0338983050847457, + "grad_norm": 13.04527280968449, + "learning_rate": 3.9119419680654083e-07, + "logits/chosen": -1.3644622564315796, + "logits/rejected": -1.2482867240905762, + "logps/chosen": -29.963117599487305, + "logps/rejected": -44.21002960205078, + "loss": 0.0936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05338460952043533, + "rewards/margins": 3.796813488006592, + "rewards/rejected": -3.7434287071228027, + "step": 179 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 10.551592699584061, + "learning_rate": 3.896616914509131e-07, + "logits/chosen": -1.1854358911514282, + "logits/rejected": -1.2167768478393555, + "logps/chosen": -28.745718002319336, + "logps/rejected": -39.30556869506836, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07138124108314514, + "rewards/margins": 3.6734557151794434, + "rewards/rejected": -3.60207462310791, + "step": 180 + }, + { + "epoch": 3.0677966101694913, + "grad_norm": 12.613695441723, + "learning_rate": 3.881215187796344e-07, + "logits/chosen": -1.5086756944656372, + "logits/rejected": -1.4750981330871582, + "logps/chosen": -24.299089431762695, + "logps/rejected": -48.68684005737305, + "loss": 0.088, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2250959873199463, + "rewards/margins": 5.23423433303833, + "rewards/rejected": -5.0091376304626465, + "step": 181 + }, + { + "epoch": 3.084745762711864, + "grad_norm": 12.720295333105653, + "learning_rate": 3.865737633469579e-07, + "logits/chosen": -1.456554889678955, + "logits/rejected": -1.4655078649520874, + "logps/chosen": -34.70986557006836, + "logps/rejected": -45.1373405456543, + "loss": 0.0913, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5282381176948547, + "rewards/margins": 3.892458438873291, + "rewards/rejected": -4.420697212219238, + "step": 182 + }, + { + "epoch": 3.1016949152542375, + "grad_norm": 11.42456365348978, + "learning_rate": 3.8501851012342444e-07, + "logits/chosen": -1.429445505142212, + "logits/rejected": -1.2520623207092285, + "logps/chosen": -33.7725830078125, + "logps/rejected": -48.29920196533203, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20377352833747864, + "rewards/margins": 3.905850887298584, + "rewards/rejected": -4.10962438583374, + "step": 183 + }, + { + "epoch": 3.1186440677966103, + "grad_norm": 11.707964288174598, + "learning_rate": 3.834558444911977e-07, + "logits/chosen": -1.442047119140625, + "logits/rejected": -1.2768933773040771, + "logps/chosen": -31.306930541992188, + "logps/rejected": -55.08317184448242, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1050567626953125, + "rewards/margins": 4.629131317138672, + "rewards/rejected": -4.524074554443359, + "step": 184 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 12.341349720055733, + "learning_rate": 3.818858522393763e-07, + "logits/chosen": -1.4654240608215332, + "logits/rejected": -1.2980097532272339, + "logps/chosen": -25.214473724365234, + "logps/rejected": -48.52466583251953, + "loss": 0.084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06900361180305481, + "rewards/margins": 3.743170738220215, + "rewards/rejected": -3.8121743202209473, + "step": 185 + }, + { + "epoch": 3.152542372881356, + "grad_norm": 11.013651552230316, + "learning_rate": 3.8030861955928496e-07, + "logits/chosen": -1.4930305480957031, + "logits/rejected": -1.49806809425354, + "logps/chosen": -33.961334228515625, + "logps/rejected": -60.0030403137207, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1520581841468811, + "rewards/margins": 4.225584983825684, + "rewards/rejected": -4.377642631530762, + "step": 186 + }, + { + "epoch": 3.169491525423729, + "grad_norm": 11.565802520156074, + "learning_rate": 3.787242330397418e-07, + "logits/chosen": -1.151625633239746, + "logits/rejected": -1.229320764541626, + "logps/chosen": -27.614501953125, + "logps/rejected": -46.39374542236328, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1534918248653412, + "rewards/margins": 3.757617235183716, + "rewards/rejected": -3.6041250228881836, + "step": 187 + }, + { + "epoch": 3.1864406779661016, + "grad_norm": 11.91938723600508, + "learning_rate": 3.7713277966230513e-07, + "logits/chosen": -1.3814018964767456, + "logits/rejected": -1.3967022895812988, + "logps/chosen": -39.28313446044922, + "logps/rejected": -51.139671325683594, + "loss": 0.0781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14791953563690186, + "rewards/margins": 3.787137508392334, + "rewards/rejected": -3.639218330383301, + "step": 188 + }, + { + "epoch": 3.2033898305084745, + "grad_norm": 11.7798233218555, + "learning_rate": 3.755343467964981e-07, + "logits/chosen": -1.4196237325668335, + "logits/rejected": -1.281465768814087, + "logps/chosen": -30.973888397216797, + "logps/rejected": -60.48612976074219, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04596884548664093, + "rewards/margins": 5.157370567321777, + "rewards/rejected": -5.203339099884033, + "step": 189 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 9.848668366337494, + "learning_rate": 3.739290221950123e-07, + "logits/chosen": -1.4920192956924438, + "logits/rejected": -1.3025527000427246, + "logps/chosen": -20.232595443725586, + "logps/rejected": -40.525020599365234, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6132769584655762, + "rewards/margins": 4.133052349090576, + "rewards/rejected": -3.519775390625, + "step": 190 + }, + { + "epoch": 3.23728813559322, + "grad_norm": 11.384419516356248, + "learning_rate": 3.723168939888901e-07, + "logits/chosen": -1.4284577369689941, + "logits/rejected": -1.3038253784179688, + "logps/chosen": -36.025821685791016, + "logps/rejected": -47.912906646728516, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4135657548904419, + "rewards/margins": 4.515513896942139, + "rewards/rejected": -4.101947784423828, + "step": 191 + }, + { + "epoch": 3.2542372881355934, + "grad_norm": 12.741788906869417, + "learning_rate": 3.7069805068268624e-07, + "logits/chosen": -1.190822958946228, + "logits/rejected": -1.2386127710342407, + "logps/chosen": -25.53938865661621, + "logps/rejected": -41.572757720947266, + "loss": 0.0968, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21493886411190033, + "rewards/margins": 3.464285373687744, + "rewards/rejected": -3.6792244911193848, + "step": 192 + }, + { + "epoch": 3.2711864406779663, + "grad_norm": 11.543121161706487, + "learning_rate": 3.6907258114960915e-07, + "logits/chosen": -1.44771146774292, + "logits/rejected": -1.3371340036392212, + "logps/chosen": -22.689008712768555, + "logps/rejected": -33.55266571044922, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20511241257190704, + "rewards/margins": 3.822577953338623, + "rewards/rejected": -3.617465019226074, + "step": 193 + }, + { + "epoch": 3.288135593220339, + "grad_norm": 12.699281503282757, + "learning_rate": 3.6744057462664194e-07, + "logits/chosen": -1.2974333763122559, + "logits/rejected": -1.2633615732192993, + "logps/chosen": -35.57712936401367, + "logps/rejected": -41.26565170288086, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1042805016040802, + "rewards/margins": 3.9843153953552246, + "rewards/rejected": -4.088595867156982, + "step": 194 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 11.346679553734594, + "learning_rate": 3.658021207096432e-07, + "logits/chosen": -1.250510811805725, + "logits/rejected": -1.2325608730316162, + "logps/chosen": -27.381729125976562, + "logps/rejected": -39.44231414794922, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14005836844444275, + "rewards/margins": 3.225729465484619, + "rewards/rejected": -3.0856711864471436, + "step": 195 + }, + { + "epoch": 3.3220338983050848, + "grad_norm": 11.675211775416328, + "learning_rate": 3.6415730934842825e-07, + "logits/chosen": -1.4330198764801025, + "logits/rejected": -1.2990684509277344, + "logps/chosen": -26.091453552246094, + "logps/rejected": -37.71536636352539, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5305294990539551, + "rewards/margins": 3.7357966899871826, + "rewards/rejected": -3.2052674293518066, + "step": 196 + }, + { + "epoch": 3.3389830508474576, + "grad_norm": 10.595079308232455, + "learning_rate": 3.625062308418311e-07, + "logits/chosen": -1.4256861209869385, + "logits/rejected": -1.3300725221633911, + "logps/chosen": -44.14484786987305, + "logps/rejected": -49.18852233886719, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23929768800735474, + "rewards/margins": 3.831993818283081, + "rewards/rejected": -4.071290969848633, + "step": 197 + }, + { + "epoch": 3.3559322033898304, + "grad_norm": 10.589818972158676, + "learning_rate": 3.6084897583274715e-07, + "logits/chosen": -1.5123655796051025, + "logits/rejected": -1.4504189491271973, + "logps/chosen": -21.742530822753906, + "logps/rejected": -43.59285354614258, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08142367005348206, + "rewards/margins": 4.206247329711914, + "rewards/rejected": -4.124823570251465, + "step": 198 + }, + { + "epoch": 3.3728813559322033, + "grad_norm": 10.064171955563651, + "learning_rate": 3.591856353031566e-07, + "logits/chosen": -1.436945915222168, + "logits/rejected": -1.475320816040039, + "logps/chosen": -22.611244201660156, + "logps/rejected": -43.448341369628906, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2055773138999939, + "rewards/margins": 4.247138500213623, + "rewards/rejected": -4.041561126708984, + "step": 199 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 9.40202931235145, + "learning_rate": 3.5751630056913013e-07, + "logits/chosen": -1.5867615938186646, + "logits/rejected": -1.559362769126892, + "logps/chosen": -27.019208908081055, + "logps/rejected": -38.67339324951172, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21613261103630066, + "rewards/margins": 3.4724230766296387, + "rewards/rejected": -3.2562904357910156, + "step": 200 + }, + { + "epoch": 3.406779661016949, + "grad_norm": 10.1184235462117, + "learning_rate": 3.558410632758153e-07, + "logits/chosen": -1.4771666526794434, + "logits/rejected": -1.4069215059280396, + "logps/chosen": -23.256723403930664, + "logps/rejected": -42.76215744018555, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2688101828098297, + "rewards/margins": 4.244396686553955, + "rewards/rejected": -3.9755868911743164, + "step": 201 + }, + { + "epoch": 3.423728813559322, + "grad_norm": 13.152928025513987, + "learning_rate": 3.5416001539240574e-07, + "logits/chosen": -1.5284346342086792, + "logits/rejected": -1.4866607189178467, + "logps/chosen": -24.796672821044922, + "logps/rejected": -51.007835388183594, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19576720893383026, + "rewards/margins": 4.06268835067749, + "rewards/rejected": -4.258455753326416, + "step": 202 + }, + { + "epoch": 3.440677966101695, + "grad_norm": 11.395001548472651, + "learning_rate": 3.5247324920709147e-07, + "logits/chosen": -1.3313159942626953, + "logits/rejected": -1.2498857975006104, + "logps/chosen": -30.9143009185791, + "logps/rejected": -41.243194580078125, + "loss": 0.07, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04568520188331604, + "rewards/margins": 3.3133530616760254, + "rewards/rejected": -3.267667770385742, + "step": 203 + }, + { + "epoch": 3.457627118644068, + "grad_norm": 9.512575018928173, + "learning_rate": 3.5078085732199307e-07, + "logits/chosen": -1.567063331604004, + "logits/rejected": -1.4618712663650513, + "logps/chosen": -25.394840240478516, + "logps/rejected": -49.12550354003906, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02158541977405548, + "rewards/margins": 4.889016628265381, + "rewards/rejected": -4.867431163787842, + "step": 204 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 11.586078932679674, + "learning_rate": 3.490829326480773e-07, + "logits/chosen": -1.4906607866287231, + "logits/rejected": -1.3863712549209595, + "logps/chosen": -31.574989318847656, + "logps/rejected": -48.25616455078125, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011842876672744751, + "rewards/margins": 4.549272537231445, + "rewards/rejected": -4.561115741729736, + "step": 205 + }, + { + "epoch": 3.4915254237288136, + "grad_norm": 12.547434003386876, + "learning_rate": 3.4737956840005684e-07, + "logits/chosen": -1.3435784578323364, + "logits/rejected": -1.3757704496383667, + "logps/chosen": -25.303531646728516, + "logps/rejected": -39.15208053588867, + "loss": 0.086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02988174557685852, + "rewards/margins": 3.5366082191467285, + "rewards/rejected": -3.5067262649536133, + "step": 206 + }, + { + "epoch": 3.5084745762711864, + "grad_norm": 9.788457273992568, + "learning_rate": 3.4567085809127245e-07, + "logits/chosen": -1.5033990144729614, + "logits/rejected": -1.4766664505004883, + "logps/chosen": -26.12259864807129, + "logps/rejected": -52.18391418457031, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08881500363349915, + "rewards/margins": 4.650575160980225, + "rewards/rejected": -4.561759948730469, + "step": 207 + }, + { + "epoch": 3.5254237288135593, + "grad_norm": 11.249455307964162, + "learning_rate": 3.439568955285595e-07, + "logits/chosen": -1.600437045097351, + "logits/rejected": -1.5185781717300415, + "logps/chosen": -20.018888473510742, + "logps/rejected": -41.524349212646484, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08864006400108337, + "rewards/margins": 3.782201051712036, + "rewards/rejected": -3.8708412647247314, + "step": 208 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 9.990126506747924, + "learning_rate": 3.4223777480709804e-07, + "logits/chosen": -1.3552483320236206, + "logits/rejected": -1.2629144191741943, + "logps/chosen": -21.040348052978516, + "logps/rejected": -41.19926071166992, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.106040358543396, + "rewards/margins": 4.690126419067383, + "rewards/rejected": -4.796167373657227, + "step": 209 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 9.838092383663449, + "learning_rate": 3.405135903052465e-07, + "logits/chosen": -1.4207416772842407, + "logits/rejected": -1.2667282819747925, + "logps/chosen": -30.103904724121094, + "logps/rejected": -46.88092803955078, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3432961106300354, + "rewards/margins": 4.319201946258545, + "rewards/rejected": -4.6624979972839355, + "step": 210 + }, + { + "epoch": 3.576271186440678, + "grad_norm": 9.67907653448832, + "learning_rate": 3.3878443667936136e-07, + "logits/chosen": -1.2791118621826172, + "logits/rejected": -1.2004616260528564, + "logps/chosen": -40.10679244995117, + "logps/rejected": -60.650394439697266, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6684077978134155, + "rewards/margins": 4.151851654052734, + "rewards/rejected": -4.820259094238281, + "step": 211 + }, + { + "epoch": 3.593220338983051, + "grad_norm": 9.853143082693608, + "learning_rate": 3.3705040885859967e-07, + "logits/chosen": -1.5361111164093018, + "logits/rejected": -1.4038530588150024, + "logps/chosen": -37.90250015258789, + "logps/rejected": -45.059024810791016, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030466020107269287, + "rewards/margins": 3.964195728302002, + "rewards/rejected": -3.9672420024871826, + "step": 212 + }, + { + "epoch": 3.610169491525424, + "grad_norm": 9.44047901416316, + "learning_rate": 3.3531160203970805e-07, + "logits/chosen": -1.4581642150878906, + "logits/rejected": -1.4039422273635864, + "logps/chosen": -30.59682846069336, + "logps/rejected": -44.48968505859375, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1339409351348877, + "rewards/margins": 4.1027374267578125, + "rewards/rejected": -4.236677646636963, + "step": 213 + }, + { + "epoch": 3.6271186440677967, + "grad_norm": 11.435287785470077, + "learning_rate": 3.3356811168179627e-07, + "logits/chosen": -1.2856285572052002, + "logits/rejected": -1.177187204360962, + "logps/chosen": -28.70745086669922, + "logps/rejected": -39.22813415527344, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.319486528635025, + "rewards/margins": 4.963109016418457, + "rewards/rejected": -4.643622398376465, + "step": 214 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 10.041339419341044, + "learning_rate": 3.318200335010967e-07, + "logits/chosen": -1.720375418663025, + "logits/rejected": -1.5207815170288086, + "logps/chosen": -26.1734676361084, + "logps/rejected": -41.08108901977539, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5092735290527344, + "rewards/margins": 4.717857360839844, + "rewards/rejected": -4.208584308624268, + "step": 215 + }, + { + "epoch": 3.6610169491525424, + "grad_norm": 10.722521061649259, + "learning_rate": 3.3006746346570935e-07, + "logits/chosen": -1.5194891691207886, + "logits/rejected": -1.5225661993026733, + "logps/chosen": -21.88874626159668, + "logps/rejected": -31.313539505004883, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42684128880500793, + "rewards/margins": 3.6699740886688232, + "rewards/rejected": -3.2431328296661377, + "step": 216 + }, + { + "epoch": 3.6779661016949152, + "grad_norm": 11.239607248752012, + "learning_rate": 3.2831049779033395e-07, + "logits/chosen": -1.585048794746399, + "logits/rejected": -1.447473406791687, + "logps/chosen": -44.55853271484375, + "logps/rejected": -62.56214141845703, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5582241415977478, + "rewards/margins": 5.07534122467041, + "rewards/rejected": -5.633565425872803, + "step": 217 + }, + { + "epoch": 3.694915254237288, + "grad_norm": 11.121200361780803, + "learning_rate": 3.2654923293098666e-07, + "logits/chosen": -1.4354138374328613, + "logits/rejected": -1.431335210800171, + "logps/chosen": -29.88983917236328, + "logps/rejected": -41.21036911010742, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12309768795967102, + "rewards/margins": 4.238857746124268, + "rewards/rejected": -4.361955642700195, + "step": 218 + }, + { + "epoch": 3.711864406779661, + "grad_norm": 8.46695075496944, + "learning_rate": 3.247837655797061e-07, + "logits/chosen": -1.4146300554275513, + "logits/rejected": -1.3639909029006958, + "logps/chosen": -23.79798698425293, + "logps/rejected": -42.03700256347656, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14847984910011292, + "rewards/margins": 4.2562971115112305, + "rewards/rejected": -4.107817649841309, + "step": 219 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 10.875773708222669, + "learning_rate": 3.2301419265924393e-07, + "logits/chosen": -1.3614307641983032, + "logits/rejected": -1.263061285018921, + "logps/chosen": -26.913053512573242, + "logps/rejected": -38.41120910644531, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12246422469615936, + "rewards/margins": 3.324322462081909, + "rewards/rejected": -3.2018585205078125, + "step": 220 + }, + { + "epoch": 3.7457627118644066, + "grad_norm": 10.149586335689621, + "learning_rate": 3.2124061131774443e-07, + "logits/chosen": -1.3240911960601807, + "logits/rejected": -1.3163329362869263, + "logps/chosen": -25.73955535888672, + "logps/rejected": -49.833221435546875, + "loss": 0.0624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2507280111312866, + "rewards/margins": 4.252786636352539, + "rewards/rejected": -4.002058982849121, + "step": 221 + }, + { + "epoch": 3.7627118644067794, + "grad_norm": 9.140932864263574, + "learning_rate": 3.194631189234109e-07, + "logits/chosen": -1.6033962965011597, + "logits/rejected": -1.4346346855163574, + "logps/chosen": -35.82508087158203, + "logps/rejected": -44.355918884277344, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09858936071395874, + "rewards/margins": 4.44521951675415, + "rewards/rejected": -4.543808937072754, + "step": 222 + }, + { + "epoch": 3.7796610169491527, + "grad_norm": 9.775231597153738, + "learning_rate": 3.1768181305916063e-07, + "logits/chosen": -1.4034669399261475, + "logits/rejected": -1.2983992099761963, + "logps/chosen": -36.48939514160156, + "logps/rejected": -50.544036865234375, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08162027597427368, + "rewards/margins": 4.144961357116699, + "rewards/rejected": -4.06334114074707, + "step": 223 + }, + { + "epoch": 3.7966101694915255, + "grad_norm": 11.927845816995063, + "learning_rate": 3.158967915172669e-07, + "logits/chosen": -1.4804027080535889, + "logits/rejected": -1.349886178970337, + "logps/chosen": -27.929018020629883, + "logps/rejected": -37.72224426269531, + "loss": 0.0819, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21573437750339508, + "rewards/margins": 3.54447078704834, + "rewards/rejected": -3.760205030441284, + "step": 224 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 11.052999347550179, + "learning_rate": 3.141081522939911e-07, + "logits/chosen": -1.4779460430145264, + "logits/rejected": -1.349549412727356, + "logps/chosen": -37.28151321411133, + "logps/rejected": -47.440765380859375, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3217180073261261, + "rewards/margins": 4.7478485107421875, + "rewards/rejected": -5.06956672668457, + "step": 225 + }, + { + "epoch": 3.830508474576271, + "grad_norm": 10.970513938481727, + "learning_rate": 3.1231599358420233e-07, + "logits/chosen": -1.3360522985458374, + "logits/rejected": -1.2304054498672485, + "logps/chosen": -25.628524780273438, + "logps/rejected": -38.20055389404297, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2233581840991974, + "rewards/margins": 4.349393844604492, + "rewards/rejected": -4.126035690307617, + "step": 226 + }, + { + "epoch": 3.847457627118644, + "grad_norm": 9.433851363193908, + "learning_rate": 3.105204137759867e-07, + "logits/chosen": -1.2719545364379883, + "logits/rejected": -1.330519676208496, + "logps/chosen": -34.64011001586914, + "logps/rejected": -52.32704162597656, + "loss": 0.0631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2245815396308899, + "rewards/margins": 4.7216315269470215, + "rewards/rejected": -4.9462127685546875, + "step": 227 + }, + { + "epoch": 3.864406779661017, + "grad_norm": 10.439526673557062, + "learning_rate": 3.0872151144524594e-07, + "logits/chosen": -1.6759734153747559, + "logits/rejected": -1.579871654510498, + "logps/chosen": -27.196908950805664, + "logps/rejected": -50.7559700012207, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22928810119628906, + "rewards/margins": 4.84774923324585, + "rewards/rejected": -5.077037811279297, + "step": 228 + }, + { + "epoch": 3.8813559322033897, + "grad_norm": 11.392292474810198, + "learning_rate": 3.069193853502855e-07, + "logits/chosen": -1.5869011878967285, + "logits/rejected": -1.5820071697235107, + "logps/chosen": -25.63404655456543, + "logps/rejected": -38.69296646118164, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3039775788784027, + "rewards/margins": 3.5710806846618652, + "rewards/rejected": -3.875058174133301, + "step": 229 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 10.225877059440036, + "learning_rate": 3.0511413442639297e-07, + "logits/chosen": -1.434234619140625, + "logits/rejected": -1.3351249694824219, + "logps/chosen": -24.643152236938477, + "logps/rejected": -60.13444519042969, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10867035388946533, + "rewards/margins": 6.075805187225342, + "rewards/rejected": -6.184475898742676, + "step": 230 + }, + { + "epoch": 3.915254237288136, + "grad_norm": 9.295415437658166, + "learning_rate": 3.0330585778040675e-07, + "logits/chosen": -1.3108859062194824, + "logits/rejected": -1.2643885612487793, + "logps/chosen": -24.4619140625, + "logps/rejected": -34.88125228881836, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13806024193763733, + "rewards/margins": 3.5883846282958984, + "rewards/rejected": -3.726444721221924, + "step": 231 + }, + { + "epoch": 3.9322033898305087, + "grad_norm": 9.365373350685292, + "learning_rate": 3.0149465468527457e-07, + "logits/chosen": -1.4852244853973389, + "logits/rejected": -1.5285433530807495, + "logps/chosen": -25.74740982055664, + "logps/rejected": -39.37394332885742, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005698531866073608, + "rewards/margins": 4.059448719024658, + "rewards/rejected": -4.0651469230651855, + "step": 232 + }, + { + "epoch": 3.9491525423728815, + "grad_norm": 8.974250718981809, + "learning_rate": 2.9968062457460437e-07, + "logits/chosen": -1.5756818056106567, + "logits/rejected": -1.4879854917526245, + "logps/chosen": -24.878393173217773, + "logps/rejected": -43.14397048950195, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1284530758857727, + "rewards/margins": 4.223859786987305, + "rewards/rejected": -4.3523125648498535, + "step": 233 + }, + { + "epoch": 3.9661016949152543, + "grad_norm": 11.054495120717768, + "learning_rate": 2.978638670372047e-07, + "logits/chosen": -1.4321879148483276, + "logits/rejected": -1.3549392223358154, + "logps/chosen": -35.11801528930664, + "logps/rejected": -50.857975006103516, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5975885391235352, + "rewards/margins": 4.9589338302612305, + "rewards/rejected": -5.556522846221924, + "step": 234 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 10.55567875619197, + "learning_rate": 2.9604448181161755e-07, + "logits/chosen": -1.4181180000305176, + "logits/rejected": -1.247128963470459, + "logps/chosen": -24.004146575927734, + "logps/rejected": -40.258331298828125, + "loss": 0.0771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09078972041606903, + "rewards/margins": 3.5850982666015625, + "rewards/rejected": -3.6758880615234375, + "step": 235 + }, + { + "epoch": 4.0, + "grad_norm": 8.789282244689215, + "learning_rate": 2.9422256878064324e-07, + "logits/chosen": -1.3495515584945679, + "logits/rejected": -1.3198853731155396, + "logps/chosen": -35.33921432495117, + "logps/rejected": -48.3183708190918, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6972587704658508, + "rewards/margins": 4.445403575897217, + "rewards/rejected": -5.142662525177002, + "step": 236 + }, + { + "epoch": 4.016949152542373, + "grad_norm": 6.75184146677303, + "learning_rate": 2.923982279658564e-07, + "logits/chosen": -1.633570909500122, + "logits/rejected": -1.5905429124832153, + "logps/chosen": -38.822261810302734, + "logps/rejected": -48.57697296142578, + "loss": 0.0414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40955615043640137, + "rewards/margins": 4.950255393981934, + "rewards/rejected": -5.359812259674072, + "step": 237 + }, + { + "epoch": 4.033898305084746, + "grad_norm": 9.349730549756508, + "learning_rate": 2.90571559522115e-07, + "logits/chosen": -1.0709235668182373, + "logits/rejected": -1.1218361854553223, + "logps/chosen": -29.26075553894043, + "logps/rejected": -34.48569869995117, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08125007152557373, + "rewards/margins": 3.5675737857818604, + "rewards/rejected": -3.486323833465576, + "step": 238 + }, + { + "epoch": 4.0508474576271185, + "grad_norm": 8.738621249147451, + "learning_rate": 2.8874266373206215e-07, + "logits/chosen": -1.5969672203063965, + "logits/rejected": -1.5486068725585938, + "logps/chosen": -32.66074752807617, + "logps/rejected": -43.64617919921875, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13383600115776062, + "rewards/margins": 4.7373738288879395, + "rewards/rejected": -4.603538513183594, + "step": 239 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 8.414244576812601, + "learning_rate": 2.8691164100062034e-07, + "logits/chosen": -1.4231804609298706, + "logits/rejected": -1.2792776823043823, + "logps/chosen": -31.69137954711914, + "logps/rejected": -54.63775634765625, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08693331480026245, + "rewards/margins": 5.980134963989258, + "rewards/rejected": -5.89320182800293, + "step": 240 + }, + { + "epoch": 4.084745762711864, + "grad_norm": 8.9348895906443, + "learning_rate": 2.8507859184947953e-07, + "logits/chosen": -1.4366453886032104, + "logits/rejected": -1.3272255659103394, + "logps/chosen": -26.352733612060547, + "logps/rejected": -46.89934158325195, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2997795641422272, + "rewards/margins": 3.753899574279785, + "rewards/rejected": -4.053679466247559, + "step": 241 + }, + { + "epoch": 4.101694915254237, + "grad_norm": 8.250352811668128, + "learning_rate": 2.8324361691157853e-07, + "logits/chosen": -1.374919056892395, + "logits/rejected": -1.421012043952942, + "logps/chosen": -31.536663055419922, + "logps/rejected": -55.928749084472656, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2735545039176941, + "rewards/margins": 4.6181135177612305, + "rewards/rejected": -4.891667366027832, + "step": 242 + }, + { + "epoch": 4.11864406779661, + "grad_norm": 9.40465609028747, + "learning_rate": 2.8140681692558034e-07, + "logits/chosen": -1.7210100889205933, + "logits/rejected": -1.6137436628341675, + "logps/chosen": -29.081104278564453, + "logps/rejected": -42.040245056152344, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13167661428451538, + "rewards/margins": 4.6320929527282715, + "rewards/rejected": -4.500416278839111, + "step": 243 + }, + { + "epoch": 4.135593220338983, + "grad_norm": 7.575529614099465, + "learning_rate": 2.7956829273034146e-07, + "logits/chosen": -1.3278542757034302, + "logits/rejected": -1.2801333665847778, + "logps/chosen": -28.36415672302246, + "logps/rejected": -45.0773811340332, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0763159990310669, + "rewards/margins": 4.670356750488281, + "rewards/rejected": -4.594040393829346, + "step": 244 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 7.841549437235225, + "learning_rate": 2.7772814525937634e-07, + "logits/chosen": -1.521530032157898, + "logits/rejected": -1.3261444568634033, + "logps/chosen": -25.693857192993164, + "logps/rejected": -44.95489501953125, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31940776109695435, + "rewards/margins": 4.865126132965088, + "rewards/rejected": -4.545718193054199, + "step": 245 + }, + { + "epoch": 4.169491525423728, + "grad_norm": 7.437246530691066, + "learning_rate": 2.7588647553531576e-07, + "logits/chosen": -1.3834595680236816, + "logits/rejected": -1.366625428199768, + "logps/chosen": -27.27410125732422, + "logps/rejected": -47.83835220336914, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1564972996711731, + "rewards/margins": 4.690642833709717, + "rewards/rejected": -4.534145355224609, + "step": 246 + }, + { + "epoch": 4.186440677966102, + "grad_norm": 8.374701794746429, + "learning_rate": 2.7404338466436116e-07, + "logits/chosen": -1.4731414318084717, + "logits/rejected": -1.4213758707046509, + "logps/chosen": -32.91023635864258, + "logps/rejected": -48.37405014038086, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12676534056663513, + "rewards/margins": 5.037107944488525, + "rewards/rejected": -4.910342216491699, + "step": 247 + }, + { + "epoch": 4.203389830508475, + "grad_norm": 8.684915932739882, + "learning_rate": 2.721989738307337e-07, + "logits/chosen": -1.5974880456924438, + "logits/rejected": -1.5572988986968994, + "logps/chosen": -29.11380386352539, + "logps/rejected": -43.61833953857422, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09620954096317291, + "rewards/margins": 3.6240577697753906, + "rewards/rejected": -3.527848482131958, + "step": 248 + }, + { + "epoch": 4.220338983050848, + "grad_norm": 8.422615969153167, + "learning_rate": 2.7035334429111955e-07, + "logits/chosen": -1.4860804080963135, + "logits/rejected": -1.4129899740219116, + "logps/chosen": -38.593204498291016, + "logps/rejected": -55.535247802734375, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007733196020126343, + "rewards/margins": 4.608980178833008, + "rewards/rejected": -4.616713047027588, + "step": 249 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 8.150205311828831, + "learning_rate": 2.685065973691107e-07, + "logits/chosen": -1.509846806526184, + "logits/rejected": -1.440779447555542, + "logps/chosen": -32.75022506713867, + "logps/rejected": -53.621826171875, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10550498962402344, + "rewards/margins": 5.389019966125488, + "rewards/rejected": -5.49452543258667, + "step": 250 + }, + { + "epoch": 4.254237288135593, + "grad_norm": 8.158516596968319, + "learning_rate": 2.6665883444964277e-07, + "logits/chosen": -1.2622435092926025, + "logits/rejected": -1.2126017808914185, + "logps/chosen": -22.526697158813477, + "logps/rejected": -49.31510543823242, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2732163071632385, + "rewards/margins": 5.386449337005615, + "rewards/rejected": -5.659666061401367, + "step": 251 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 8.708471545136758, + "learning_rate": 2.6481015697342856e-07, + "logits/chosen": -1.3532803058624268, + "logits/rejected": -1.263253092765808, + "logps/chosen": -18.96270751953125, + "logps/rejected": -40.195404052734375, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013826161623001099, + "rewards/margins": 4.144740581512451, + "rewards/rejected": -4.130914688110352, + "step": 252 + }, + { + "epoch": 4.288135593220339, + "grad_norm": 7.692204259130146, + "learning_rate": 2.629606664313896e-07, + "logits/chosen": -1.4633291959762573, + "logits/rejected": -1.2908146381378174, + "logps/chosen": -26.60018539428711, + "logps/rejected": -42.72929763793945, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05975392460823059, + "rewards/margins": 3.8384809494018555, + "rewards/rejected": -3.8982346057891846, + "step": 253 + }, + { + "epoch": 4.305084745762712, + "grad_norm": 7.840483273382048, + "learning_rate": 2.611104643590838e-07, + "logits/chosen": -1.3723235130310059, + "logits/rejected": -1.3505971431732178, + "logps/chosen": -22.22472381591797, + "logps/rejected": -48.225982666015625, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1287948191165924, + "rewards/margins": 4.58843994140625, + "rewards/rejected": -4.4596452713012695, + "step": 254 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 7.007309716178494, + "learning_rate": 2.592596523311317e-07, + "logits/chosen": -1.6482080221176147, + "logits/rejected": -1.5536173582077026, + "logps/chosen": -34.07274627685547, + "logps/rejected": -39.57206344604492, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029447555541992188, + "rewards/margins": 4.474746227264404, + "rewards/rejected": -4.445298194885254, + "step": 255 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 8.655754602681508, + "learning_rate": 2.5740833195563994e-07, + "logits/chosen": -1.4583147764205933, + "logits/rejected": -1.4426932334899902, + "logps/chosen": -28.750776290893555, + "logps/rejected": -42.55610656738281, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13989472389221191, + "rewards/margins": 4.060596942901611, + "rewards/rejected": -4.200491905212402, + "step": 256 + }, + { + "epoch": 4.3559322033898304, + "grad_norm": 7.666532517465646, + "learning_rate": 2.5555660486862293e-07, + "logits/chosen": -1.3961790800094604, + "logits/rejected": -1.3545511960983276, + "logps/chosen": -30.55471420288086, + "logps/rejected": -46.441307067871094, + "loss": 0.0527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.01744025945663452, + "rewards/margins": 4.828250885009766, + "rewards/rejected": -4.810810565948486, + "step": 257 + }, + { + "epoch": 4.372881355932203, + "grad_norm": 7.38904178241979, + "learning_rate": 2.5370457272842315e-07, + "logits/chosen": -1.2144039869308472, + "logits/rejected": -1.1987119913101196, + "logps/chosen": -31.387115478515625, + "logps/rejected": -44.41786193847656, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37475213408470154, + "rewards/margins": 4.554241180419922, + "rewards/rejected": -4.179489612579346, + "step": 258 + }, + { + "epoch": 4.389830508474576, + "grad_norm": 7.357227804203978, + "learning_rate": 2.5185233721013053e-07, + "logits/chosen": -1.564170479774475, + "logits/rejected": -1.5079408884048462, + "logps/chosen": -24.936302185058594, + "logps/rejected": -44.029632568359375, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1059635728597641, + "rewards/margins": 4.297895908355713, + "rewards/rejected": -4.403859615325928, + "step": 259 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 9.22648891874881, + "learning_rate": 2.5e-07, + "logits/chosen": -1.3310877084732056, + "logits/rejected": -1.3538789749145508, + "logps/chosen": -24.936809539794922, + "logps/rejected": -49.063575744628906, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07093064486980438, + "rewards/margins": 4.918918132781982, + "rewards/rejected": -4.989849090576172, + "step": 260 + }, + { + "epoch": 4.423728813559322, + "grad_norm": 6.836226567179652, + "learning_rate": 2.4814766278986944e-07, + "logits/chosen": -1.6707507371902466, + "logits/rejected": -1.5290323495864868, + "logps/chosen": -29.512426376342773, + "logps/rejected": -54.653114318847656, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0224665105342865, + "rewards/margins": 5.460002899169922, + "rewards/rejected": -5.437536239624023, + "step": 261 + }, + { + "epoch": 4.440677966101695, + "grad_norm": 8.63432296696573, + "learning_rate": 2.462954272715768e-07, + "logits/chosen": -1.5188168287277222, + "logits/rejected": -1.4816163778305054, + "logps/chosen": -31.95643424987793, + "logps/rejected": -42.647403717041016, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3785313367843628, + "rewards/margins": 4.157118320465088, + "rewards/rejected": -4.535649299621582, + "step": 262 + }, + { + "epoch": 4.4576271186440675, + "grad_norm": 8.141729762323788, + "learning_rate": 2.4444339513137716e-07, + "logits/chosen": -1.590928316116333, + "logits/rejected": -1.5425117015838623, + "logps/chosen": -30.981969833374023, + "logps/rejected": -55.22663116455078, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1410723626613617, + "rewards/margins": 5.534295082092285, + "rewards/rejected": -5.393222808837891, + "step": 263 + }, + { + "epoch": 4.47457627118644, + "grad_norm": 6.553586136504455, + "learning_rate": 2.4259166804436003e-07, + "logits/chosen": -1.6111406087875366, + "logits/rejected": -1.5660130977630615, + "logps/chosen": -30.944381713867188, + "logps/rejected": -48.914039611816406, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1980370730161667, + "rewards/margins": 5.023958683013916, + "rewards/rejected": -5.2219953536987305, + "step": 264 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 7.7695916088798045, + "learning_rate": 2.4074034766886826e-07, + "logits/chosen": -1.4081194400787354, + "logits/rejected": -1.4115426540374756, + "logps/chosen": -26.77755355834961, + "logps/rejected": -44.787628173828125, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2828730642795563, + "rewards/margins": 5.210730075836182, + "rewards/rejected": -5.493603229522705, + "step": 265 + }, + { + "epoch": 4.508474576271187, + "grad_norm": 9.138228801008614, + "learning_rate": 2.3888953564091616e-07, + "logits/chosen": -1.5158981084823608, + "logits/rejected": -1.5783250331878662, + "logps/chosen": -34.667503356933594, + "logps/rejected": -53.5840950012207, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20706957578659058, + "rewards/margins": 5.503992080688477, + "rewards/rejected": -5.711061477661133, + "step": 266 + }, + { + "epoch": 4.52542372881356, + "grad_norm": 6.507174183795552, + "learning_rate": 2.3703933356861044e-07, + "logits/chosen": -1.399531602859497, + "logits/rejected": -1.419585108757019, + "logps/chosen": -33.15267562866211, + "logps/rejected": -45.78288269042969, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5228755474090576, + "rewards/margins": 4.130328178405762, + "rewards/rejected": -4.653203964233398, + "step": 267 + }, + { + "epoch": 4.5423728813559325, + "grad_norm": 7.507653099421495, + "learning_rate": 2.3518984302657144e-07, + "logits/chosen": -1.5033714771270752, + "logits/rejected": -1.427080750465393, + "logps/chosen": -24.752490997314453, + "logps/rejected": -57.89167022705078, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4027895927429199, + "rewards/margins": 5.957125186920166, + "rewards/rejected": -6.359914779663086, + "step": 268 + }, + { + "epoch": 4.559322033898305, + "grad_norm": 7.12313349874887, + "learning_rate": 2.333411655503572e-07, + "logits/chosen": -1.4686946868896484, + "logits/rejected": -1.3002254962921143, + "logps/chosen": -29.05103302001953, + "logps/rejected": -53.72389221191406, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036399289965629578, + "rewards/margins": 5.409298419952393, + "rewards/rejected": -5.412938117980957, + "step": 269 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 7.93618380947219, + "learning_rate": 2.3149340263088927e-07, + "logits/chosen": -1.7106562852859497, + "logits/rejected": -1.5941462516784668, + "logps/chosen": -23.763492584228516, + "logps/rejected": -47.54367446899414, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3024110496044159, + "rewards/margins": 5.066253185272217, + "rewards/rejected": -4.7638421058654785, + "step": 270 + }, + { + "epoch": 4.593220338983051, + "grad_norm": 7.441146999396367, + "learning_rate": 2.296466557088805e-07, + "logits/chosen": -1.432921051979065, + "logits/rejected": -1.415549635887146, + "logps/chosen": -27.215051651000977, + "logps/rejected": -50.206668853759766, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3797184228897095, + "rewards/margins": 5.635693550109863, + "rewards/rejected": -6.015412330627441, + "step": 271 + }, + { + "epoch": 4.610169491525424, + "grad_norm": 7.203519404793784, + "learning_rate": 2.278010261692663e-07, + "logits/chosen": -1.6040518283843994, + "logits/rejected": -1.5052812099456787, + "logps/chosen": -27.241743087768555, + "logps/rejected": -47.83567810058594, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10971636325120926, + "rewards/margins": 5.601438999176025, + "rewards/rejected": -5.711155891418457, + "step": 272 + }, + { + "epoch": 4.627118644067797, + "grad_norm": 7.1515886159837505, + "learning_rate": 2.2595661533563887e-07, + "logits/chosen": -1.5058000087738037, + "logits/rejected": -1.4968637228012085, + "logps/chosen": -31.407421112060547, + "logps/rejected": -50.27019500732422, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42635661363601685, + "rewards/margins": 4.159647464752197, + "rewards/rejected": -4.58600378036499, + "step": 273 + }, + { + "epoch": 4.6440677966101696, + "grad_norm": 7.65650118033261, + "learning_rate": 2.2411352446468424e-07, + "logits/chosen": -1.3374431133270264, + "logits/rejected": -1.3260188102722168, + "logps/chosen": -21.7060489654541, + "logps/rejected": -47.063289642333984, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14809060096740723, + "rewards/margins": 4.805155277252197, + "rewards/rejected": -4.657064437866211, + "step": 274 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 8.277053566352693, + "learning_rate": 2.2227185474062374e-07, + "logits/chosen": -1.49006986618042, + "logits/rejected": -1.3998165130615234, + "logps/chosen": -24.71628761291504, + "logps/rejected": -47.67393493652344, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02221532166004181, + "rewards/margins": 4.3083624839782715, + "rewards/rejected": -4.286147117614746, + "step": 275 + }, + { + "epoch": 4.677966101694915, + "grad_norm": 7.8454820880953715, + "learning_rate": 2.2043170726965857e-07, + "logits/chosen": -1.4486889839172363, + "logits/rejected": -1.4862079620361328, + "logps/chosen": -27.138608932495117, + "logps/rejected": -44.235687255859375, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02510838210582733, + "rewards/margins": 4.74564266204834, + "rewards/rejected": -4.720534324645996, + "step": 276 + }, + { + "epoch": 4.694915254237288, + "grad_norm": 7.405600784416248, + "learning_rate": 2.1859318307441966e-07, + "logits/chosen": -1.444726586341858, + "logits/rejected": -1.4401050806045532, + "logps/chosen": -32.68350601196289, + "logps/rejected": -45.99779510498047, + "loss": 0.0469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.024221569299697876, + "rewards/margins": 5.507167816162109, + "rewards/rejected": -5.482946872711182, + "step": 277 + }, + { + "epoch": 4.711864406779661, + "grad_norm": 8.104155066599384, + "learning_rate": 2.1675638308842142e-07, + "logits/chosen": -1.2842341661453247, + "logits/rejected": -1.330397605895996, + "logps/chosen": -27.432506561279297, + "logps/rejected": -40.2528076171875, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19671624898910522, + "rewards/margins": 4.634299278259277, + "rewards/rejected": -4.4375834465026855, + "step": 278 + }, + { + "epoch": 4.728813559322034, + "grad_norm": 5.864508514550939, + "learning_rate": 2.149214081505205e-07, + "logits/chosen": -1.5249521732330322, + "logits/rejected": -1.404898762702942, + "logps/chosen": -32.8936653137207, + "logps/rejected": -37.96503829956055, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00440371036529541, + "rewards/margins": 4.577935218811035, + "rewards/rejected": -4.573531627655029, + "step": 279 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 7.425009165674246, + "learning_rate": 2.1308835899937972e-07, + "logits/chosen": -1.463742733001709, + "logits/rejected": -1.4091339111328125, + "logps/chosen": -28.56261444091797, + "logps/rejected": -44.104896545410156, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04231783747673035, + "rewards/margins": 4.795005798339844, + "rewards/rejected": -4.837324142456055, + "step": 280 + }, + { + "epoch": 4.762711864406779, + "grad_norm": 8.25860440969276, + "learning_rate": 2.112573362679379e-07, + "logits/chosen": -1.4964463710784912, + "logits/rejected": -1.355745553970337, + "logps/chosen": -37.27642059326172, + "logps/rejected": -61.14379119873047, + "loss": 0.0524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.36257362365722656, + "rewards/margins": 5.591864109039307, + "rewards/rejected": -5.22929048538208, + "step": 281 + }, + { + "epoch": 4.779661016949152, + "grad_norm": 8.825180205529128, + "learning_rate": 2.09428440477885e-07, + "logits/chosen": -1.5772864818572998, + "logits/rejected": -1.3255836963653564, + "logps/chosen": -27.248455047607422, + "logps/rejected": -47.95212173461914, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1756860911846161, + "rewards/margins": 6.540701866149902, + "rewards/rejected": -6.71638822555542, + "step": 282 + }, + { + "epoch": 4.796610169491525, + "grad_norm": 6.459544380114144, + "learning_rate": 2.0760177203414366e-07, + "logits/chosen": -1.4836134910583496, + "logits/rejected": -1.5253633260726929, + "logps/chosen": -28.740909576416016, + "logps/rejected": -38.53902816772461, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034703388810157776, + "rewards/margins": 4.7387285232543945, + "rewards/rejected": -4.773431777954102, + "step": 283 + }, + { + "epoch": 4.813559322033898, + "grad_norm": 8.201593648267453, + "learning_rate": 2.0577743121935682e-07, + "logits/chosen": -1.5238087177276611, + "logits/rejected": -1.479065179824829, + "logps/chosen": -23.523155212402344, + "logps/rejected": -49.87913131713867, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09851184487342834, + "rewards/margins": 4.452397346496582, + "rewards/rejected": -4.550909519195557, + "step": 284 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 8.037104047323464, + "learning_rate": 2.0395551818838243e-07, + "logits/chosen": -1.5587154626846313, + "logits/rejected": -1.473785638809204, + "logps/chosen": -38.94770812988281, + "logps/rejected": -56.7118034362793, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5131532549858093, + "rewards/margins": 5.551197052001953, + "rewards/rejected": -6.06434965133667, + "step": 285 + }, + { + "epoch": 4.847457627118644, + "grad_norm": 6.619262748940799, + "learning_rate": 2.021361329627953e-07, + "logits/chosen": -1.5830734968185425, + "logits/rejected": -1.4701610803604126, + "logps/chosen": -25.216768264770508, + "logps/rejected": -52.16200637817383, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12712815403938293, + "rewards/margins": 5.188778877258301, + "rewards/rejected": -5.3159074783325195, + "step": 286 + }, + { + "epoch": 4.864406779661017, + "grad_norm": 7.108190229695481, + "learning_rate": 2.003193754253957e-07, + "logits/chosen": -1.499477744102478, + "logits/rejected": -1.442081093788147, + "logps/chosen": -29.464977264404297, + "logps/rejected": -44.91366958618164, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14971515536308289, + "rewards/margins": 4.699173927307129, + "rewards/rejected": -4.848889350891113, + "step": 287 + }, + { + "epoch": 4.88135593220339, + "grad_norm": 6.501653298714645, + "learning_rate": 1.9850534531472544e-07, + "logits/chosen": -1.384242057800293, + "logits/rejected": -1.2570266723632812, + "logps/chosen": -27.370962142944336, + "logps/rejected": -41.63351058959961, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09189403057098389, + "rewards/margins": 5.2285356521606445, + "rewards/rejected": -5.136641502380371, + "step": 288 + }, + { + "epoch": 4.898305084745763, + "grad_norm": 6.999053945822559, + "learning_rate": 1.966941422195933e-07, + "logits/chosen": -1.4301297664642334, + "logits/rejected": -1.3571842908859253, + "logps/chosen": -30.701709747314453, + "logps/rejected": -47.53770065307617, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2589109241962433, + "rewards/margins": 4.9735002517700195, + "rewards/rejected": -5.232410907745361, + "step": 289 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 7.2234737195409275, + "learning_rate": 1.94885865573607e-07, + "logits/chosen": -1.5849264860153198, + "logits/rejected": -1.5440596342086792, + "logps/chosen": -22.58307647705078, + "logps/rejected": -42.317161560058594, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05918128788471222, + "rewards/margins": 4.245813846588135, + "rewards/rejected": -4.304995536804199, + "step": 290 + }, + { + "epoch": 4.932203389830509, + "grad_norm": 7.881613271557854, + "learning_rate": 1.930806146497146e-07, + "logits/chosen": -1.5185688734054565, + "logits/rejected": -1.498981237411499, + "logps/chosen": -27.832983016967773, + "logps/rejected": -44.11700439453125, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012274429202079773, + "rewards/margins": 4.80919885635376, + "rewards/rejected": -4.796924591064453, + "step": 291 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 7.929413571170103, + "learning_rate": 1.912784885547541e-07, + "logits/chosen": -1.4354244470596313, + "logits/rejected": -1.2839109897613525, + "logps/chosen": -28.58751678466797, + "logps/rejected": -52.627559661865234, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09678968787193298, + "rewards/margins": 3.94634747505188, + "rewards/rejected": -4.043137550354004, + "step": 292 + }, + { + "epoch": 4.966101694915254, + "grad_norm": 8.10839834515343, + "learning_rate": 1.8947958622401328e-07, + "logits/chosen": -1.2912473678588867, + "logits/rejected": -1.3136945962905884, + "logps/chosen": -28.488567352294922, + "logps/rejected": -44.88406753540039, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5085054636001587, + "rewards/margins": 4.020839691162109, + "rewards/rejected": -4.529345512390137, + "step": 293 + }, + { + "epoch": 4.983050847457627, + "grad_norm": 6.68942425307024, + "learning_rate": 1.876840064157976e-07, + "logits/chosen": -1.4644904136657715, + "logits/rejected": -1.4102816581726074, + "logps/chosen": -26.936355590820312, + "logps/rejected": -47.06075668334961, + "loss": 0.0416, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07672211527824402, + "rewards/margins": 4.622737884521484, + "rewards/rejected": -4.699460029602051, + "step": 294 + }, + { + "epoch": 5.0, + "grad_norm": 8.672333086068912, + "learning_rate": 1.858918477060089e-07, + "logits/chosen": -1.4006508588790894, + "logits/rejected": -1.4321238994598389, + "logps/chosen": -25.137971878051758, + "logps/rejected": -42.432716369628906, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02563352882862091, + "rewards/margins": 4.927007675170898, + "rewards/rejected": -4.952641010284424, + "step": 295 + }, + { + "epoch": 5.016949152542373, + "grad_norm": 6.442205099947888, + "learning_rate": 1.8410320848273313e-07, + "logits/chosen": -1.4923574924468994, + "logits/rejected": -1.4842016696929932, + "logps/chosen": -26.232227325439453, + "logps/rejected": -48.02200698852539, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0712697505950928, + "rewards/margins": 5.136242866516113, + "rewards/rejected": -6.207512855529785, + "step": 296 + }, + { + "epoch": 5.033898305084746, + "grad_norm": 6.132421236608343, + "learning_rate": 1.8231818694083938e-07, + "logits/chosen": -1.4439387321472168, + "logits/rejected": -1.4443602561950684, + "logps/chosen": -39.26667404174805, + "logps/rejected": -56.92470169067383, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1077936589717865, + "rewards/margins": 6.438662528991699, + "rewards/rejected": -6.546456336975098, + "step": 297 + }, + { + "epoch": 5.0508474576271185, + "grad_norm": 5.496092091450637, + "learning_rate": 1.8053688107658905e-07, + "logits/chosen": -1.2399489879608154, + "logits/rejected": -1.1487674713134766, + "logps/chosen": -24.482892990112305, + "logps/rejected": -38.75669860839844, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1850048452615738, + "rewards/margins": 4.235044479370117, + "rewards/rejected": -4.050039291381836, + "step": 298 + }, + { + "epoch": 5.067796610169491, + "grad_norm": 6.371224857913382, + "learning_rate": 1.787593886822556e-07, + "logits/chosen": -1.5012649297714233, + "logits/rejected": -1.5702931880950928, + "logps/chosen": -26.034366607666016, + "logps/rejected": -54.3376579284668, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.206559419631958, + "rewards/margins": 5.6937360763549805, + "rewards/rejected": -5.900295257568359, + "step": 299 + }, + { + "epoch": 5.084745762711864, + "grad_norm": 6.572183005650043, + "learning_rate": 1.7698580734075607e-07, + "logits/chosen": -1.48176908493042, + "logits/rejected": -1.4655022621154785, + "logps/chosen": -27.89720344543457, + "logps/rejected": -45.596282958984375, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14924107491970062, + "rewards/margins": 4.559385299682617, + "rewards/rejected": -4.7086262702941895, + "step": 300 + }, + { + "epoch": 5.101694915254237, + "grad_norm": 6.034869634249788, + "learning_rate": 1.7521623442029388e-07, + "logits/chosen": -1.5313202142715454, + "logits/rejected": -1.564162015914917, + "logps/chosen": -22.828670501708984, + "logps/rejected": -48.36279296875, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2727990448474884, + "rewards/margins": 4.790719985961914, + "rewards/rejected": -4.51792049407959, + "step": 301 + }, + { + "epoch": 5.11864406779661, + "grad_norm": 6.764929177752538, + "learning_rate": 1.7345076706901326e-07, + "logits/chosen": -1.4571880102157593, + "logits/rejected": -1.4540932178497314, + "logps/chosen": -30.096073150634766, + "logps/rejected": -55.99176788330078, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1132214367389679, + "rewards/margins": 5.543183326721191, + "rewards/rejected": -5.656404972076416, + "step": 302 + }, + { + "epoch": 5.135593220338983, + "grad_norm": 7.398683409083888, + "learning_rate": 1.7168950220966614e-07, + "logits/chosen": -1.430177927017212, + "logits/rejected": -1.482927918434143, + "logps/chosen": -30.467327117919922, + "logps/rejected": -45.27473831176758, + "loss": 0.0516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06274604797363281, + "rewards/margins": 4.2044267654418945, + "rewards/rejected": -4.267173767089844, + "step": 303 + }, + { + "epoch": 5.1525423728813555, + "grad_norm": 5.9915990011633555, + "learning_rate": 1.6993253653429062e-07, + "logits/chosen": -1.4992166757583618, + "logits/rejected": -1.4459744691848755, + "logps/chosen": -33.046180725097656, + "logps/rejected": -52.499366760253906, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5871363878250122, + "rewards/margins": 5.686320781707764, + "rewards/rejected": -6.2734575271606445, + "step": 304 + }, + { + "epoch": 5.169491525423728, + "grad_norm": 6.387235286093505, + "learning_rate": 1.681799664989033e-07, + "logits/chosen": -1.4004420042037964, + "logits/rejected": -1.3709527254104614, + "logps/chosen": -26.409391403198242, + "logps/rejected": -37.188838958740234, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3655552864074707, + "rewards/margins": 4.699047565460205, + "rewards/rejected": -4.333492279052734, + "step": 305 + }, + { + "epoch": 5.186440677966102, + "grad_norm": 5.71477855668128, + "learning_rate": 1.6643188831820374e-07, + "logits/chosen": -1.4717719554901123, + "logits/rejected": -1.5952577590942383, + "logps/chosen": -27.931270599365234, + "logps/rejected": -50.50065231323242, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6083439588546753, + "rewards/margins": 5.1790900230407715, + "rewards/rejected": -5.787433624267578, + "step": 306 + }, + { + "epoch": 5.203389830508475, + "grad_norm": 5.369095807459635, + "learning_rate": 1.6468839796029198e-07, + "logits/chosen": -1.5481698513031006, + "logits/rejected": -1.4016451835632324, + "logps/chosen": -33.04383850097656, + "logps/rejected": -58.85445022583008, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07381519675254822, + "rewards/margins": 5.094147205352783, + "rewards/rejected": -5.167962074279785, + "step": 307 + }, + { + "epoch": 5.220338983050848, + "grad_norm": 6.478677176582888, + "learning_rate": 1.6294959114140033e-07, + "logits/chosen": -1.6205213069915771, + "logits/rejected": -1.5254000425338745, + "logps/chosen": -29.354225158691406, + "logps/rejected": -43.66874694824219, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08936242759227753, + "rewards/margins": 4.020651817321777, + "rewards/rejected": -4.110013961791992, + "step": 308 + }, + { + "epoch": 5.237288135593221, + "grad_norm": 6.290350105724975, + "learning_rate": 1.6121556332063861e-07, + "logits/chosen": -1.3170948028564453, + "logits/rejected": -1.281385898590088, + "logps/chosen": -37.28167724609375, + "logps/rejected": -45.006961822509766, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06686612963676453, + "rewards/margins": 4.521815776824951, + "rewards/rejected": -4.588682174682617, + "step": 309 + }, + { + "epoch": 5.254237288135593, + "grad_norm": 7.295722008430091, + "learning_rate": 1.5948640969475345e-07, + "logits/chosen": -1.5421946048736572, + "logits/rejected": -1.406693935394287, + "logps/chosen": -26.546001434326172, + "logps/rejected": -40.070228576660156, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11074566841125488, + "rewards/margins": 4.666855812072754, + "rewards/rejected": -4.77760124206543, + "step": 310 + }, + { + "epoch": 5.271186440677966, + "grad_norm": 6.299381642433967, + "learning_rate": 1.5776222519290204e-07, + "logits/chosen": -1.5537474155426025, + "logits/rejected": -1.501166820526123, + "logps/chosen": -26.153173446655273, + "logps/rejected": -50.52534103393555, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.541397213935852, + "rewards/margins": 5.92680025100708, + "rewards/rejected": -6.468197822570801, + "step": 311 + }, + { + "epoch": 5.288135593220339, + "grad_norm": 6.901736790648853, + "learning_rate": 1.560431044714405e-07, + "logits/chosen": -1.547554850578308, + "logits/rejected": -1.4092559814453125, + "logps/chosen": -32.46575927734375, + "logps/rejected": -52.17520523071289, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18227432668209076, + "rewards/margins": 5.195803642272949, + "rewards/rejected": -5.378078460693359, + "step": 312 + }, + { + "epoch": 5.305084745762712, + "grad_norm": 6.537170506793814, + "learning_rate": 1.5432914190872756e-07, + "logits/chosen": -1.394778847694397, + "logits/rejected": -1.3562414646148682, + "logps/chosen": -26.12644386291504, + "logps/rejected": -38.51447677612305, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43324902653694153, + "rewards/margins": 4.649719715118408, + "rewards/rejected": -4.216470718383789, + "step": 313 + }, + { + "epoch": 5.322033898305085, + "grad_norm": 6.822269638404017, + "learning_rate": 1.5262043159994314e-07, + "logits/chosen": -1.603257179260254, + "logits/rejected": -1.41001558303833, + "logps/chosen": -27.069252014160156, + "logps/rejected": -54.39444351196289, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2375459223985672, + "rewards/margins": 5.968645095825195, + "rewards/rejected": -5.731099605560303, + "step": 314 + }, + { + "epoch": 5.338983050847458, + "grad_norm": 6.852429555090263, + "learning_rate": 1.5091706735192266e-07, + "logits/chosen": -1.5043295621871948, + "logits/rejected": -1.4766517877578735, + "logps/chosen": -21.581401824951172, + "logps/rejected": -50.377071380615234, + "loss": 0.0566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24750420451164246, + "rewards/margins": 5.493175506591797, + "rewards/rejected": -5.24567174911499, + "step": 315 + }, + { + "epoch": 5.3559322033898304, + "grad_norm": 6.421398270201425, + "learning_rate": 1.4921914267800699e-07, + "logits/chosen": -1.4200242757797241, + "logits/rejected": -1.3987623453140259, + "logps/chosen": -19.360551834106445, + "logps/rejected": -35.37394714355469, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14433270692825317, + "rewards/margins": 3.7558863162994385, + "rewards/rejected": -3.611553430557251, + "step": 316 + }, + { + "epoch": 5.372881355932203, + "grad_norm": 6.030116263122066, + "learning_rate": 1.4752675079290848e-07, + "logits/chosen": -1.3269891738891602, + "logits/rejected": -1.2499594688415527, + "logps/chosen": -29.73949432373047, + "logps/rejected": -37.97323989868164, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44939878582954407, + "rewards/margins": 3.9090001583099365, + "rewards/rejected": -4.358399391174316, + "step": 317 + }, + { + "epoch": 5.389830508474576, + "grad_norm": 6.16190997278292, + "learning_rate": 1.458399846075942e-07, + "logits/chosen": -1.5378894805908203, + "logits/rejected": -1.5542147159576416, + "logps/chosen": -33.65290832519531, + "logps/rejected": -57.606727600097656, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47632092237472534, + "rewards/margins": 5.512063980102539, + "rewards/rejected": -5.98838472366333, + "step": 318 + }, + { + "epoch": 5.406779661016949, + "grad_norm": 6.084242732250159, + "learning_rate": 1.441589367241846e-07, + "logits/chosen": -1.3787221908569336, + "logits/rejected": -1.2671631574630737, + "logps/chosen": -25.762453079223633, + "logps/rejected": -41.90483093261719, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07579733431339264, + "rewards/margins": 4.506519317626953, + "rewards/rejected": -4.430721759796143, + "step": 319 + }, + { + "epoch": 5.423728813559322, + "grad_norm": 6.400287756731778, + "learning_rate": 1.4248369943086995e-07, + "logits/chosen": -1.609413504600525, + "logits/rejected": -1.440029501914978, + "logps/chosen": -29.613908767700195, + "logps/rejected": -45.63286590576172, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1749686449766159, + "rewards/margins": 4.805515766143799, + "rewards/rejected": -4.980484485626221, + "step": 320 + }, + { + "epoch": 5.440677966101695, + "grad_norm": 5.58673826227153, + "learning_rate": 1.4081436469684337e-07, + "logits/chosen": -1.4598129987716675, + "logits/rejected": -1.3000261783599854, + "logps/chosen": -26.955453872680664, + "logps/rejected": -49.03952407836914, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2353724241256714, + "rewards/margins": 5.155057907104492, + "rewards/rejected": -5.390429973602295, + "step": 321 + }, + { + "epoch": 5.4576271186440675, + "grad_norm": 5.7806024491841885, + "learning_rate": 1.3915102416725286e-07, + "logits/chosen": -1.5266464948654175, + "logits/rejected": -1.406123161315918, + "logps/chosen": -21.365182876586914, + "logps/rejected": -44.53782653808594, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19776439666748047, + "rewards/margins": 4.6383209228515625, + "rewards/rejected": -4.440556526184082, + "step": 322 + }, + { + "epoch": 5.47457627118644, + "grad_norm": 6.663910066076515, + "learning_rate": 1.3749376915816885e-07, + "logits/chosen": -1.5701993703842163, + "logits/rejected": -1.552007794380188, + "logps/chosen": -36.67780685424805, + "logps/rejected": -47.98915481567383, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8618345856666565, + "rewards/margins": 4.562122821807861, + "rewards/rejected": -5.423957347869873, + "step": 323 + }, + { + "epoch": 5.491525423728813, + "grad_norm": 6.220022143578053, + "learning_rate": 1.3584269065157172e-07, + "logits/chosen": -1.3734331130981445, + "logits/rejected": -1.3631477355957031, + "logps/chosen": -36.97685623168945, + "logps/rejected": -53.4107780456543, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10500967502593994, + "rewards/margins": 4.521078109741211, + "rewards/rejected": -4.6260881423950195, + "step": 324 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 6.094670518205716, + "learning_rate": 1.341978792903568e-07, + "logits/chosen": -1.4123265743255615, + "logits/rejected": -1.3930362462997437, + "logps/chosen": -24.757308959960938, + "logps/rejected": -50.27605438232422, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1657731980085373, + "rewards/margins": 6.393916606903076, + "rewards/rejected": -6.22814416885376, + "step": 325 + }, + { + "epoch": 5.52542372881356, + "grad_norm": 7.344149893164419, + "learning_rate": 1.3255942537335804e-07, + "logits/chosen": -1.4898028373718262, + "logits/rejected": -1.529905915260315, + "logps/chosen": -30.63724136352539, + "logps/rejected": -47.23683547973633, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2261982262134552, + "rewards/margins": 4.666712760925293, + "rewards/rejected": -4.892910957336426, + "step": 326 + }, + { + "epoch": 5.5423728813559325, + "grad_norm": 5.4257600584522185, + "learning_rate": 1.3092741885039085e-07, + "logits/chosen": -1.430129051208496, + "logits/rejected": -1.2972102165222168, + "logps/chosen": -29.192171096801758, + "logps/rejected": -61.473453521728516, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42023560404777527, + "rewards/margins": 5.649378299713135, + "rewards/rejected": -6.069613933563232, + "step": 327 + }, + { + "epoch": 5.559322033898305, + "grad_norm": 6.750041211234325, + "learning_rate": 1.2930194931731382e-07, + "logits/chosen": -1.5282580852508545, + "logits/rejected": -1.464250087738037, + "logps/chosen": -21.08121109008789, + "logps/rejected": -36.48762512207031, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21070019900798798, + "rewards/margins": 4.739938735961914, + "rewards/rejected": -4.950639247894287, + "step": 328 + }, + { + "epoch": 5.576271186440678, + "grad_norm": 6.346382606047745, + "learning_rate": 1.2768310601110993e-07, + "logits/chosen": -1.544804334640503, + "logits/rejected": -1.5036790370941162, + "logps/chosen": -25.425878524780273, + "logps/rejected": -57.694950103759766, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15332584083080292, + "rewards/margins": 6.2687883377075195, + "rewards/rejected": -6.115462303161621, + "step": 329 + }, + { + "epoch": 5.593220338983051, + "grad_norm": 5.891747930904324, + "learning_rate": 1.260709778049877e-07, + "logits/chosen": -1.4886832237243652, + "logits/rejected": -1.5345224142074585, + "logps/chosen": -24.217357635498047, + "logps/rejected": -38.936588287353516, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14570964872837067, + "rewards/margins": 3.9201903343200684, + "rewards/rejected": -3.7744803428649902, + "step": 330 + }, + { + "epoch": 5.610169491525424, + "grad_norm": 6.479009923461722, + "learning_rate": 1.2446565320350182e-07, + "logits/chosen": -1.6119954586029053, + "logits/rejected": -1.5358697175979614, + "logps/chosen": -23.517757415771484, + "logps/rejected": -43.86408233642578, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01591832935810089, + "rewards/margins": 5.240238189697266, + "rewards/rejected": -5.2243194580078125, + "step": 331 + }, + { + "epoch": 5.627118644067797, + "grad_norm": 5.354023576639168, + "learning_rate": 1.2286722033769492e-07, + "logits/chosen": -1.630919337272644, + "logits/rejected": -1.538849949836731, + "logps/chosen": -28.93423080444336, + "logps/rejected": -53.66632080078125, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17018574476242065, + "rewards/margins": 5.786593437194824, + "rewards/rejected": -5.9567790031433105, + "step": 332 + }, + { + "epoch": 5.6440677966101696, + "grad_norm": 5.571846024722403, + "learning_rate": 1.2127576696025826e-07, + "logits/chosen": -1.3449130058288574, + "logits/rejected": -1.3209434747695923, + "logps/chosen": -28.3823299407959, + "logps/rejected": -57.359275817871094, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19171355664730072, + "rewards/margins": 6.2648420333862305, + "rewards/rejected": -6.4565558433532715, + "step": 333 + }, + { + "epoch": 5.661016949152542, + "grad_norm": 6.138956177743218, + "learning_rate": 1.19691380440715e-07, + "logits/chosen": -1.358786702156067, + "logits/rejected": -1.3909467458724976, + "logps/chosen": -30.639657974243164, + "logps/rejected": -48.24367141723633, + "loss": 0.0411, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4908216595649719, + "rewards/margins": 4.40742301940918, + "rewards/rejected": -4.898244857788086, + "step": 334 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 4.507338496018484, + "learning_rate": 1.1811414776062365e-07, + "logits/chosen": -1.2691717147827148, + "logits/rejected": -1.2181487083435059, + "logps/chosen": -32.79166793823242, + "logps/rejected": -39.962955474853516, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39959853887557983, + "rewards/margins": 4.66073751449585, + "rewards/rejected": -4.261138916015625, + "step": 335 + }, + { + "epoch": 5.694915254237288, + "grad_norm": 6.700687388710383, + "learning_rate": 1.1654415550880242e-07, + "logits/chosen": -1.4832508563995361, + "logits/rejected": -1.4977812767028809, + "logps/chosen": -25.554107666015625, + "logps/rejected": -42.260276794433594, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07877922058105469, + "rewards/margins": 4.69774055480957, + "rewards/rejected": -4.776519775390625, + "step": 336 + }, + { + "epoch": 5.711864406779661, + "grad_norm": 6.852779788605717, + "learning_rate": 1.1498148987657549e-07, + "logits/chosen": -1.2651898860931396, + "logits/rejected": -1.2874916791915894, + "logps/chosen": -31.26464080810547, + "logps/rejected": -56.77597427368164, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4548006057739258, + "rewards/margins": 6.219876289367676, + "rewards/rejected": -6.67467737197876, + "step": 337 + }, + { + "epoch": 5.728813559322034, + "grad_norm": 5.857910583133337, + "learning_rate": 1.1342623665304207e-07, + "logits/chosen": -1.5274604558944702, + "logits/rejected": -1.5151126384735107, + "logps/chosen": -26.850894927978516, + "logps/rejected": -47.350067138671875, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5475198030471802, + "rewards/margins": 4.643810272216797, + "rewards/rejected": -5.1913299560546875, + "step": 338 + }, + { + "epoch": 5.745762711864407, + "grad_norm": 7.203472778094057, + "learning_rate": 1.1187848122036562e-07, + "logits/chosen": -1.6401658058166504, + "logits/rejected": -1.6699111461639404, + "logps/chosen": -28.927499771118164, + "logps/rejected": -36.704795837402344, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15964704751968384, + "rewards/margins": 4.4911370277404785, + "rewards/rejected": -4.3314900398254395, + "step": 339 + }, + { + "epoch": 5.762711864406779, + "grad_norm": 7.2779221460952, + "learning_rate": 1.1033830854908691e-07, + "logits/chosen": -1.502543330192566, + "logits/rejected": -1.4199045896530151, + "logps/chosen": -23.966299057006836, + "logps/rejected": -46.33234786987305, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10161617398262024, + "rewards/margins": 5.133113861083984, + "rewards/rejected": -5.2347307205200195, + "step": 340 + }, + { + "epoch": 5.779661016949152, + "grad_norm": 7.302109736865858, + "learning_rate": 1.0880580319345919e-07, + "logits/chosen": -1.4564778804779053, + "logits/rejected": -1.5227388143539429, + "logps/chosen": -33.69976806640625, + "logps/rejected": -42.49582290649414, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1525229811668396, + "rewards/margins": 4.674422264099121, + "rewards/rejected": -4.8269453048706055, + "step": 341 + }, + { + "epoch": 5.796610169491525, + "grad_norm": 7.067583835233995, + "learning_rate": 1.0728104928680623e-07, + "logits/chosen": -1.5719774961471558, + "logits/rejected": -1.502136468887329, + "logps/chosen": -21.33773422241211, + "logps/rejected": -42.372676849365234, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12551027536392212, + "rewards/margins": 5.258235454559326, + "rewards/rejected": -5.3837456703186035, + "step": 342 + }, + { + "epoch": 5.813559322033898, + "grad_norm": 5.957807006157361, + "learning_rate": 1.0576413053690326e-07, + "logits/chosen": -1.4468128681182861, + "logits/rejected": -1.3548585176467896, + "logps/chosen": -24.791799545288086, + "logps/rejected": -44.49803161621094, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11925530433654785, + "rewards/margins": 5.184385299682617, + "rewards/rejected": -5.06512975692749, + "step": 343 + }, + { + "epoch": 5.830508474576272, + "grad_norm": 6.401362630388005, + "learning_rate": 1.0425513022138202e-07, + "logits/chosen": -1.491408348083496, + "logits/rejected": -1.3371365070343018, + "logps/chosen": -29.460378646850586, + "logps/rejected": -53.63321304321289, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3325144648551941, + "rewards/margins": 5.398627758026123, + "rewards/rejected": -5.731142520904541, + "step": 344 + }, + { + "epoch": 5.847457627118644, + "grad_norm": 5.5542840449584405, + "learning_rate": 1.0275413118315798e-07, + "logits/chosen": -1.3796604871749878, + "logits/rejected": -1.377408504486084, + "logps/chosen": -26.488426208496094, + "logps/rejected": -46.708282470703125, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1414252519607544, + "rewards/margins": 5.361431121826172, + "rewards/rejected": -5.220005989074707, + "step": 345 + }, + { + "epoch": 5.864406779661017, + "grad_norm": 6.360073336005713, + "learning_rate": 1.0126121582588315e-07, + "logits/chosen": -1.5218093395233154, + "logits/rejected": -1.3046934604644775, + "logps/chosen": -41.216285705566406, + "logps/rejected": -43.98678970336914, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5423004627227783, + "rewards/margins": 4.604074001312256, + "rewards/rejected": -5.146374702453613, + "step": 346 + }, + { + "epoch": 5.88135593220339, + "grad_norm": 5.539790259394075, + "learning_rate": 9.977646610942201e-08, + "logits/chosen": -1.4420253038406372, + "logits/rejected": -1.4948465824127197, + "logps/chosen": -37.67629623413086, + "logps/rejected": -53.65730667114258, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8092373609542847, + "rewards/margins": 5.268472671508789, + "rewards/rejected": -6.077709674835205, + "step": 347 + }, + { + "epoch": 5.898305084745763, + "grad_norm": 6.554303392876224, + "learning_rate": 9.829996354535172e-08, + "logits/chosen": -1.5454871654510498, + "logits/rejected": -1.5333365201950073, + "logps/chosen": -21.350933074951172, + "logps/rejected": -49.00358200073242, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24133503437042236, + "rewards/margins": 4.911167144775391, + "rewards/rejected": -5.152502059936523, + "step": 348 + }, + { + "epoch": 5.915254237288136, + "grad_norm": 6.546704494284942, + "learning_rate": 9.68317891924871e-08, + "logits/chosen": -1.5749708414077759, + "logits/rejected": -1.433280110359192, + "logps/chosen": -34.201988220214844, + "logps/rejected": -51.28511428833008, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3138273358345032, + "rewards/margins": 4.917507648468018, + "rewards/rejected": -5.231335163116455, + "step": 349 + }, + { + "epoch": 5.932203389830509, + "grad_norm": 6.294270520592479, + "learning_rate": 9.53720236524313e-08, + "logits/chosen": -1.5306050777435303, + "logits/rejected": -1.442819356918335, + "logps/chosen": -39.66340255737305, + "logps/rejected": -47.36227798461914, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3249303102493286, + "rewards/margins": 4.536349296569824, + "rewards/rejected": -4.861279487609863, + "step": 350 + }, + { + "epoch": 5.9491525423728815, + "grad_norm": 6.24196436709209, + "learning_rate": 9.392074706515002e-08, + "logits/chosen": -1.3876663446426392, + "logits/rejected": -1.3757574558258057, + "logps/chosen": -29.42030143737793, + "logps/rejected": -51.045021057128906, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15947160124778748, + "rewards/margins": 4.901638031005859, + "rewards/rejected": -5.06110954284668, + "step": 351 + }, + { + "epoch": 5.966101694915254, + "grad_norm": 7.215999575793012, + "learning_rate": 9.247803910457225e-08, + "logits/chosen": -1.5945814847946167, + "logits/rejected": -1.5649042129516602, + "logps/chosen": -23.595874786376953, + "logps/rejected": -45.547088623046875, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08822253346443176, + "rewards/margins": 4.996213912963867, + "rewards/rejected": -5.084437370300293, + "step": 352 + }, + { + "epoch": 5.983050847457627, + "grad_norm": 6.19260019151604, + "learning_rate": 9.104397897421623e-08, + "logits/chosen": -1.4307911396026611, + "logits/rejected": -1.3201934099197388, + "logps/chosen": -26.220760345458984, + "logps/rejected": -51.94208526611328, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05383840203285217, + "rewards/margins": 5.268675804138184, + "rewards/rejected": -5.322514057159424, + "step": 353 + }, + { + "epoch": 6.0, + "grad_norm": 5.881521162946629, + "learning_rate": 8.961864540284119e-08, + "logits/chosen": -1.46354079246521, + "logits/rejected": -1.369706392288208, + "logps/chosen": -21.57988739013672, + "logps/rejected": -39.70952606201172, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02779284119606018, + "rewards/margins": 4.516615867614746, + "rewards/rejected": -4.488822937011719, + "step": 354 + }, + { + "epoch": 6.016949152542373, + "grad_norm": 6.0145355879659625, + "learning_rate": 8.82021166401253e-08, + "logits/chosen": -1.124272108078003, + "logits/rejected": -1.1505348682403564, + "logps/chosen": -46.4174919128418, + "logps/rejected": -55.31433868408203, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7474560737609863, + "rewards/margins": 5.104282379150391, + "rewards/rejected": -5.851738452911377, + "step": 355 + }, + { + "epoch": 6.033898305084746, + "grad_norm": 5.799187572293291, + "learning_rate": 8.679447045236962e-08, + "logits/chosen": -1.4048717021942139, + "logits/rejected": -1.479861855506897, + "logps/chosen": -21.200166702270508, + "logps/rejected": -38.02268600463867, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14473703503608704, + "rewards/margins": 5.024458885192871, + "rewards/rejected": -5.169196128845215, + "step": 356 + }, + { + "epoch": 6.0508474576271185, + "grad_norm": 7.041713270895366, + "learning_rate": 8.539578411822901e-08, + "logits/chosen": -1.3607594966888428, + "logits/rejected": -1.3202852010726929, + "logps/chosen": -27.951833724975586, + "logps/rejected": -46.871917724609375, + "loss": 0.0439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13125565648078918, + "rewards/margins": 4.497035026550293, + "rewards/rejected": -4.365779399871826, + "step": 357 + }, + { + "epoch": 6.067796610169491, + "grad_norm": 4.947363488557164, + "learning_rate": 8.400613442446947e-08, + "logits/chosen": -1.571787714958191, + "logits/rejected": -1.4330024719238281, + "logps/chosen": -28.946853637695312, + "logps/rejected": -45.973785400390625, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.589256763458252, + "rewards/margins": 5.187601566314697, + "rewards/rejected": -5.776858329772949, + "step": 358 + }, + { + "epoch": 6.084745762711864, + "grad_norm": 5.199584429039793, + "learning_rate": 8.262559766175253e-08, + "logits/chosen": -1.4506189823150635, + "logits/rejected": -1.3975802659988403, + "logps/chosen": -24.713668823242188, + "logps/rejected": -51.15769577026367, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11659543216228485, + "rewards/margins": 6.322598457336426, + "rewards/rejected": -6.439194202423096, + "step": 359 + }, + { + "epoch": 6.101694915254237, + "grad_norm": 5.228661804484342, + "learning_rate": 8.125424962044741e-08, + "logits/chosen": -1.679395079612732, + "logits/rejected": -1.5920395851135254, + "logps/chosen": -32.77759552001953, + "logps/rejected": -51.247283935546875, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6732353568077087, + "rewards/margins": 5.932892322540283, + "rewards/rejected": -6.6061272621154785, + "step": 360 + }, + { + "epoch": 6.11864406779661, + "grad_norm": 4.99263433146697, + "learning_rate": 7.989216558646941e-08, + "logits/chosen": -1.5995450019836426, + "logits/rejected": -1.5377026796340942, + "logps/chosen": -30.13187026977539, + "logps/rejected": -47.109947204589844, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2468140721321106, + "rewards/margins": 5.169532299041748, + "rewards/rejected": -5.416346073150635, + "step": 361 + }, + { + "epoch": 6.135593220338983, + "grad_norm": 5.703896702738386, + "learning_rate": 7.853942033714736e-08, + "logits/chosen": -1.5143556594848633, + "logits/rejected": -1.3968244791030884, + "logps/chosen": -37.41975402832031, + "logps/rejected": -56.349769592285156, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31496256589889526, + "rewards/margins": 5.113517761230469, + "rewards/rejected": -5.428481101989746, + "step": 362 + }, + { + "epoch": 6.1525423728813555, + "grad_norm": 5.916237621370921, + "learning_rate": 7.719608813711847e-08, + "logits/chosen": -1.5250320434570312, + "logits/rejected": -1.475509762763977, + "logps/chosen": -26.13006591796875, + "logps/rejected": -35.852359771728516, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13443481922149658, + "rewards/margins": 4.1385908126831055, + "rewards/rejected": -4.00415563583374, + "step": 363 + }, + { + "epoch": 6.169491525423728, + "grad_norm": 6.510656924645236, + "learning_rate": 7.586224273425081e-08, + "logits/chosen": -1.3499259948730469, + "logits/rejected": -1.244497299194336, + "logps/chosen": -34.813541412353516, + "logps/rejected": -47.603271484375, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11887124180793762, + "rewards/margins": 4.864650726318359, + "rewards/rejected": -4.745779514312744, + "step": 364 + }, + { + "epoch": 6.186440677966102, + "grad_norm": 5.630103461727817, + "learning_rate": 7.45379573555947e-08, + "logits/chosen": -1.4300577640533447, + "logits/rejected": -1.4099435806274414, + "logps/chosen": -34.18559646606445, + "logps/rejected": -45.739933013916016, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07826349139213562, + "rewards/margins": 4.771849155426025, + "rewards/rejected": -4.8501129150390625, + "step": 365 + }, + { + "epoch": 6.203389830508475, + "grad_norm": 5.238611793677947, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.5490187406539917, + "logits/rejected": -1.6131647825241089, + "logps/chosen": -29.01973533630371, + "logps/rejected": -48.46769332885742, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043659090995788574, + "rewards/margins": 4.948011875152588, + "rewards/rejected": -4.991670608520508, + "step": 366 + }, + { + "epoch": 6.220338983050848, + "grad_norm": 7.178172383273588, + "learning_rate": 7.19183569509398e-08, + "logits/chosen": -1.4348180294036865, + "logits/rejected": -1.4342725276947021, + "logps/chosen": -26.032867431640625, + "logps/rejected": -35.465248107910156, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11982511729001999, + "rewards/margins": 4.798130035400391, + "rewards/rejected": -4.678304672241211, + "step": 367 + }, + { + "epoch": 6.237288135593221, + "grad_norm": 5.80114325638975, + "learning_rate": 7.062318573891715e-08, + "logits/chosen": -1.3932957649230957, + "logits/rejected": -1.3466596603393555, + "logps/chosen": -26.193689346313477, + "logps/rejected": -43.341983795166016, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26808837056159973, + "rewards/margins": 5.052790641784668, + "rewards/rejected": -4.784702301025391, + "step": 368 + }, + { + "epoch": 6.254237288135593, + "grad_norm": 5.8991450258769875, + "learning_rate": 6.933786217116364e-08, + "logits/chosen": -1.5599933862686157, + "logits/rejected": -1.4719116687774658, + "logps/chosen": -26.387096405029297, + "logps/rejected": -41.33942794799805, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40677589178085327, + "rewards/margins": 4.713696479797363, + "rewards/rejected": -4.306921005249023, + "step": 369 + }, + { + "epoch": 6.271186440677966, + "grad_norm": 5.827867650343381, + "learning_rate": 6.806245681091944e-08, + "logits/chosen": -1.4101459980010986, + "logits/rejected": -1.3021355867385864, + "logps/chosen": -27.591569900512695, + "logps/rejected": -47.71153259277344, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1557801067829132, + "rewards/margins": 5.925251483917236, + "rewards/rejected": -5.769471168518066, + "step": 370 + }, + { + "epoch": 6.288135593220339, + "grad_norm": 5.390338841256569, + "learning_rate": 6.679703967692321e-08, + "logits/chosen": -1.584707498550415, + "logits/rejected": -1.4573631286621094, + "logps/chosen": -21.927671432495117, + "logps/rejected": -49.91468048095703, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.393250048160553, + "rewards/margins": 5.488755226135254, + "rewards/rejected": -5.095505714416504, + "step": 371 + }, + { + "epoch": 6.305084745762712, + "grad_norm": 4.571609396535961, + "learning_rate": 6.554168023956816e-08, + "logits/chosen": -1.4562060832977295, + "logits/rejected": -1.3005712032318115, + "logps/chosen": -25.927963256835938, + "logps/rejected": -44.458621978759766, + "loss": 0.0284, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2479822039604187, + "rewards/margins": 4.508713722229004, + "rewards/rejected": -4.756695747375488, + "step": 372 + }, + { + "epoch": 6.322033898305085, + "grad_norm": 6.286732270647367, + "learning_rate": 6.429644741708779e-08, + "logits/chosen": -1.4326915740966797, + "logits/rejected": -1.3545727729797363, + "logps/chosen": -25.131105422973633, + "logps/rejected": -40.29548263549805, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04497765004634857, + "rewards/margins": 4.998070240020752, + "rewards/rejected": -4.953092575073242, + "step": 373 + }, + { + "epoch": 6.338983050847458, + "grad_norm": 6.303722387593113, + "learning_rate": 6.306140957177225e-08, + "logits/chosen": -1.452634334564209, + "logits/rejected": -1.427751898765564, + "logps/chosen": -26.491840362548828, + "logps/rejected": -47.47591781616211, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015405148267745972, + "rewards/margins": 5.423872947692871, + "rewards/rejected": -5.439278602600098, + "step": 374 + }, + { + "epoch": 6.3559322033898304, + "grad_norm": 5.766682311294022, + "learning_rate": 6.183663450621607e-08, + "logits/chosen": -1.506758213043213, + "logits/rejected": -1.4310308694839478, + "logps/chosen": -38.03307342529297, + "logps/rejected": -46.39859390258789, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16017396748065948, + "rewards/margins": 4.915417194366455, + "rewards/rejected": -5.075592041015625, + "step": 375 + }, + { + "epoch": 6.372881355932203, + "grad_norm": 4.912000082420141, + "learning_rate": 6.062218945959496e-08, + "logits/chosen": -1.4634639024734497, + "logits/rejected": -1.5328123569488525, + "logps/chosen": -34.974857330322266, + "logps/rejected": -43.447689056396484, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06525677442550659, + "rewards/margins": 5.0226850509643555, + "rewards/rejected": -5.087942123413086, + "step": 376 + }, + { + "epoch": 6.389830508474576, + "grad_norm": 4.923323155963838, + "learning_rate": 5.9418141103975026e-08, + "logits/chosen": -1.4239155054092407, + "logits/rejected": -1.4835269451141357, + "logps/chosen": -30.768068313598633, + "logps/rejected": -60.89370346069336, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3536137342453003, + "rewards/margins": 6.844842910766602, + "rewards/rejected": -7.198456764221191, + "step": 377 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 6.31358841065967, + "learning_rate": 5.822455554065217e-08, + "logits/chosen": -1.3272855281829834, + "logits/rejected": -1.2906978130340576, + "logps/chosen": -25.13174819946289, + "logps/rejected": -41.51945877075195, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2586425840854645, + "rewards/margins": 5.224736213684082, + "rewards/rejected": -4.966093063354492, + "step": 378 + }, + { + "epoch": 6.423728813559322, + "grad_norm": 5.9217345486120365, + "learning_rate": 5.704149829652341e-08, + "logits/chosen": -1.5300796031951904, + "logits/rejected": -1.4160737991333008, + "logps/chosen": -29.946929931640625, + "logps/rejected": -49.227298736572266, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1305888444185257, + "rewards/margins": 5.067564487457275, + "rewards/rejected": -5.198153495788574, + "step": 379 + }, + { + "epoch": 6.440677966101695, + "grad_norm": 5.394022162647855, + "learning_rate": 5.586903432048942e-08, + "logits/chosen": -1.5190213918685913, + "logits/rejected": -1.4037725925445557, + "logps/chosen": -31.318918228149414, + "logps/rejected": -49.82917404174805, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9247037768363953, + "rewards/margins": 5.2036919593811035, + "rewards/rejected": -6.128396034240723, + "step": 380 + }, + { + "epoch": 6.4576271186440675, + "grad_norm": 5.307172264281174, + "learning_rate": 5.470722797988883e-08, + "logits/chosen": -1.3760260343551636, + "logits/rejected": -1.3698216676712036, + "logps/chosen": -24.693878173828125, + "logps/rejected": -37.211387634277344, + "loss": 0.0353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05154569447040558, + "rewards/margins": 4.664517879486084, + "rewards/rejected": -4.612972736358643, + "step": 381 + }, + { + "epoch": 6.47457627118644, + "grad_norm": 5.039925613354933, + "learning_rate": 5.355614305696468e-08, + "logits/chosen": -1.56493079662323, + "logits/rejected": -1.5143284797668457, + "logps/chosen": -27.657060623168945, + "logps/rejected": -43.6398811340332, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09710407257080078, + "rewards/margins": 4.8609700202941895, + "rewards/rejected": -4.958074569702148, + "step": 382 + }, + { + "epoch": 6.491525423728813, + "grad_norm": 6.304781751843407, + "learning_rate": 5.241584274536259e-08, + "logits/chosen": -1.3160762786865234, + "logits/rejected": -1.426667332649231, + "logps/chosen": -31.549245834350586, + "logps/rejected": -49.771148681640625, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3802323043346405, + "rewards/margins": 5.496129035949707, + "rewards/rejected": -5.876360893249512, + "step": 383 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 5.063738993506685, + "learning_rate": 5.1286389646661654e-08, + "logits/chosen": -1.396496057510376, + "logits/rejected": -1.3829492330551147, + "logps/chosen": -28.830703735351562, + "logps/rejected": -47.824317932128906, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2612743079662323, + "rewards/margins": 4.95106315612793, + "rewards/rejected": -5.212337017059326, + "step": 384 + }, + { + "epoch": 6.52542372881356, + "grad_norm": 5.637690264352778, + "learning_rate": 5.0167845766937806e-08, + "logits/chosen": -1.4855574369430542, + "logits/rejected": -1.3984534740447998, + "logps/chosen": -27.114734649658203, + "logps/rejected": -43.69512939453125, + "loss": 0.0414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21004855632781982, + "rewards/margins": 4.845144748687744, + "rewards/rejected": -5.055192947387695, + "step": 385 + }, + { + "epoch": 6.5423728813559325, + "grad_norm": 6.025392312759281, + "learning_rate": 4.906027251335917e-08, + "logits/chosen": -1.4550882577896118, + "logits/rejected": -1.4542597532272339, + "logps/chosen": -23.857240676879883, + "logps/rejected": -47.817237854003906, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31768307089805603, + "rewards/margins": 4.948706150054932, + "rewards/rejected": -5.266389846801758, + "step": 386 + }, + { + "epoch": 6.559322033898305, + "grad_norm": 5.587911152077424, + "learning_rate": 4.7963730690815467e-08, + "logits/chosen": -1.4407036304473877, + "logits/rejected": -1.3747568130493164, + "logps/chosen": -19.481971740722656, + "logps/rejected": -42.736568450927734, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19361047446727753, + "rewards/margins": 5.198715686798096, + "rewards/rejected": -5.392325401306152, + "step": 387 + }, + { + "epoch": 6.576271186440678, + "grad_norm": 6.586187305125031, + "learning_rate": 4.687828049857967e-08, + "logits/chosen": -1.3945553302764893, + "logits/rejected": -1.322791337966919, + "logps/chosen": -29.22484016418457, + "logps/rejected": -40.85685348510742, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17647796869277954, + "rewards/margins": 4.56741189956665, + "rewards/rejected": -4.390933990478516, + "step": 388 + }, + { + "epoch": 6.593220338983051, + "grad_norm": 6.218480481228633, + "learning_rate": 4.580398152700304e-08, + "logits/chosen": -1.6018644571304321, + "logits/rejected": -1.4887254238128662, + "logps/chosen": -25.33257484436035, + "logps/rejected": -45.91154479980469, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20426103472709656, + "rewards/margins": 5.0216498374938965, + "rewards/rejected": -5.225910186767578, + "step": 389 + }, + { + "epoch": 6.610169491525424, + "grad_norm": 6.569813291973876, + "learning_rate": 4.47408927542435e-08, + "logits/chosen": -1.3770880699157715, + "logits/rejected": -1.286376714706421, + "logps/chosen": -23.488061904907227, + "logps/rejected": -38.757720947265625, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2643284499645233, + "rewards/margins": 3.8822245597839355, + "rewards/rejected": -4.146553039550781, + "step": 390 + }, + { + "epoch": 6.627118644067797, + "grad_norm": 5.39352903238084, + "learning_rate": 4.368907254302837e-08, + "logits/chosen": -1.5808112621307373, + "logits/rejected": -1.5275750160217285, + "logps/chosen": -20.95781707763672, + "logps/rejected": -38.6187744140625, + "loss": 0.0366, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05846305191516876, + "rewards/margins": 4.244011878967285, + "rewards/rejected": -4.302474498748779, + "step": 391 + }, + { + "epoch": 6.6440677966101696, + "grad_norm": 5.323944978561868, + "learning_rate": 4.264857863744956e-08, + "logits/chosen": -1.5523847341537476, + "logits/rejected": -1.39801824092865, + "logps/chosen": -22.804271697998047, + "logps/rejected": -38.31208419799805, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4400111436843872, + "rewards/margins": 5.238194465637207, + "rewards/rejected": -4.798183917999268, + "step": 392 + }, + { + "epoch": 6.661016949152542, + "grad_norm": 7.086981471555236, + "learning_rate": 4.161946815979403e-08, + "logits/chosen": -1.4906023740768433, + "logits/rejected": -1.4377923011779785, + "logps/chosen": -37.56535720825195, + "logps/rejected": -52.383487701416016, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3086543679237366, + "rewards/margins": 5.0754618644714355, + "rewards/rejected": -5.384116172790527, + "step": 393 + }, + { + "epoch": 6.677966101694915, + "grad_norm": 5.2671704490811075, + "learning_rate": 4.0601797607407505e-08, + "logits/chosen": -1.4550049304962158, + "logits/rejected": -1.3966783285140991, + "logps/chosen": -22.132400512695312, + "logps/rejected": -41.57554626464844, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2699892520904541, + "rewards/margins": 4.588739395141602, + "rewards/rejected": -4.858728408813477, + "step": 394 + }, + { + "epoch": 6.694915254237288, + "grad_norm": 5.32453491166638, + "learning_rate": 3.9595622849593e-08, + "logits/chosen": -1.512937307357788, + "logits/rejected": -1.3621115684509277, + "logps/chosen": -28.885116577148438, + "logps/rejected": -50.2530517578125, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5175858736038208, + "rewards/margins": 5.390725135803223, + "rewards/rejected": -5.908310890197754, + "step": 395 + }, + { + "epoch": 6.711864406779661, + "grad_norm": 5.792264114569628, + "learning_rate": 3.8600999124543455e-08, + "logits/chosen": -1.5662391185760498, + "logits/rejected": -1.42247474193573, + "logps/chosen": -25.263622283935547, + "logps/rejected": -45.11002731323242, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27481862902641296, + "rewards/margins": 5.28562593460083, + "rewards/rejected": -5.010807037353516, + "step": 396 + }, + { + "epoch": 6.728813559322034, + "grad_norm": 5.997208149184034, + "learning_rate": 3.7617981036309533e-08, + "logits/chosen": -1.5715502500534058, + "logits/rejected": -1.6154096126556396, + "logps/chosen": -23.581790924072266, + "logps/rejected": -45.5838623046875, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18038499355316162, + "rewards/margins": 4.884888172149658, + "rewards/rejected": -5.065273284912109, + "step": 397 + }, + { + "epoch": 6.745762711864407, + "grad_norm": 5.245999983483119, + "learning_rate": 3.664662255180134e-08, + "logits/chosen": -1.4324719905853271, + "logits/rejected": -1.329777717590332, + "logps/chosen": -26.615516662597656, + "logps/rejected": -41.86264419555664, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05792872607707977, + "rewards/margins": 4.163052082061768, + "rewards/rejected": -4.105123519897461, + "step": 398 + }, + { + "epoch": 6.762711864406779, + "grad_norm": 4.518676670270046, + "learning_rate": 3.5686976997826245e-08, + "logits/chosen": -1.6349799633026123, + "logits/rejected": -1.6590909957885742, + "logps/chosen": -41.426727294921875, + "logps/rejected": -52.470558166503906, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4126446843147278, + "rewards/margins": 5.763060569763184, + "rewards/rejected": -6.175705432891846, + "step": 399 + }, + { + "epoch": 6.779661016949152, + "grad_norm": 5.721240252343364, + "learning_rate": 3.473909705816111e-08, + "logits/chosen": -1.4431383609771729, + "logits/rejected": -1.3899328708648682, + "logps/chosen": -36.3721923828125, + "logps/rejected": -52.30086898803711, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.106335163116455, + "rewards/margins": 5.6671295166015625, + "rewards/rejected": -6.773464679718018, + "step": 400 + }, + { + "epoch": 6.796610169491525, + "grad_norm": 5.164348258065598, + "learning_rate": 3.3803034770659824e-08, + "logits/chosen": -1.6824212074279785, + "logits/rejected": -1.5971221923828125, + "logps/chosen": -37.007022857666016, + "logps/rejected": -74.48016357421875, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6765580177307129, + "rewards/margins": 7.421613693237305, + "rewards/rejected": -8.09817123413086, + "step": 401 + }, + { + "epoch": 6.813559322033898, + "grad_norm": 4.808650804090459, + "learning_rate": 3.287884152439646e-08, + "logits/chosen": -1.3810476064682007, + "logits/rejected": -1.4028950929641724, + "logps/chosen": -29.925891876220703, + "logps/rejected": -49.08871841430664, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16381314396858215, + "rewards/margins": 6.113283157348633, + "rewards/rejected": -5.949470520019531, + "step": 402 + }, + { + "epoch": 6.830508474576272, + "grad_norm": 6.236701718893695, + "learning_rate": 3.19665680568445e-08, + "logits/chosen": -1.5909409523010254, + "logits/rejected": -1.5455948114395142, + "logps/chosen": -33.62629699707031, + "logps/rejected": -39.20241165161133, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20260879397392273, + "rewards/margins": 4.192948341369629, + "rewards/rejected": -4.395556926727295, + "step": 403 + }, + { + "epoch": 6.847457627118644, + "grad_norm": 4.563195691909389, + "learning_rate": 3.106626445109081e-08, + "logits/chosen": -1.4258350133895874, + "logits/rejected": -1.471103549003601, + "logps/chosen": -32.27821731567383, + "logps/rejected": -55.883445739746094, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37013059854507446, + "rewards/margins": 5.775850296020508, + "rewards/rejected": -6.1459808349609375, + "step": 404 + }, + { + "epoch": 6.864406779661017, + "grad_norm": 6.319828827539619, + "learning_rate": 3.017798013308645e-08, + "logits/chosen": -1.6110063791275024, + "logits/rejected": -1.5970886945724487, + "logps/chosen": -33.942317962646484, + "logps/rejected": -42.78126525878906, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08474293351173401, + "rewards/margins": 4.63005256652832, + "rewards/rejected": -4.545310020446777, + "step": 405 + }, + { + "epoch": 6.88135593220339, + "grad_norm": 6.956985946783924, + "learning_rate": 2.9301763868933153e-08, + "logits/chosen": -1.4388988018035889, + "logits/rejected": -1.4342849254608154, + "logps/chosen": -23.345102310180664, + "logps/rejected": -40.68294906616211, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11982996761798859, + "rewards/margins": 4.858503818511963, + "rewards/rejected": -4.978332996368408, + "step": 406 + }, + { + "epoch": 6.898305084745763, + "grad_norm": 5.4291102026527716, + "learning_rate": 2.843766376220616e-08, + "logits/chosen": -1.7093875408172607, + "logits/rejected": -1.6210181713104248, + "logps/chosen": -29.290470123291016, + "logps/rejected": -50.43269729614258, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7307620048522949, + "rewards/margins": 5.429152488708496, + "rewards/rejected": -6.159914016723633, + "step": 407 + }, + { + "epoch": 6.915254237288136, + "grad_norm": 5.393300173055395, + "learning_rate": 2.7585727251313195e-08, + "logits/chosen": -1.3670405149459839, + "logits/rejected": -1.1494945287704468, + "logps/chosen": -38.649288177490234, + "logps/rejected": -55.98222351074219, + "loss": 0.0388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4593397378921509, + "rewards/margins": 5.185694694519043, + "rewards/rejected": -5.645034313201904, + "step": 408 + }, + { + "epoch": 6.932203389830509, + "grad_norm": 4.837513841498393, + "learning_rate": 2.6746001106890377e-08, + "logits/chosen": -1.656294584274292, + "logits/rejected": -1.5977482795715332, + "logps/chosen": -29.096288681030273, + "logps/rejected": -46.76082229614258, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21669504046440125, + "rewards/margins": 5.154686450958252, + "rewards/rejected": -5.371381759643555, + "step": 409 + }, + { + "epoch": 6.9491525423728815, + "grad_norm": 5.248560918966433, + "learning_rate": 2.5918531429234364e-08, + "logits/chosen": -1.592848300933838, + "logits/rejected": -1.540168285369873, + "logps/chosen": -31.085020065307617, + "logps/rejected": -53.143741607666016, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6363227963447571, + "rewards/margins": 5.588076591491699, + "rewards/rejected": -6.224399089813232, + "step": 410 + }, + { + "epoch": 6.966101694915254, + "grad_norm": 5.2195974344109715, + "learning_rate": 2.5103363645771536e-08, + "logits/chosen": -1.65935218334198, + "logits/rejected": -1.6805386543273926, + "logps/chosen": -34.833702087402344, + "logps/rejected": -48.904422760009766, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1815856397151947, + "rewards/margins": 5.786221981048584, + "rewards/rejected": -5.967808246612549, + "step": 411 + }, + { + "epoch": 6.983050847457627, + "grad_norm": 5.470555151396108, + "learning_rate": 2.4300542508564114e-08, + "logits/chosen": -1.4322288036346436, + "logits/rejected": -1.3301326036453247, + "logps/chosen": -28.12655258178711, + "logps/rejected": -46.921104431152344, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31704282760620117, + "rewards/margins": 4.567241191864014, + "rewards/rejected": -4.884284019470215, + "step": 412 + }, + { + "epoch": 7.0, + "grad_norm": 5.578953824776314, + "learning_rate": 2.3510112091853357e-08, + "logits/chosen": -1.276556134223938, + "logits/rejected": -1.2297403812408447, + "logps/chosen": -21.443805694580078, + "logps/rejected": -48.807159423828125, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.017134033143520355, + "rewards/margins": 5.347229957580566, + "rewards/rejected": -5.330096244812012, + "step": 413 + }, + { + "epoch": 7.016949152542373, + "grad_norm": 6.999197685418954, + "learning_rate": 2.27321157896396e-08, + "logits/chosen": -1.5116349458694458, + "logits/rejected": -1.3849687576293945, + "logps/chosen": -27.718652725219727, + "logps/rejected": -47.36982345581055, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1311918944120407, + "rewards/margins": 5.171109676361084, + "rewards/rejected": -5.039917469024658, + "step": 414 + }, + { + "epoch": 7.033898305084746, + "grad_norm": 4.883886921689459, + "learning_rate": 2.1966596313300362e-08, + "logits/chosen": -1.6032100915908813, + "logits/rejected": -1.518362045288086, + "logps/chosen": -29.143009185791016, + "logps/rejected": -40.435001373291016, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24970372021198273, + "rewards/margins": 4.1580705642700195, + "rewards/rejected": -4.407774448394775, + "step": 415 + }, + { + "epoch": 7.0508474576271185, + "grad_norm": 5.855476388920514, + "learning_rate": 2.1213595689245384e-08, + "logits/chosen": -1.350822925567627, + "logits/rejected": -1.3723689317703247, + "logps/chosen": -23.788740158081055, + "logps/rejected": -40.06031799316406, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003228999674320221, + "rewards/margins": 4.06948184967041, + "rewards/rejected": -4.0727105140686035, + "step": 416 + }, + { + "epoch": 7.067796610169491, + "grad_norm": 5.942075006867736, + "learning_rate": 2.0473155256609363e-08, + "logits/chosen": -1.6869771480560303, + "logits/rejected": -1.6048226356506348, + "logps/chosen": -27.607175827026367, + "logps/rejected": -46.171112060546875, + "loss": 0.0478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15510573983192444, + "rewards/margins": 4.710604667663574, + "rewards/rejected": -4.865710258483887, + "step": 417 + }, + { + "epoch": 7.084745762711864, + "grad_norm": 5.9322015599453595, + "learning_rate": 1.9745315664982277e-08, + "logits/chosen": -1.5960508584976196, + "logits/rejected": -1.4311569929122925, + "logps/chosen": -20.32171630859375, + "logps/rejected": -41.58867645263672, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00745101273059845, + "rewards/margins": 5.94549560546875, + "rewards/rejected": -5.938044548034668, + "step": 418 + }, + { + "epoch": 7.101694915254237, + "grad_norm": 5.918275937210799, + "learning_rate": 1.9030116872178314e-08, + "logits/chosen": -1.699689269065857, + "logits/rejected": -1.5470737218856812, + "logps/chosen": -28.80027198791504, + "logps/rejected": -46.22833251953125, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4729538857936859, + "rewards/margins": 4.527037143707275, + "rewards/rejected": -4.999991416931152, + "step": 419 + }, + { + "epoch": 7.11864406779661, + "grad_norm": 6.2740518918594015, + "learning_rate": 1.8327598142041656e-08, + "logits/chosen": -1.4966247081756592, + "logits/rejected": -1.5006301403045654, + "logps/chosen": -39.597660064697266, + "logps/rejected": -59.670684814453125, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10089424252510071, + "rewards/margins": 5.958191871643066, + "rewards/rejected": -5.857297420501709, + "step": 420 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 5.848230208495223, + "learning_rate": 1.7637798042291125e-08, + "logits/chosen": -1.4466344118118286, + "logits/rejected": -1.4158880710601807, + "logps/chosen": -34.30101013183594, + "logps/rejected": -41.198646545410156, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5382071733474731, + "rewards/margins": 4.64283561706543, + "rewards/rejected": -5.1810431480407715, + "step": 421 + }, + { + "epoch": 7.1525423728813555, + "grad_norm": 4.83673555496754, + "learning_rate": 1.696075444240305e-08, + "logits/chosen": -1.481490135192871, + "logits/rejected": -1.3944220542907715, + "logps/chosen": -23.669601440429688, + "logps/rejected": -44.325191497802734, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12855800986289978, + "rewards/margins": 4.879583835601807, + "rewards/rejected": -5.008142471313477, + "step": 422 + }, + { + "epoch": 7.169491525423728, + "grad_norm": 5.610847549034329, + "learning_rate": 1.6296504511531834e-08, + "logits/chosen": -1.418038249015808, + "logits/rejected": -1.3723573684692383, + "logps/chosen": -27.353422164916992, + "logps/rejected": -49.684974670410156, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7425976395606995, + "rewards/margins": 4.4584550857543945, + "rewards/rejected": -5.201053142547607, + "step": 423 + }, + { + "epoch": 7.186440677966102, + "grad_norm": 5.7865923023012975, + "learning_rate": 1.5645084716469776e-08, + "logits/chosen": -1.6516090631484985, + "logits/rejected": -1.5412788391113281, + "logps/chosen": -33.868499755859375, + "logps/rejected": -49.333709716796875, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2233295440673828, + "rewards/margins": 5.964905261993408, + "rewards/rejected": -6.188235282897949, + "step": 424 + }, + { + "epoch": 7.203389830508475, + "grad_norm": 5.701505367137342, + "learning_rate": 1.5006530819644923e-08, + "logits/chosen": -1.416776418685913, + "logits/rejected": -1.5832912921905518, + "logps/chosen": -32.08591842651367, + "logps/rejected": -47.3320198059082, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.342715859413147, + "rewards/margins": 5.240063667297363, + "rewards/rejected": -5.582779884338379, + "step": 425 + }, + { + "epoch": 7.220338983050848, + "grad_norm": 4.437542789195646, + "learning_rate": 1.4380877877157832e-08, + "logits/chosen": -1.4847077131271362, + "logits/rejected": -1.4457132816314697, + "logps/chosen": -32.51253128051758, + "logps/rejected": -53.59151077270508, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6102309226989746, + "rewards/margins": 5.707812786102295, + "rewards/rejected": -6.3180437088012695, + "step": 426 + }, + { + "epoch": 7.237288135593221, + "grad_norm": 5.065631167693018, + "learning_rate": 1.3768160236856674e-08, + "logits/chosen": -1.4158098697662354, + "logits/rejected": -1.4317882061004639, + "logps/chosen": -29.830453872680664, + "logps/rejected": -54.25461196899414, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038821518421173096, + "rewards/margins": 5.01283597946167, + "rewards/rejected": -5.051657676696777, + "step": 427 + }, + { + "epoch": 7.254237288135593, + "grad_norm": 5.220392921602965, + "learning_rate": 1.316841153645215e-08, + "logits/chosen": -1.560931921005249, + "logits/rejected": -1.5064420700073242, + "logps/chosen": -30.01360321044922, + "logps/rejected": -48.26612854003906, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3863828778266907, + "rewards/margins": 5.301400184631348, + "rewards/rejected": -5.687783241271973, + "step": 428 + }, + { + "epoch": 7.271186440677966, + "grad_norm": 5.688311376504444, + "learning_rate": 1.2581664701670296e-08, + "logits/chosen": -1.52787446975708, + "logits/rejected": -1.4021316766738892, + "logps/chosen": -28.248119354248047, + "logps/rejected": -41.31006622314453, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3207528591156006, + "rewards/margins": 5.154188632965088, + "rewards/rejected": -5.474941253662109, + "step": 429 + }, + { + "epoch": 7.288135593220339, + "grad_norm": 5.800049450685864, + "learning_rate": 1.2007951944445121e-08, + "logits/chosen": -1.4032049179077148, + "logits/rejected": -1.3456979990005493, + "logps/chosen": -24.016460418701172, + "logps/rejected": -41.04709243774414, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04568904638290405, + "rewards/margins": 3.968392848968506, + "rewards/rejected": -4.0140814781188965, + "step": 430 + }, + { + "epoch": 7.305084745762712, + "grad_norm": 6.591249552173131, + "learning_rate": 1.144730476115019e-08, + "logits/chosen": -1.4663312435150146, + "logits/rejected": -1.437060832977295, + "logps/chosen": -25.586219787597656, + "logps/rejected": -59.44462585449219, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.418854296207428, + "rewards/margins": 6.551999092102051, + "rewards/rejected": -6.970853328704834, + "step": 431 + }, + { + "epoch": 7.322033898305085, + "grad_norm": 5.1436479741963765, + "learning_rate": 1.0899753930869394e-08, + "logits/chosen": -1.479379415512085, + "logits/rejected": -1.4856846332550049, + "logps/chosen": -24.105491638183594, + "logps/rejected": -42.0874137878418, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016550958156585693, + "rewards/margins": 4.623503684997559, + "rewards/rejected": -4.640054702758789, + "step": 432 + }, + { + "epoch": 7.338983050847458, + "grad_norm": 6.13135090791532, + "learning_rate": 1.036532951370736e-08, + "logits/chosen": -1.5114121437072754, + "logits/rejected": -1.487284541130066, + "logps/chosen": -30.636520385742188, + "logps/rejected": -55.582489013671875, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0013150721788406372, + "rewards/margins": 6.1965837478637695, + "rewards/rejected": -6.1952691078186035, + "step": 433 + }, + { + "epoch": 7.3559322033898304, + "grad_norm": 5.332642447483903, + "learning_rate": 9.844060849138997e-09, + "logits/chosen": -1.599272608757019, + "logits/rejected": -1.610948085784912, + "logps/chosen": -24.916534423828125, + "logps/rejected": -39.19096755981445, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01982739567756653, + "rewards/margins": 4.651766777038574, + "rewards/rejected": -4.671594142913818, + "step": 434 + }, + { + "epoch": 7.372881355932203, + "grad_norm": 6.134889313671512, + "learning_rate": 9.335976554398912e-09, + "logits/chosen": -1.540908932685852, + "logits/rejected": -1.5109443664550781, + "logps/chosen": -32.279666900634766, + "logps/rejected": -38.13484191894531, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6547858715057373, + "rewards/margins": 3.962502956390381, + "rewards/rejected": -4.617289066314697, + "step": 435 + }, + { + "epoch": 7.389830508474576, + "grad_norm": 5.165828974962686, + "learning_rate": 8.841104522910342e-09, + "logits/chosen": -1.5303874015808105, + "logits/rejected": -1.4300156831741333, + "logps/chosen": -33.03150939941406, + "logps/rejected": -49.22145080566406, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05898144841194153, + "rewards/margins": 5.506203651428223, + "rewards/rejected": -5.565184593200684, + "step": 436 + }, + { + "epoch": 7.406779661016949, + "grad_norm": 5.59136345566115, + "learning_rate": 8.359471922753714e-09, + "logits/chosen": -1.5242373943328857, + "logits/rejected": -1.3456315994262695, + "logps/chosen": -29.918407440185547, + "logps/rejected": -53.36083984375, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07045301795005798, + "rewards/margins": 5.991230487823486, + "rewards/rejected": -5.920777320861816, + "step": 437 + }, + { + "epoch": 7.423728813559322, + "grad_norm": 5.700922848308937, + "learning_rate": 7.891105195175356e-09, + "logits/chosen": -1.421931505203247, + "logits/rejected": -1.4313148260116577, + "logps/chosen": -30.72463607788086, + "logps/rejected": -39.33094787597656, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35566258430480957, + "rewards/margins": 4.320624351501465, + "rewards/rejected": -4.676286697387695, + "step": 438 + }, + { + "epoch": 7.440677966101695, + "grad_norm": 4.895447924779327, + "learning_rate": 7.4360300531355894e-09, + "logits/chosen": -1.2597088813781738, + "logits/rejected": -1.1974666118621826, + "logps/chosen": -34.747318267822266, + "logps/rejected": -60.813480377197266, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4029719829559326, + "rewards/margins": 5.782626628875732, + "rewards/rejected": -6.185598373413086, + "step": 439 + }, + { + "epoch": 7.4576271186440675, + "grad_norm": 4.753436790014126, + "learning_rate": 6.994271479897313e-09, + "logits/chosen": -1.2228598594665527, + "logits/rejected": -1.2148244380950928, + "logps/chosen": -23.159412384033203, + "logps/rejected": -39.971435546875, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20260876417160034, + "rewards/margins": 4.695430278778076, + "rewards/rejected": -4.49282169342041, + "step": 440 + }, + { + "epoch": 7.47457627118644, + "grad_norm": 5.465778826325062, + "learning_rate": 6.565853727654502e-09, + "logits/chosen": -1.6256568431854248, + "logits/rejected": -1.6748077869415283, + "logps/chosen": -35.98029327392578, + "logps/rejected": -51.9576530456543, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7835733890533447, + "rewards/margins": 5.101205825805664, + "rewards/rejected": -5.884779453277588, + "step": 441 + }, + { + "epoch": 7.491525423728813, + "grad_norm": 5.19068823671553, + "learning_rate": 6.150800316200605e-09, + "logits/chosen": -1.5622632503509521, + "logits/rejected": -1.6051169633865356, + "logps/chosen": -26.639972686767578, + "logps/rejected": -39.97083282470703, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18523569405078888, + "rewards/margins": 5.357626914978027, + "rewards/rejected": -5.172390937805176, + "step": 442 + }, + { + "epoch": 7.508474576271187, + "grad_norm": 5.290094026909458, + "learning_rate": 5.7491340316373485e-09, + "logits/chosen": -1.3767602443695068, + "logits/rejected": -1.2797472476959229, + "logps/chosen": -26.007776260375977, + "logps/rejected": -53.1204719543457, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03208550810813904, + "rewards/margins": 6.148884296417236, + "rewards/rejected": -6.18096923828125, + "step": 443 + }, + { + "epoch": 7.52542372881356, + "grad_norm": 5.384685543036387, + "learning_rate": 5.360876925123992e-09, + "logits/chosen": -1.660988450050354, + "logits/rejected": -1.495483160018921, + "logps/chosen": -38.71052551269531, + "logps/rejected": -63.9692497253418, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.767485499382019, + "rewards/margins": 6.337566375732422, + "rewards/rejected": -7.105052947998047, + "step": 444 + }, + { + "epoch": 7.5423728813559325, + "grad_norm": 6.204818072148143, + "learning_rate": 4.9860503116665176e-09, + "logits/chosen": -1.7063543796539307, + "logits/rejected": -1.692810297012329, + "logps/chosen": -25.562942504882812, + "logps/rejected": -48.51639175415039, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01574818789958954, + "rewards/margins": 4.9537529945373535, + "rewards/rejected": -4.969500541687012, + "step": 445 + }, + { + "epoch": 7.559322033898305, + "grad_norm": 5.655663554463141, + "learning_rate": 4.624674768947484e-09, + "logits/chosen": -1.598649024963379, + "logits/rejected": -1.4650629758834839, + "logps/chosen": -27.018341064453125, + "logps/rejected": -46.08781051635742, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027409642934799194, + "rewards/margins": 4.6786322593688965, + "rewards/rejected": -4.681373596191406, + "step": 446 + }, + { + "epoch": 7.576271186440678, + "grad_norm": 5.701136400159739, + "learning_rate": 4.2767701361964835e-09, + "logits/chosen": -1.3610193729400635, + "logits/rejected": -1.3563426733016968, + "logps/chosen": -35.585716247558594, + "logps/rejected": -50.32651901245117, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6664023995399475, + "rewards/margins": 4.827932834625244, + "rewards/rejected": -5.494334697723389, + "step": 447 + }, + { + "epoch": 7.593220338983051, + "grad_norm": 6.209794879281624, + "learning_rate": 3.942355513100792e-09, + "logits/chosen": -1.4733314514160156, + "logits/rejected": -1.4402073621749878, + "logps/chosen": -28.033966064453125, + "logps/rejected": -54.08881378173828, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.519957423210144, + "rewards/margins": 5.857639312744141, + "rewards/rejected": -6.377596855163574, + "step": 448 + }, + { + "epoch": 7.610169491525424, + "grad_norm": 4.997367172232428, + "learning_rate": 3.6214492587569313e-09, + "logits/chosen": -1.5576658248901367, + "logits/rejected": -1.6168861389160156, + "logps/chosen": -32.894317626953125, + "logps/rejected": -41.69158172607422, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22915619611740112, + "rewards/margins": 4.693375110626221, + "rewards/rejected": -4.922531604766846, + "step": 449 + }, + { + "epoch": 7.627118644067797, + "grad_norm": 5.4083679319206555, + "learning_rate": 3.314068990662805e-09, + "logits/chosen": -1.619524359703064, + "logits/rejected": -1.469055414199829, + "logps/chosen": -25.54304313659668, + "logps/rejected": -38.69428634643555, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.059964776039123535, + "rewards/margins": 5.082298278808594, + "rewards/rejected": -5.02233362197876, + "step": 450 + }, + { + "epoch": 7.6440677966101696, + "grad_norm": 5.217030218721459, + "learning_rate": 3.0202315837502545e-09, + "logits/chosen": -1.4242594242095947, + "logits/rejected": -1.439193844795227, + "logps/chosen": -31.73979949951172, + "logps/rejected": -39.235633850097656, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5601508617401123, + "rewards/margins": 4.23047399520874, + "rewards/rejected": -4.790624618530273, + "step": 451 + }, + { + "epoch": 7.661016949152542, + "grad_norm": 5.102186940922818, + "learning_rate": 2.7399531694589917e-09, + "logits/chosen": -1.5301525592803955, + "logits/rejected": -1.5297596454620361, + "logps/chosen": -27.26993179321289, + "logps/rejected": -46.745147705078125, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15520310401916504, + "rewards/margins": 5.617114543914795, + "rewards/rejected": -5.772317409515381, + "step": 452 + }, + { + "epoch": 7.677966101694915, + "grad_norm": 5.403786442321042, + "learning_rate": 2.473249134850808e-09, + "logits/chosen": -1.3538662195205688, + "logits/rejected": -1.3246500492095947, + "logps/chosen": -22.38913345336914, + "logps/rejected": -45.14683532714844, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.044996634125709534, + "rewards/margins": 4.89539098739624, + "rewards/rejected": -4.850394248962402, + "step": 453 + }, + { + "epoch": 7.694915254237288, + "grad_norm": 6.421766143852716, + "learning_rate": 2.220134121764833e-09, + "logits/chosen": -1.4915080070495605, + "logits/rejected": -1.452484130859375, + "logps/chosen": -16.578571319580078, + "logps/rejected": -37.22169876098633, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8915931582450867, + "rewards/margins": 5.497744560241699, + "rewards/rejected": -4.606151580810547, + "step": 454 + }, + { + "epoch": 7.711864406779661, + "grad_norm": 5.924650241904462, + "learning_rate": 1.9806220260137065e-09, + "logits/chosen": -1.5789787769317627, + "logits/rejected": -1.4192255735397339, + "logps/chosen": -30.962202072143555, + "logps/rejected": -46.20201873779297, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06508506834506989, + "rewards/margins": 5.4069318771362305, + "rewards/rejected": -5.341846466064453, + "step": 455 + }, + { + "epoch": 7.728813559322034, + "grad_norm": 5.712693057718049, + "learning_rate": 1.7547259966207705e-09, + "logits/chosen": -1.5617210865020752, + "logits/rejected": -1.489423394203186, + "logps/chosen": -30.215980529785156, + "logps/rejected": -46.79343795776367, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2140485942363739, + "rewards/margins": 6.268076419830322, + "rewards/rejected": -6.482125282287598, + "step": 456 + }, + { + "epoch": 7.745762711864407, + "grad_norm": 4.765004207763158, + "learning_rate": 1.5424584350981485e-09, + "logits/chosen": -1.5075204372406006, + "logits/rejected": -1.446047067642212, + "logps/chosen": -25.82567596435547, + "logps/rejected": -43.98649597167969, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2662041187286377, + "rewards/margins": 5.002760887145996, + "rewards/rejected": -5.268965244293213, + "step": 457 + }, + { + "epoch": 7.762711864406779, + "grad_norm": 4.613094215982073, + "learning_rate": 1.343830994765982e-09, + "logits/chosen": -1.5371060371398926, + "logits/rejected": -1.4451144933700562, + "logps/chosen": -26.201602935791016, + "logps/rejected": -57.06352996826172, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1458522379398346, + "rewards/margins": 6.390477657318115, + "rewards/rejected": -6.53632926940918, + "step": 458 + }, + { + "epoch": 7.779661016949152, + "grad_norm": 4.4872892882814535, + "learning_rate": 1.1588545801125837e-09, + "logits/chosen": -1.7496167421340942, + "logits/rejected": -1.6176550388336182, + "logps/chosen": -36.218074798583984, + "logps/rejected": -55.342689514160156, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4042947292327881, + "rewards/margins": 5.6281538009643555, + "rewards/rejected": -6.032447814941406, + "step": 459 + }, + { + "epoch": 7.796610169491525, + "grad_norm": 5.575240209393775, + "learning_rate": 9.87539346195776e-10, + "logits/chosen": -1.4099677801132202, + "logits/rejected": -1.2582119703292847, + "logps/chosen": -29.023414611816406, + "logps/rejected": -42.5074577331543, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27348968386650085, + "rewards/margins": 4.847697734832764, + "rewards/rejected": -5.121187210083008, + "step": 460 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 5.199396752692828, + "learning_rate": 8.298946980855315e-10, + "logits/chosen": -1.472285509109497, + "logits/rejected": -1.330193281173706, + "logps/chosen": -29.86899185180664, + "logps/rejected": -41.80485153198242, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15588414669036865, + "rewards/margins": 5.636477470397949, + "rewards/rejected": -5.792361259460449, + "step": 461 + }, + { + "epoch": 7.830508474576272, + "grad_norm": 4.521518393613341, + "learning_rate": 6.8592929034747e-10, + "logits/chosen": -1.4151493310928345, + "logits/rejected": -1.4073522090911865, + "logps/chosen": -27.720748901367188, + "logps/rejected": -53.330909729003906, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1728513091802597, + "rewards/margins": 5.166114807128906, + "rewards/rejected": -5.338965892791748, + "step": 462 + }, + { + "epoch": 7.847457627118644, + "grad_norm": 4.567347653583189, + "learning_rate": 5.556510265678771e-10, + "logits/chosen": -1.570703387260437, + "logits/rejected": -1.5427438020706177, + "logps/chosen": -24.184301376342773, + "logps/rejected": -44.638912200927734, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29552435874938965, + "rewards/margins": 5.448472499847412, + "rewards/rejected": -5.743997573852539, + "step": 463 + }, + { + "epoch": 7.864406779661017, + "grad_norm": 5.544082314499855, + "learning_rate": 4.390670589196621e-10, + "logits/chosen": -1.6661534309387207, + "logits/rejected": -1.5269910097122192, + "logps/chosen": -24.99061393737793, + "logps/rejected": -45.773014068603516, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4490078091621399, + "rewards/margins": 5.556340217590332, + "rewards/rejected": -6.005348205566406, + "step": 464 + }, + { + "epoch": 7.88135593220339, + "grad_norm": 4.867190786041254, + "learning_rate": 3.3618378776981147e-10, + "logits/chosen": -1.6171151399612427, + "logits/rejected": -1.5128624439239502, + "logps/chosen": -28.165693283081055, + "logps/rejected": -41.43475341796875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2984722852706909, + "rewards/margins": 4.441769599914551, + "rewards/rejected": -4.14329719543457, + "step": 465 + }, + { + "epoch": 7.898305084745763, + "grad_norm": 5.660749716669039, + "learning_rate": 2.4700686132803075e-10, + "logits/chosen": -1.5283398628234863, + "logits/rejected": -1.484012484550476, + "logps/chosen": -29.490188598632812, + "logps/rejected": -45.15831756591797, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029849261045455933, + "rewards/margins": 5.0221357345581055, + "rewards/rejected": -4.992286682128906, + "step": 466 + }, + { + "epoch": 7.915254237288136, + "grad_norm": 6.365487263229808, + "learning_rate": 1.715411753365481e-10, + "logits/chosen": -1.649171233177185, + "logits/rejected": -1.701267957687378, + "logps/chosen": -25.891307830810547, + "logps/rejected": -47.70183181762695, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6622889041900635, + "rewards/margins": 5.317337989807129, + "rewards/rejected": -5.9796271324157715, + "step": 467 + }, + { + "epoch": 7.932203389830509, + "grad_norm": 4.885763801093592, + "learning_rate": 1.0979087280141297e-10, + "logits/chosen": -1.2668333053588867, + "logits/rejected": -1.3214877843856812, + "logps/chosen": -21.41905403137207, + "logps/rejected": -38.722625732421875, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.182024747133255, + "rewards/margins": 4.632743835449219, + "rewards/rejected": -4.814768314361572, + "step": 468 + }, + { + "epoch": 7.9491525423728815, + "grad_norm": 5.3854848332900085, + "learning_rate": 6.175934376509429e-11, + "logits/chosen": -1.5046411752700806, + "logits/rejected": -1.5680880546569824, + "logps/chosen": -28.198741912841797, + "logps/rejected": -59.644596099853516, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0053573548793792725, + "rewards/margins": 6.178599834442139, + "rewards/rejected": -6.173242568969727, + "step": 469 + }, + { + "epoch": 7.966101694915254, + "grad_norm": 4.75232754040335, + "learning_rate": 2.7449225120268482e-11, + "logits/chosen": -1.4780237674713135, + "logits/rejected": -1.427555799484253, + "logps/chosen": -27.112102508544922, + "logps/rejected": -52.26079177856445, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19119581580162048, + "rewards/margins": 6.167375087738037, + "rewards/rejected": -6.3585710525512695, + "step": 470 + }, + { + "epoch": 7.983050847457627, + "grad_norm": 5.233750895631027, + "learning_rate": 6.862400465157403e-12, + "logits/chosen": -1.5693848133087158, + "logits/rejected": -1.5205610990524292, + "logps/chosen": -33.29484558105469, + "logps/rejected": -41.09092712402344, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30308201909065247, + "rewards/margins": 4.717187881469727, + "rewards/rejected": -5.020270347595215, + "step": 471 + }, + { + "epoch": 8.0, + "grad_norm": 6.397324083693935, + "learning_rate": 0.0, + "logits/chosen": -1.639676570892334, + "logits/rejected": -1.6202343702316284, + "logps/chosen": -33.1722526550293, + "logps/rejected": -40.651100158691406, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.057152003049850464, + "rewards/margins": 4.614887237548828, + "rewards/rejected": -4.672039031982422, + "step": 472 + }, + { + "epoch": 8.0, + "step": 472, + "total_flos": 0.0, + "train_loss": 0.15622181608736263, + "train_runtime": 4721.8854, + "train_samples_per_second": 12.788, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1, + "max_steps": 472, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 400, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}