{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 74.50819179863889, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.7660439014434814, "logits/rejected": -2.717564582824707, "logps/chosen": -269.8568420410156, "logps/rejected": -360.52459716796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 71.5827858042053, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.592801809310913, "logits/rejected": -2.5633366107940674, "logps/chosen": -264.5331726074219, "logps/rejected": -251.33367919921875, "loss": 0.6884, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.2647041380405426, "rewards/margins": 0.0454571396112442, "rewards/rejected": 0.2192470282316208, "step": 10 }, { "epoch": 0.04, "grad_norm": 33.37630632393394, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6635663509368896, "logits/rejected": -2.6177525520324707, "logps/chosen": -275.1928405761719, "logps/rejected": -290.4365539550781, "loss": 0.5763, "rewards/accuracies": 0.5, "rewards/chosen": 6.3604888916015625, "rewards/margins": -0.009852093644440174, "rewards/rejected": 6.370340824127197, "step": 20 }, { "epoch": 0.06, "grad_norm": 22.1278736890366, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.7272486686706543, "logits/rejected": -2.667067527770996, "logps/chosen": -285.1613464355469, "logps/rejected": -249.3108367919922, "loss": 0.4416, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 15.510467529296875, "rewards/margins": 0.8711569905281067, "rewards/rejected": 14.639310836791992, "step": 30 }, { "epoch": 0.08, "grad_norm": 17.071895487907064, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6888694763183594, "logits/rejected": -2.6701016426086426, "logps/chosen": -247.84716796875, "logps/rejected": -227.38131713867188, "loss": 0.3982, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 19.278215408325195, "rewards/margins": 2.267552137374878, "rewards/rejected": 17.010662078857422, "step": 40 }, { "epoch": 0.1, "grad_norm": 14.78162706214556, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.659508466720581, "logits/rejected": -2.6249804496765137, "logps/chosen": -259.9454650878906, "logps/rejected": -272.14227294921875, "loss": 0.3676, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 19.786420822143555, "rewards/margins": -0.8553922772407532, "rewards/rejected": 20.64181137084961, "step": 50 }, { "epoch": 0.13, "grad_norm": 14.285832773490087, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.6977388858795166, "logits/rejected": -2.654181957244873, "logps/chosen": -247.1780242919922, "logps/rejected": -275.7373962402344, "loss": 0.3521, "rewards/accuracies": 0.5625, "rewards/chosen": 24.428516387939453, "rewards/margins": 2.0845706462860107, "rewards/rejected": 22.343944549560547, "step": 60 }, { "epoch": 0.15, "grad_norm": 14.416469937136577, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.6118428707122803, "logits/rejected": -2.625479221343994, "logps/chosen": -239.4540252685547, "logps/rejected": -232.90463256835938, "loss": 0.3304, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 26.162424087524414, "rewards/margins": 2.349818706512451, "rewards/rejected": 23.812606811523438, "step": 70 }, { "epoch": 0.17, "grad_norm": 15.840881084472352, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.7612788677215576, "logits/rejected": -2.7243030071258545, "logps/chosen": -295.0336608886719, "logps/rejected": -240.8730010986328, "loss": 0.3248, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 27.784252166748047, "rewards/margins": 4.598628997802734, "rewards/rejected": 23.185623168945312, "step": 80 }, { "epoch": 0.19, "grad_norm": 13.661268677283298, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.6661014556884766, "logits/rejected": -2.645249128341675, "logps/chosen": -231.57553100585938, "logps/rejected": -228.09091186523438, "loss": 0.3223, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 27.535770416259766, "rewards/margins": 3.228619337081909, "rewards/rejected": 24.30714988708496, "step": 90 }, { "epoch": 0.21, "grad_norm": 11.61288143003843, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.6386702060699463, "logits/rejected": -2.6339759826660156, "logps/chosen": -233.39047241210938, "logps/rejected": -232.5922393798828, "loss": 0.3163, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 26.968032836914062, "rewards/margins": 2.5318057537078857, "rewards/rejected": 24.436227798461914, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.6968541145324707, "eval_logits/rejected": -2.670072555541992, "eval_logps/chosen": -235.37875366210938, "eval_logps/rejected": -238.44345092773438, "eval_loss": 0.31289389729499817, "eval_rewards/accuracies": 0.58203125, "eval_rewards/chosen": 27.21471405029297, "eval_rewards/margins": 2.99098801612854, "eval_rewards/rejected": 24.223726272583008, "eval_runtime": 96.735, "eval_samples_per_second": 20.675, "eval_steps_per_second": 0.331, "step": 100 }, { "epoch": 0.23, "grad_norm": 11.688620320219954, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.7125041484832764, "logits/rejected": -2.6624934673309326, "logps/chosen": -276.029052734375, "logps/rejected": -234.1141815185547, "loss": 0.3136, "rewards/accuracies": 0.625, "rewards/chosen": 28.551036834716797, "rewards/margins": 4.829342842102051, "rewards/rejected": 23.72169303894043, "step": 110 }, { "epoch": 0.25, "grad_norm": 14.849649400244427, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.6516470909118652, "logits/rejected": -2.647688865661621, "logps/chosen": -253.4019317626953, "logps/rejected": -234.5045623779297, "loss": 0.3065, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 27.765233993530273, "rewards/margins": 2.4132068157196045, "rewards/rejected": 25.352027893066406, "step": 120 }, { "epoch": 0.27, "grad_norm": 12.095477452171375, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.679412364959717, "logits/rejected": -2.6742541790008545, "logps/chosen": -249.6054229736328, "logps/rejected": -241.8912811279297, "loss": 0.2993, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 32.39772415161133, "rewards/margins": 5.853152275085449, "rewards/rejected": 26.544570922851562, "step": 130 }, { "epoch": 0.29, "grad_norm": 13.237989201417717, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.7010607719421387, "logits/rejected": -2.689103603363037, "logps/chosen": -284.6669921875, "logps/rejected": -270.44970703125, "loss": 0.3016, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 31.298425674438477, "rewards/margins": 1.071274995803833, "rewards/rejected": 30.22715187072754, "step": 140 }, { "epoch": 0.31, "grad_norm": 11.533759549255185, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.6910300254821777, "logits/rejected": -2.6623480319976807, "logps/chosen": -251.215576171875, "logps/rejected": -248.98348999023438, "loss": 0.2985, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 32.008628845214844, "rewards/margins": 4.783123970031738, "rewards/rejected": 27.225509643554688, "step": 150 }, { "epoch": 0.33, "grad_norm": 13.117822478323479, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.721895217895508, "logits/rejected": -2.675842523574829, "logps/chosen": -242.4053192138672, "logps/rejected": -230.8060302734375, "loss": 0.3009, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 30.662723541259766, "rewards/margins": 4.044883728027344, "rewards/rejected": 26.61783790588379, "step": 160 }, { "epoch": 0.36, "grad_norm": 11.340151801902158, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.670436382293701, "logits/rejected": -2.632450819015503, "logps/chosen": -220.5222625732422, "logps/rejected": -204.80908203125, "loss": 0.2938, "rewards/accuracies": 0.59375, "rewards/chosen": 28.81294822692871, "rewards/margins": 2.497253179550171, "rewards/rejected": 26.31569480895996, "step": 170 }, { "epoch": 0.38, "grad_norm": 11.477634324684333, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.646768093109131, "logits/rejected": -2.6306955814361572, "logps/chosen": -225.45016479492188, "logps/rejected": -200.42015075683594, "loss": 0.2914, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 30.908817291259766, "rewards/margins": 3.7578415870666504, "rewards/rejected": 27.150976181030273, "step": 180 }, { "epoch": 0.4, "grad_norm": 13.566633133843082, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.679771900177002, "logits/rejected": -2.6499440670013428, "logps/chosen": -241.45156860351562, "logps/rejected": -231.2630615234375, "loss": 0.2963, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 29.79128646850586, "rewards/margins": 1.2995483875274658, "rewards/rejected": 28.49173927307129, "step": 190 }, { "epoch": 0.42, "grad_norm": 16.736011308973627, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.630007028579712, "logits/rejected": -2.6183559894561768, "logps/chosen": -230.09048461914062, "logps/rejected": -223.8180694580078, "loss": 0.2918, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 29.806177139282227, "rewards/margins": 1.575269341468811, "rewards/rejected": 28.230907440185547, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -2.708475112915039, "eval_logits/rejected": -2.682575225830078, "eval_logps/chosen": -232.24124145507812, "eval_logps/rejected": -236.21038818359375, "eval_loss": 0.29230329394340515, "eval_rewards/accuracies": 0.58203125, "eval_rewards/chosen": 30.35222816467285, "eval_rewards/margins": 3.8954334259033203, "eval_rewards/rejected": 26.45679473876953, "eval_runtime": 96.829, "eval_samples_per_second": 20.655, "eval_steps_per_second": 0.33, "step": 200 }, { "epoch": 0.44, "grad_norm": 11.417465496451523, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.6330389976501465, "logits/rejected": -2.6055209636688232, "logps/chosen": -257.6673889160156, "logps/rejected": -225.943359375, "loss": 0.2902, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 32.02475357055664, "rewards/margins": 6.720486640930176, "rewards/rejected": 25.304264068603516, "step": 210 }, { "epoch": 0.46, "grad_norm": 12.04727391696027, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.5957412719726562, "logits/rejected": -2.5795822143554688, "logps/chosen": -233.29476928710938, "logps/rejected": -217.3531951904297, "loss": 0.2919, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 32.082313537597656, "rewards/margins": 3.7717392444610596, "rewards/rejected": 28.310577392578125, "step": 220 }, { "epoch": 0.48, "grad_norm": 11.505656123665526, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.6124305725097656, "logits/rejected": -2.5944228172302246, "logps/chosen": -217.5354461669922, "logps/rejected": -220.5460205078125, "loss": 0.3047, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 31.32999038696289, "rewards/margins": 4.138183116912842, "rewards/rejected": 27.19180679321289, "step": 230 }, { "epoch": 0.5, "grad_norm": 11.083392566284138, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.660727024078369, "logits/rejected": -2.6385245323181152, "logps/chosen": -232.0665740966797, "logps/rejected": -219.62210083007812, "loss": 0.2834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 31.633642196655273, "rewards/margins": 2.1873562335968018, "rewards/rejected": 29.446285247802734, "step": 240 }, { "epoch": 0.52, "grad_norm": 11.463127161742676, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.6206917762756348, "logits/rejected": -2.576387405395508, "logps/chosen": -264.06439208984375, "logps/rejected": -229.7786865234375, "loss": 0.2818, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 34.12608337402344, "rewards/margins": 4.382205009460449, "rewards/rejected": 29.743881225585938, "step": 250 }, { "epoch": 0.54, "grad_norm": 10.661524920447267, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.6774675846099854, "logits/rejected": -2.668527364730835, "logps/chosen": -260.33514404296875, "logps/rejected": -225.80810546875, "loss": 0.2858, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 33.976402282714844, "rewards/margins": 5.804098606109619, "rewards/rejected": 28.17230224609375, "step": 260 }, { "epoch": 0.56, "grad_norm": 11.916616915089687, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.6731224060058594, "logits/rejected": -2.6551766395568848, "logps/chosen": -245.6435089111328, "logps/rejected": -228.1649932861328, "loss": 0.2808, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 31.35245704650879, "rewards/margins": 1.8731645345687866, "rewards/rejected": 29.479290008544922, "step": 270 }, { "epoch": 0.59, "grad_norm": 11.982078860289866, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.6452529430389404, "logits/rejected": -2.6127915382385254, "logps/chosen": -229.02554321289062, "logps/rejected": -215.188720703125, "loss": 0.2835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 32.651554107666016, "rewards/margins": 5.653929233551025, "rewards/rejected": 26.99761962890625, "step": 280 }, { "epoch": 0.61, "grad_norm": 11.17239233559609, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.675553321838379, "logits/rejected": -2.662069082260132, "logps/chosen": -219.8170928955078, "logps/rejected": -211.7806396484375, "loss": 0.2849, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 31.27024269104004, "rewards/margins": 1.0949894189834595, "rewards/rejected": 30.175247192382812, "step": 290 }, { "epoch": 0.63, "grad_norm": 9.847053265544167, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.65397572517395, "logits/rejected": -2.6134414672851562, "logps/chosen": -268.84588623046875, "logps/rejected": -232.80752563476562, "loss": 0.286, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 32.15021514892578, "rewards/margins": 4.852233409881592, "rewards/rejected": 27.297988891601562, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -2.695726156234741, "eval_logits/rejected": -2.6716713905334473, "eval_logps/chosen": -231.15402221679688, "eval_logps/rejected": -235.42864990234375, "eval_loss": 0.29209351539611816, "eval_rewards/accuracies": 0.58203125, "eval_rewards/chosen": 31.439437866210938, "eval_rewards/margins": 4.200903415679932, "eval_rewards/rejected": 27.238534927368164, "eval_runtime": 96.789, "eval_samples_per_second": 20.664, "eval_steps_per_second": 0.331, "step": 300 }, { "epoch": 0.65, "grad_norm": 11.299461074514115, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.6087942123413086, "logits/rejected": -2.607959270477295, "logps/chosen": -263.2939758300781, "logps/rejected": -229.5752716064453, "loss": 0.2804, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 32.041908264160156, "rewards/margins": 3.100654363632202, "rewards/rejected": 28.941247940063477, "step": 310 }, { "epoch": 0.67, "grad_norm": 11.979925902064297, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.652468204498291, "logits/rejected": -2.6433398723602295, "logps/chosen": -260.83233642578125, "logps/rejected": -216.2664337158203, "loss": 0.2788, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 32.8377571105957, "rewards/margins": 4.280916213989258, "rewards/rejected": 28.556838989257812, "step": 320 }, { "epoch": 0.69, "grad_norm": 10.289416601586245, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.678496837615967, "logits/rejected": -2.634920835494995, "logps/chosen": -229.55624389648438, "logps/rejected": -231.64407348632812, "loss": 0.2812, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 32.6539306640625, "rewards/margins": 4.799349784851074, "rewards/rejected": 27.854583740234375, "step": 330 }, { "epoch": 0.71, "grad_norm": 12.940304501019066, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.687782049179077, "logits/rejected": -2.6474757194519043, "logps/chosen": -258.529541015625, "logps/rejected": -247.69125366210938, "loss": 0.2752, "rewards/accuracies": 0.53125, "rewards/chosen": 31.464908599853516, "rewards/margins": -0.8856052160263062, "rewards/rejected": 32.35051727294922, "step": 340 }, { "epoch": 0.73, "grad_norm": 13.446019747621028, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.6681811809539795, "logits/rejected": -2.6358139514923096, "logps/chosen": -227.58425903320312, "logps/rejected": -212.9467010498047, "loss": 0.2866, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 30.626983642578125, "rewards/margins": 2.8648905754089355, "rewards/rejected": 27.7620906829834, "step": 350 }, { "epoch": 0.75, "grad_norm": 10.212615361555141, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.691338300704956, "logits/rejected": -2.6329030990600586, "logps/chosen": -269.2547302246094, "logps/rejected": -233.14053344726562, "loss": 0.2785, "rewards/accuracies": 0.625, "rewards/chosen": 33.437278747558594, "rewards/margins": 5.27285623550415, "rewards/rejected": 28.1644287109375, "step": 360 }, { "epoch": 0.77, "grad_norm": 12.701608094493194, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.6507325172424316, "logits/rejected": -2.6226696968078613, "logps/chosen": -243.0960693359375, "logps/rejected": -207.664794921875, "loss": 0.2854, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 32.23695373535156, "rewards/margins": 1.6676933765411377, "rewards/rejected": 30.569263458251953, "step": 370 }, { "epoch": 0.79, "grad_norm": 11.004484883830752, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.592874526977539, "logits/rejected": -2.5939741134643555, "logps/chosen": -206.689697265625, "logps/rejected": -228.67898559570312, "loss": 0.2774, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 32.79497146606445, "rewards/margins": 2.7575299739837646, "rewards/rejected": 30.037445068359375, "step": 380 }, { "epoch": 0.82, "grad_norm": 12.608909298282311, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.6360385417938232, "logits/rejected": -2.6261894702911377, "logps/chosen": -270.9910888671875, "logps/rejected": -252.8332977294922, "loss": 0.276, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 34.645816802978516, "rewards/margins": 3.508648633956909, "rewards/rejected": 31.137165069580078, "step": 390 }, { "epoch": 0.84, "grad_norm": 11.347134923103408, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.632523775100708, "logits/rejected": -2.594832181930542, "logps/chosen": -236.8807830810547, "logps/rejected": -237.6399688720703, "loss": 0.2819, "rewards/accuracies": 0.625, "rewards/chosen": 33.6544189453125, "rewards/margins": 4.281933784484863, "rewards/rejected": 29.372488021850586, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -2.6868975162506104, "eval_logits/rejected": -2.66192626953125, "eval_logps/chosen": -230.7387237548828, "eval_logps/rejected": -235.19105529785156, "eval_loss": 0.2787904143333435, "eval_rewards/accuracies": 0.578125, "eval_rewards/chosen": 31.854747772216797, "eval_rewards/margins": 4.3786234855651855, "eval_rewards/rejected": 27.476125717163086, "eval_runtime": 96.6885, "eval_samples_per_second": 20.685, "eval_steps_per_second": 0.331, "step": 400 }, { "epoch": 0.86, "grad_norm": 12.175943173191595, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.6695199012756348, "logits/rejected": -2.626798152923584, "logps/chosen": -263.4989318847656, "logps/rejected": -240.9721221923828, "loss": 0.2806, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 35.418556213378906, "rewards/margins": 7.573515892028809, "rewards/rejected": 27.845043182373047, "step": 410 }, { "epoch": 0.88, "grad_norm": 11.7624491150407, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.6308817863464355, "logits/rejected": -2.620222568511963, "logps/chosen": -264.280517578125, "logps/rejected": -247.2097625732422, "loss": 0.2882, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 32.79326248168945, "rewards/margins": 5.5407843589782715, "rewards/rejected": 27.252477645874023, "step": 420 }, { "epoch": 0.9, "grad_norm": 11.16296113559481, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.577580213546753, "logits/rejected": -2.5429909229278564, "logps/chosen": -248.5481719970703, "logps/rejected": -228.4681396484375, "loss": 0.2851, "rewards/accuracies": 0.53125, "rewards/chosen": 30.489971160888672, "rewards/margins": 1.1781085729599, "rewards/rejected": 29.311859130859375, "step": 430 }, { "epoch": 0.92, "grad_norm": 10.453636294498436, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.654780864715576, "logits/rejected": -2.619481086730957, "logps/chosen": -251.1508026123047, "logps/rejected": -240.0060272216797, "loss": 0.2805, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 33.18633270263672, "rewards/margins": 4.080627918243408, "rewards/rejected": 29.1057071685791, "step": 440 }, { "epoch": 0.94, "grad_norm": 10.779162534358996, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.598240375518799, "logits/rejected": -2.6028037071228027, "logps/chosen": -259.9753112792969, "logps/rejected": -276.95166015625, "loss": 0.2836, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 33.70884323120117, "rewards/margins": 3.5860488414764404, "rewards/rejected": 30.122793197631836, "step": 450 }, { "epoch": 0.96, "grad_norm": 12.07874608208951, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.6384501457214355, "logits/rejected": -2.618943452835083, "logps/chosen": -240.47885131835938, "logps/rejected": -213.6422882080078, "loss": 0.2815, "rewards/accuracies": 0.5625, "rewards/chosen": 31.485698699951172, "rewards/margins": 2.44018292427063, "rewards/rejected": 29.045513153076172, "step": 460 }, { "epoch": 0.98, "grad_norm": 11.390948919388384, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.6327641010284424, "logits/rejected": -2.6079437732696533, "logps/chosen": -245.8006591796875, "logps/rejected": -253.76730346679688, "loss": 0.2778, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 32.898033142089844, "rewards/margins": 4.314266204833984, "rewards/rejected": 28.58376121520996, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.31381568898715734, "train_runtime": 7749.4814, "train_samples_per_second": 7.889, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }