{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 395, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 41.54115560671131, "learning_rate": 1.25e-08, "logits/chosen": -4.306375503540039, "logits/rejected": -4.599514007568359, "logps/chosen": -381.2711181640625, "logps/rejected": -391.8406982421875, "loss": 0.6929, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 38.89233173260305, "learning_rate": 1.25e-07, "logits/chosen": -4.333991050720215, "logits/rejected": -4.4896559715271, "logps/chosen": -376.6128845214844, "logps/rejected": -396.8119201660156, "loss": 0.6925, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.0025132838636636734, "rewards/margins": 0.0018265678081661463, "rewards/rejected": -0.004339851904660463, "step": 10 }, { "epoch": 0.05, "grad_norm": 38.139879750053254, "learning_rate": 2.5e-07, "logits/chosen": -4.47939920425415, "logits/rejected": -4.573966979980469, "logps/chosen": -391.5196838378906, "logps/rejected": -438.828857421875, "loss": 0.6674, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09850569069385529, "rewards/margins": 0.05668836832046509, "rewards/rejected": -0.15519407391548157, "step": 20 }, { "epoch": 0.08, "grad_norm": 47.79366128002695, "learning_rate": 3.75e-07, "logits/chosen": -4.530553340911865, "logits/rejected": -4.708470344543457, "logps/chosen": -464.93084716796875, "logps/rejected": -509.82861328125, "loss": 0.6261, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.5474977493286133, "rewards/margins": 0.25755801796913147, "rewards/rejected": -0.8050557374954224, "step": 30 }, { "epoch": 0.1, "grad_norm": 47.01153640565049, "learning_rate": 5e-07, "logits/chosen": -4.857049942016602, "logits/rejected": -5.048783779144287, "logps/chosen": -437.2730407714844, "logps/rejected": -497.80078125, "loss": 0.5883, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.6910537481307983, "rewards/margins": 0.3994576036930084, "rewards/rejected": -1.0905113220214844, "step": 40 }, { "epoch": 0.13, "grad_norm": 50.785580057659836, "learning_rate": 4.990217055187362e-07, "logits/chosen": -4.775557518005371, "logits/rejected": -4.96406364440918, "logps/chosen": -425.1912536621094, "logps/rejected": -523.9107666015625, "loss": 0.5553, "rewards/accuracies": 0.71875, "rewards/chosen": -0.48143666982650757, "rewards/margins": 0.5418455004692078, "rewards/rejected": -1.0232822895050049, "step": 50 }, { "epoch": 0.15, "grad_norm": 47.58443781578147, "learning_rate": 4.960944785556813e-07, "logits/chosen": -4.912293434143066, "logits/rejected": -5.1732072830200195, "logps/chosen": -461.0411682128906, "logps/rejected": -549.9326171875, "loss": 0.5619, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.7086145281791687, "rewards/margins": 0.5085697174072266, "rewards/rejected": -1.21718430519104, "step": 60 }, { "epoch": 0.18, "grad_norm": 50.17116749835004, "learning_rate": 4.912412286307025e-07, "logits/chosen": -5.128066062927246, "logits/rejected": -5.452770709991455, "logps/chosen": -429.89202880859375, "logps/rejected": -530.6647338867188, "loss": 0.5226, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5659561157226562, "rewards/margins": 0.6657453775405884, "rewards/rejected": -1.2317016124725342, "step": 70 }, { "epoch": 0.2, "grad_norm": 56.54681066075575, "learning_rate": 4.844999390047419e-07, "logits/chosen": -5.285617828369141, "logits/rejected": -5.636483192443848, "logps/chosen": -460.7303161621094, "logps/rejected": -563.2789916992188, "loss": 0.5138, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7176406383514404, "rewards/margins": 0.732753574848175, "rewards/rejected": -1.4503942728042603, "step": 80 }, { "epoch": 0.23, "grad_norm": 50.83178244078908, "learning_rate": 4.7592336940930097e-07, "logits/chosen": -5.608884811401367, "logits/rejected": -5.990847587585449, "logps/chosen": -460.23272705078125, "logps/rejected": -559.8355712890625, "loss": 0.5099, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.593207061290741, "rewards/margins": 0.8474555015563965, "rewards/rejected": -1.4406626224517822, "step": 90 }, { "epoch": 0.25, "grad_norm": 59.601370802822075, "learning_rate": 4.655786431300069e-07, "logits/chosen": -5.9478230476379395, "logits/rejected": -6.265199184417725, "logps/chosen": -431.99151611328125, "logps/rejected": -531.1004638671875, "loss": 0.5186, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.6661251783370972, "rewards/margins": 0.7114373445510864, "rewards/rejected": -1.3775627613067627, "step": 100 }, { "epoch": 0.28, "grad_norm": 54.889294284535225, "learning_rate": 4.535467216758936e-07, "logits/chosen": -5.984147548675537, "logits/rejected": -6.4844231605529785, "logps/chosen": -461.1131286621094, "logps/rejected": -544.032958984375, "loss": 0.5057, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.6606206893920898, "rewards/margins": 0.6746741533279419, "rewards/rejected": -1.3352949619293213, "step": 110 }, { "epoch": 0.3, "grad_norm": 61.24432368201881, "learning_rate": 4.3992177114582117e-07, "logits/chosen": -6.2256269454956055, "logits/rejected": -6.654993534088135, "logps/chosen": -477.16326904296875, "logps/rejected": -610.1165771484375, "loss": 0.4971, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.8679486513137817, "rewards/margins": 0.9162583351135254, "rewards/rejected": -1.7842069864273071, "step": 120 }, { "epoch": 0.33, "grad_norm": 73.6030496927752, "learning_rate": 4.248104252510785e-07, "logits/chosen": -5.852092266082764, "logits/rejected": -6.4555840492248535, "logps/chosen": -436.20867919921875, "logps/rejected": -535.7562866210938, "loss": 0.5056, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6110241413116455, "rewards/margins": 0.7542751431465149, "rewards/rejected": -1.3652993440628052, "step": 130 }, { "epoch": 0.35, "grad_norm": 47.56900195447933, "learning_rate": 4.0833095076201176e-07, "logits/chosen": -5.862217903137207, "logits/rejected": -6.282025337219238, "logps/chosen": -448.24188232421875, "logps/rejected": -539.3355712890625, "loss": 0.5071, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7590019702911377, "rewards/margins": 0.7245356440544128, "rewards/rejected": -1.4835376739501953, "step": 140 }, { "epoch": 0.38, "grad_norm": 61.35802392759384, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -5.939135551452637, "logits/rejected": -6.367193698883057, "logps/chosen": -452.0302734375, "logps/rejected": -549.789306640625, "loss": 0.5017, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.6960467100143433, "rewards/margins": 0.7666617631912231, "rewards/rejected": -1.4627084732055664, "step": 150 }, { "epoch": 0.41, "grad_norm": 47.33126253925995, "learning_rate": 3.717932109901991e-07, "logits/chosen": -6.0608344078063965, "logits/rejected": -6.6918182373046875, "logps/chosen": -481.3443298339844, "logps/rejected": -580.4495849609375, "loss": 0.4842, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8736475706100464, "rewards/margins": 0.889535129070282, "rewards/rejected": -1.7631828784942627, "step": 160 }, { "epoch": 0.43, "grad_norm": 55.24382676161828, "learning_rate": 3.520209030608662e-07, "logits/chosen": -5.88026237487793, "logits/rejected": -6.288437843322754, "logps/chosen": -452.7632751464844, "logps/rejected": -572.0946655273438, "loss": 0.4902, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6406772136688232, "rewards/margins": 0.8104515075683594, "rewards/rejected": -1.4511287212371826, "step": 170 }, { "epoch": 0.46, "grad_norm": 55.62294024430127, "learning_rate": 3.314501432400294e-07, "logits/chosen": -6.004621505737305, "logits/rejected": -6.520898342132568, "logps/chosen": -464.07867431640625, "logps/rejected": -585.2781982421875, "loss": 0.4718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7512537240982056, "rewards/margins": 0.9464238286018372, "rewards/rejected": -1.6976773738861084, "step": 180 }, { "epoch": 0.48, "grad_norm": 52.01142149507834, "learning_rate": 3.1024192561415357e-07, "logits/chosen": -6.592843532562256, "logits/rejected": -6.868170738220215, "logps/chosen": -469.8837890625, "logps/rejected": -633.4119873046875, "loss": 0.4847, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.087369680404663, "rewards/margins": 1.0047850608825684, "rewards/rejected": -2.0921549797058105, "step": 190 }, { "epoch": 0.51, "grad_norm": 59.64754211046823, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -6.358391761779785, "logits/rejected": -6.768553733825684, "logps/chosen": -484.43780517578125, "logps/rejected": -597.5379638671875, "loss": 0.485, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.9880083799362183, "rewards/margins": 0.7920882105827332, "rewards/rejected": -1.7800966501235962, "step": 200 }, { "epoch": 0.53, "grad_norm": 55.126905350837, "learning_rate": 2.66580739108776e-07, "logits/chosen": -6.6527838706970215, "logits/rejected": -7.2258100509643555, "logps/chosen": -467.697509765625, "logps/rejected": -590.52734375, "loss": 0.4806, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8441788554191589, "rewards/margins": 0.9318108558654785, "rewards/rejected": -1.7759897708892822, "step": 210 }, { "epoch": 0.56, "grad_norm": 52.277945286098316, "learning_rate": 2.444694782117033e-07, "logits/chosen": -6.5264458656311035, "logits/rejected": -7.032387733459473, "logps/chosen": -461.33795166015625, "logps/rejected": -560.9654541015625, "loss": 0.4717, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9239259958267212, "rewards/margins": 0.8057114481925964, "rewards/rejected": -1.7296375036239624, "step": 220 }, { "epoch": 0.58, "grad_norm": 56.70702029670774, "learning_rate": 2.2240150114618259e-07, "logits/chosen": -6.4634904861450195, "logits/rejected": -6.930532932281494, "logps/chosen": -506.49505615234375, "logps/rejected": -632.0296020507812, "loss": 0.4759, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.0257264375686646, "rewards/margins": 1.0186141729354858, "rewards/rejected": -2.0443403720855713, "step": 230 }, { "epoch": 0.61, "grad_norm": 50.19002044845227, "learning_rate": 2.0054951975362065e-07, "logits/chosen": -6.56687068939209, "logits/rejected": -7.020349979400635, "logps/chosen": -470.70648193359375, "logps/rejected": -628.5145263671875, "loss": 0.4777, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.882061779499054, "rewards/margins": 1.0980554819107056, "rewards/rejected": -1.9801172018051147, "step": 240 }, { "epoch": 0.63, "grad_norm": 62.96869333763073, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -6.792383670806885, "logits/rejected": -7.117269039154053, "logps/chosen": -466.4208984375, "logps/rejected": -605.7127075195312, "loss": 0.4758, "rewards/accuracies": 0.840624988079071, "rewards/chosen": -0.8703948855400085, "rewards/margins": 1.0195105075836182, "rewards/rejected": -1.889905571937561, "step": 250 }, { "epoch": 0.66, "grad_norm": 62.510903657866734, "learning_rate": 1.5817460058381084e-07, "logits/chosen": -6.452023506164551, "logits/rejected": -6.968575477600098, "logps/chosen": -495.3380432128906, "logps/rejected": -609.4599609375, "loss": 0.4864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9492015838623047, "rewards/margins": 0.9177757501602173, "rewards/rejected": -1.866977334022522, "step": 260 }, { "epoch": 0.68, "grad_norm": 62.01447255874997, "learning_rate": 1.3798330400310537e-07, "logits/chosen": -6.2711181640625, "logits/rejected": -6.893205165863037, "logps/chosen": -465.7061462402344, "logps/rejected": -592.8031005859375, "loss": 0.4625, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.7948521375656128, "rewards/margins": 1.056058645248413, "rewards/rejected": -1.8509107828140259, "step": 270 }, { "epoch": 0.71, "grad_norm": 62.84305172490392, "learning_rate": 1.1866868994642534e-07, "logits/chosen": -6.332844257354736, "logits/rejected": -6.893272399902344, "logps/chosen": -478.971435546875, "logps/rejected": -597.5892333984375, "loss": 0.4598, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8677324056625366, "rewards/margins": 0.9433156251907349, "rewards/rejected": -1.811047911643982, "step": 280 }, { "epoch": 0.73, "grad_norm": 69.95001635354407, "learning_rate": 1.0038192145648567e-07, "logits/chosen": -6.384323596954346, "logits/rejected": -6.853055477142334, "logps/chosen": -527.7755126953125, "logps/rejected": -647.9898681640625, "loss": 0.4736, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.156057357788086, "rewards/margins": 1.0375856161117554, "rewards/rejected": -2.193643093109131, "step": 290 }, { "epoch": 0.76, "grad_norm": 74.81540906001793, "learning_rate": 8.32661172908373e-08, "logits/chosen": -6.537497043609619, "logits/rejected": -6.883517265319824, "logps/chosen": -472.21649169921875, "logps/rejected": -605.0711059570312, "loss": 0.4672, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9630511999130249, "rewards/margins": 0.9703599810600281, "rewards/rejected": -1.9334112405776978, "step": 300 }, { "epoch": 0.78, "grad_norm": 60.19828111344577, "learning_rate": 6.745523182354146e-08, "logits/chosen": -6.695423126220703, "logits/rejected": -7.178382873535156, "logps/chosen": -465.47760009765625, "logps/rejected": -609.17919921875, "loss": 0.4645, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0548077821731567, "rewards/margins": 0.9832090139389038, "rewards/rejected": -2.0380167961120605, "step": 310 }, { "epoch": 0.81, "grad_norm": 62.819619859552624, "learning_rate": 5.307300667057049e-08, "logits/chosen": -6.586479187011719, "logits/rejected": -7.047415733337402, "logps/chosen": -449.232421875, "logps/rejected": -559.2984008789062, "loss": 0.4502, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -0.937958836555481, "rewards/margins": 0.9285039901733398, "rewards/rejected": -1.8664629459381104, "step": 320 }, { "epoch": 0.84, "grad_norm": 54.74190137372815, "learning_rate": 4.023200224388787e-08, "logits/chosen": -6.525613307952881, "logits/rejected": -7.149096488952637, "logps/chosen": -492.0001525878906, "logps/rejected": -622.4295043945312, "loss": 0.4583, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.126334309577942, "rewards/margins": 1.0052974224090576, "rewards/rejected": -2.131631851196289, "step": 330 }, { "epoch": 0.86, "grad_norm": 65.41299711527839, "learning_rate": 2.903271681360972e-08, "logits/chosen": -6.488680839538574, "logits/rejected": -7.005003452301025, "logps/chosen": -486.0816955566406, "logps/rejected": -597.4981689453125, "loss": 0.4664, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -1.010422945022583, "rewards/margins": 1.0139751434326172, "rewards/rejected": -2.0243980884552, "step": 340 }, { "epoch": 0.89, "grad_norm": 67.03908350203817, "learning_rate": 1.956279997278043e-08, "logits/chosen": -6.695385932922363, "logits/rejected": -7.170092582702637, "logps/chosen": -490.95477294921875, "logps/rejected": -671.6006469726562, "loss": 0.4609, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0624668598175049, "rewards/margins": 1.2378281354904175, "rewards/rejected": -2.300295352935791, "step": 350 }, { "epoch": 0.91, "grad_norm": 62.41118335177348, "learning_rate": 1.1896366660467171e-08, "logits/chosen": -6.916273593902588, "logits/rejected": -7.29934549331665, "logps/chosen": -442.0613708496094, "logps/rejected": -587.3243408203125, "loss": 0.4614, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -1.0646042823791504, "rewards/margins": 0.939795196056366, "rewards/rejected": -2.004399538040161, "step": 360 }, { "epoch": 0.94, "grad_norm": 64.20669715502403, "learning_rate": 6.093417111873306e-09, "logits/chosen": -6.631227016448975, "logits/rejected": -7.061822414398193, "logps/chosen": -475.503173828125, "logps/rejected": -603.5335693359375, "loss": 0.4628, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0652824640274048, "rewards/margins": 0.9914292097091675, "rewards/rejected": -2.0567116737365723, "step": 370 }, { "epoch": 0.96, "grad_norm": 56.2255930339814, "learning_rate": 2.1993672751463576e-09, "logits/chosen": -6.618802070617676, "logits/rejected": -7.291499137878418, "logps/chosen": -474.84222412109375, "logps/rejected": -613.4112548828125, "loss": 0.4616, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9348451495170593, "rewards/margins": 1.0984827280044556, "rewards/rejected": -2.03332781791687, "step": 380 }, { "epoch": 0.99, "grad_norm": 54.17486776663219, "learning_rate": 2.4469337000659897e-10, "logits/chosen": -6.589730739593506, "logits/rejected": -7.092536926269531, "logps/chosen": -502.3226623535156, "logps/rejected": -631.1239013671875, "loss": 0.4561, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -1.0657384395599365, "rewards/margins": 1.0461207628250122, "rewards/rejected": -2.111859083175659, "step": 390 }, { "epoch": 1.0, "step": 395, "total_flos": 0.0, "train_loss": 0.5006634004508392, "train_runtime": 11862.2044, "train_samples_per_second": 8.521, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 395, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }