{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 13421.071613225095, "learning_rate": 2.702702702702703e-10, "logits/chosen": -1.3332719802856445, "logits/rejected": -1.246394395828247, "logps/chosen": -286.9539794921875, "logps/rejected": -263.3782958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 20347.808666602745, "learning_rate": 2.702702702702703e-09, "logits/chosen": -1.6190177202224731, "logits/rejected": -1.3982949256896973, "logps/chosen": -342.49090576171875, "logps/rejected": -294.5392761230469, "loss": 2.904, "rewards/accuracies": 0.4548611044883728, "rewards/chosen": 0.43856528401374817, "rewards/margins": 0.4652646481990814, "rewards/rejected": -0.026699384674429893, "step": 10 }, { "epoch": 0.11, "grad_norm": 14961.634137592142, "learning_rate": 5.405405405405406e-09, "logits/chosen": -1.4918005466461182, "logits/rejected": -1.3144338130950928, "logps/chosen": -314.73779296875, "logps/rejected": -279.31842041015625, "loss": 2.9253, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.2821510434150696, "rewards/margins": 0.4083018898963928, "rewards/rejected": -0.12615084648132324, "step": 20 }, { "epoch": 0.16, "grad_norm": 18492.420742404967, "learning_rate": 8.108108108108109e-09, "logits/chosen": -1.5456618070602417, "logits/rejected": -1.377851128578186, "logps/chosen": -324.8970642089844, "logps/rejected": -286.29644775390625, "loss": 3.069, "rewards/accuracies": 0.484375, "rewards/chosen": 0.0898141860961914, "rewards/margins": 0.26539188623428345, "rewards/rejected": -0.17557768523693085, "step": 30 }, { "epoch": 0.22, "grad_norm": 16172.032582850023, "learning_rate": 9.997973265157192e-09, "logits/chosen": -1.5278599262237549, "logits/rejected": -1.348915934562683, "logps/chosen": -325.442626953125, "logps/rejected": -285.65606689453125, "loss": 3.1235, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.4925743043422699, "rewards/margins": -0.42746010422706604, "rewards/rejected": -0.06511423736810684, "step": 40 }, { "epoch": 0.27, "grad_norm": 16865.011165666692, "learning_rate": 9.961988113473708e-09, "logits/chosen": -1.5399911403656006, "logits/rejected": -1.3937371969223022, "logps/chosen": -337.0230712890625, "logps/rejected": -297.31866455078125, "loss": 2.8389, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.021488964557647705, "rewards/margins": 0.0530308373272419, "rewards/rejected": -0.0745197981595993, "step": 50 }, { "epoch": 0.33, "grad_norm": 14690.810465030829, "learning_rate": 9.881337335184878e-09, "logits/chosen": -1.58315908908844, "logits/rejected": -1.4344873428344727, "logps/chosen": -319.7967529296875, "logps/rejected": -285.00897216796875, "loss": 2.6638, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.46145668625831604, "rewards/margins": 1.7262779474258423, "rewards/rejected": -1.2648210525512695, "step": 60 }, { "epoch": 0.38, "grad_norm": 19235.210685128586, "learning_rate": 9.756746912994832e-09, "logits/chosen": -1.5140897035598755, "logits/rejected": -1.353376865386963, "logps/chosen": -312.042236328125, "logps/rejected": -275.03875732421875, "loss": 2.6382, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6530985236167908, "rewards/margins": 1.2754881381988525, "rewards/rejected": -0.6223896741867065, "step": 70 }, { "epoch": 0.43, "grad_norm": 14005.352374602855, "learning_rate": 9.589338354885628e-09, "logits/chosen": -1.5981298685073853, "logits/rejected": -1.4447776079177856, "logps/chosen": -323.3012390136719, "logps/rejected": -288.0871887207031, "loss": 2.5625, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8214060664176941, "rewards/margins": 2.007897138595581, "rewards/rejected": -1.186490774154663, "step": 80 }, { "epoch": 0.49, "grad_norm": 13300.546208456732, "learning_rate": 9.380618598797472e-09, "logits/chosen": -1.614319086074829, "logits/rejected": -1.418944239616394, "logps/chosen": -319.93804931640625, "logps/rejected": -281.74871826171875, "loss": 2.4466, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 1.4227933883666992, "rewards/margins": 2.7808287143707275, "rewards/rejected": -1.3580353260040283, "step": 90 }, { "epoch": 0.54, "grad_norm": 14640.023348603876, "learning_rate": 9.132466447838596e-09, "logits/chosen": -1.5410282611846924, "logits/rejected": -1.3655275106430054, "logps/chosen": -321.95489501953125, "logps/rejected": -282.6141662597656, "loss": 2.5501, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 1.5853115320205688, "rewards/margins": 2.8258020877838135, "rewards/rejected": -1.240490198135376, "step": 100 }, { "epoch": 0.6, "grad_norm": 13236.809764267844, "learning_rate": 8.847115658129039e-09, "logits/chosen": -1.505202293395996, "logits/rejected": -1.3764159679412842, "logps/chosen": -318.1921081542969, "logps/rejected": -287.1940002441406, "loss": 2.1932, "rewards/accuracies": 0.609375, "rewards/chosen": 1.5191149711608887, "rewards/margins": 2.8392601013183594, "rewards/rejected": -1.3201450109481812, "step": 110 }, { "epoch": 0.65, "grad_norm": 14131.088229346033, "learning_rate": 8.527134831514116e-09, "logits/chosen": -1.5827970504760742, "logits/rejected": -1.4286653995513916, "logps/chosen": -331.36956787109375, "logps/rejected": -297.8145446777344, "loss": 2.3092, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 2.5670523643493652, "rewards/margins": 2.359330654144287, "rewards/rejected": 0.20772182941436768, "step": 120 }, { "epoch": 0.71, "grad_norm": 12432.379320236165, "learning_rate": 8.175404294144481e-09, "logits/chosen": -1.6129405498504639, "logits/rejected": -1.4252656698226929, "logps/chosen": -317.1763610839844, "logps/rejected": -271.4945068359375, "loss": 2.1552, "rewards/accuracies": 0.671875, "rewards/chosen": 3.3237526416778564, "rewards/margins": 3.513510227203369, "rewards/rejected": -0.1897575855255127, "step": 130 }, { "epoch": 0.76, "grad_norm": 14125.607348595095, "learning_rate": 7.79509016905158e-09, "logits/chosen": -1.5669504404067993, "logits/rejected": -1.4181562662124634, "logps/chosen": -331.1703186035156, "logps/rejected": -294.15667724609375, "loss": 2.238, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 4.217373847961426, "rewards/margins": 3.5030651092529297, "rewards/rejected": 0.7143087387084961, "step": 140 }, { "epoch": 0.82, "grad_norm": 13250.148461496135, "learning_rate": 7.389615876105773e-09, "logits/chosen": -1.5488533973693848, "logits/rejected": -1.4203182458877563, "logps/chosen": -314.6199645996094, "logps/rejected": -291.7601623535156, "loss": 2.2017, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 4.456366539001465, "rewards/margins": 3.679431200027466, "rewards/rejected": 0.7769355773925781, "step": 150 }, { "epoch": 0.87, "grad_norm": 12949.50039152227, "learning_rate": 6.962631315901861e-09, "logits/chosen": -1.5129776000976562, "logits/rejected": -1.396781325340271, "logps/chosen": -318.1156005859375, "logps/rejected": -291.0067138671875, "loss": 2.2242, "rewards/accuracies": 0.578125, "rewards/chosen": 4.011959075927734, "rewards/margins": 2.890659809112549, "rewards/rejected": 1.1212995052337646, "step": 160 }, { "epoch": 0.92, "grad_norm": 12255.51409729144, "learning_rate": 6.517980014965139e-09, "logits/chosen": -1.5933173894882202, "logits/rejected": -1.4045393466949463, "logps/chosen": -331.5554504394531, "logps/rejected": -289.4686584472656, "loss": 2.1595, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 4.3387274742126465, "rewards/margins": 4.077757358551025, "rewards/rejected": 0.2609703540802002, "step": 170 }, { "epoch": 0.98, "grad_norm": 11568.19856890153, "learning_rate": 6.059664528022266e-09, "logits/chosen": -1.5902655124664307, "logits/rejected": -1.4390876293182373, "logps/chosen": -315.20379638671875, "logps/rejected": -276.70684814453125, "loss": 2.1416, "rewards/accuracies": 0.6875, "rewards/chosen": 4.6242899894714355, "rewards/margins": 4.689143180847168, "rewards/rejected": -0.06485319137573242, "step": 180 }, { "epoch": 1.03, "grad_norm": 11270.506557264835, "learning_rate": 5.591810408770492e-09, "logits/chosen": -1.553971529006958, "logits/rejected": -1.380243182182312, "logps/chosen": -315.759033203125, "logps/rejected": -278.6817932128906, "loss": 1.8829, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 4.153166770935059, "rewards/margins": 4.272480010986328, "rewards/rejected": -0.11931288242340088, "step": 190 }, { "epoch": 1.09, "grad_norm": 13205.342425147532, "learning_rate": 5.118629073464423e-09, "logits/chosen": -1.5635509490966797, "logits/rejected": -1.3522040843963623, "logps/chosen": -326.10589599609375, "logps/rejected": -282.65509033203125, "loss": 2.0122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.118946075439453, "rewards/margins": 4.013725757598877, "rewards/rejected": 1.105220079421997, "step": 200 }, { "epoch": 1.14, "grad_norm": 12686.21644324199, "learning_rate": 4.644379891605983e-09, "logits/chosen": -1.6106517314910889, "logits/rejected": -1.4335637092590332, "logps/chosen": -324.8961486816406, "logps/rejected": -291.36163330078125, "loss": 1.9558, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 4.859737873077393, "rewards/margins": 4.917017459869385, "rewards/rejected": -0.05727982521057129, "step": 210 }, { "epoch": 1.2, "grad_norm": 12142.898576557665, "learning_rate": 4.173331844980362e-09, "logits/chosen": -1.5359071493148804, "logits/rejected": -1.4112781286239624, "logps/chosen": -324.1145324707031, "logps/rejected": -293.42950439453125, "loss": 1.9354, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 4.830060005187988, "rewards/margins": 4.525102138519287, "rewards/rejected": 0.3049588203430176, "step": 220 }, { "epoch": 1.25, "grad_norm": 12759.193686594554, "learning_rate": 3.7097251001664824e-09, "logits/chosen": -1.5307817459106445, "logits/rejected": -1.3716728687286377, "logps/chosen": -324.10162353515625, "logps/rejected": -286.9132995605469, "loss": 1.856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 5.137416362762451, "rewards/margins": 4.384617805480957, "rewards/rejected": 0.7527987360954285, "step": 230 }, { "epoch": 1.3, "grad_norm": 13355.016786310944, "learning_rate": 3.2577328404292057e-09, "logits/chosen": -1.5365327596664429, "logits/rejected": -1.4058964252471924, "logps/chosen": -312.64398193359375, "logps/rejected": -285.9228515625, "loss": 1.8652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.964874267578125, "rewards/margins": 4.990009307861328, "rewards/rejected": 0.9748651385307312, "step": 240 }, { "epoch": 1.36, "grad_norm": 13138.491204289365, "learning_rate": 2.821423700565763e-09, "logits/chosen": -1.5989090204238892, "logits/rejected": -1.4208343029022217, "logps/chosen": -350.8832092285156, "logps/rejected": -306.54180908203125, "loss": 1.7277, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 6.4689040184021, "rewards/margins": 5.668680191040039, "rewards/rejected": 0.8002230525016785, "step": 250 }, { "epoch": 1.41, "grad_norm": 13320.645189206805, "learning_rate": 2.4047251428513483e-09, "logits/chosen": -1.6121330261230469, "logits/rejected": -1.4577114582061768, "logps/chosen": -325.4482421875, "logps/rejected": -291.07183837890625, "loss": 1.9519, "rewards/accuracies": 0.734375, "rewards/chosen": 6.1689534187316895, "rewards/margins": 5.415326118469238, "rewards/rejected": 0.7536273002624512, "step": 260 }, { "epoch": 1.47, "grad_norm": 10367.972989627691, "learning_rate": 2.011388103757442e-09, "logits/chosen": -1.5253089666366577, "logits/rejected": -1.381744146347046, "logps/chosen": -316.5883483886719, "logps/rejected": -285.75799560546875, "loss": 1.8335, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 6.201399803161621, "rewards/margins": 5.151947021484375, "rewards/rejected": 1.0494521856307983, "step": 270 }, { "epoch": 1.52, "grad_norm": 11861.49317635369, "learning_rate": 1.644953229677474e-09, "logits/chosen": -1.5998437404632568, "logits/rejected": -1.4171390533447266, "logps/chosen": -326.30133056640625, "logps/rejected": -284.77996826171875, "loss": 1.8816, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 6.093815803527832, "rewards/margins": 5.462450981140137, "rewards/rejected": 0.6313648223876953, "step": 280 }, { "epoch": 1.58, "grad_norm": 12967.35903281612, "learning_rate": 1.308719005590957e-09, "logits/chosen": -1.5077544450759888, "logits/rejected": -1.392764687538147, "logps/chosen": -318.66082763671875, "logps/rejected": -282.51806640625, "loss": 1.8195, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 5.925144195556641, "rewards/margins": 5.62863826751709, "rewards/rejected": 0.29650676250457764, "step": 290 }, { "epoch": 1.63, "grad_norm": 10720.865204139644, "learning_rate": 1.005712063557776e-09, "logits/chosen": -1.632625937461853, "logits/rejected": -1.4543178081512451, "logps/chosen": -324.3389587402344, "logps/rejected": -290.5496520996094, "loss": 1.7503, "rewards/accuracies": 0.671875, "rewards/chosen": 5.449400901794434, "rewards/margins": 4.368684768676758, "rewards/rejected": 1.0807160139083862, "step": 300 }, { "epoch": 1.68, "grad_norm": 10876.995374840744, "learning_rate": 7.386599383124321e-10, "logits/chosen": -1.5634690523147583, "logits/rejected": -1.381116509437561, "logps/chosen": -322.0839538574219, "logps/rejected": -285.80072021484375, "loss": 1.9509, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 5.297797203063965, "rewards/margins": 4.9336748123168945, "rewards/rejected": 0.36412325501441956, "step": 310 }, { "epoch": 1.74, "grad_norm": 11694.435027751537, "learning_rate": 5.099665152003929e-10, "logits/chosen": -1.5935719013214111, "logits/rejected": -1.3827770948410034, "logps/chosen": -334.022705078125, "logps/rejected": -289.9593505859375, "loss": 1.8646, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 5.919422149658203, "rewards/margins": 5.926436424255371, "rewards/rejected": -0.00701451301574707, "step": 320 }, { "epoch": 1.79, "grad_norm": 13425.084683268695, "learning_rate": 3.216903914633745e-10, "logits/chosen": -1.5508067607879639, "logits/rejected": -1.4279029369354248, "logps/chosen": -325.4474792480469, "logps/rejected": -296.10894775390625, "loss": 1.9418, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 5.799554347991943, "rewards/margins": 5.1590423583984375, "rewards/rejected": 0.6405118703842163, "step": 330 }, { "epoch": 1.85, "grad_norm": 12502.360923512058, "learning_rate": 1.7552634565570324e-10, "logits/chosen": -1.5481187105178833, "logits/rejected": -1.379558801651001, "logps/chosen": -330.1790466308594, "logps/rejected": -292.8612365722656, "loss": 1.8007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 6.542909145355225, "rewards/margins": 5.726317405700684, "rewards/rejected": 0.8165918588638306, "step": 340 }, { "epoch": 1.9, "grad_norm": 12479.355591476906, "learning_rate": 7.279008199590543e-11, "logits/chosen": -1.5400382280349731, "logits/rejected": -1.3772073984146118, "logps/chosen": -326.68316650390625, "logps/rejected": -291.93536376953125, "loss": 1.8342, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 6.312959671020508, "rewards/margins": 5.429690361022949, "rewards/rejected": 0.8832691311836243, "step": 350 }, { "epoch": 1.96, "grad_norm": 12177.012050569681, "learning_rate": 1.4406386978128017e-11, "logits/chosen": -1.6199939250946045, "logits/rejected": -1.4238550662994385, "logps/chosen": -331.38580322265625, "logps/rejected": -291.71551513671875, "loss": 1.8517, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 6.9059858322143555, "rewards/margins": 6.215287208557129, "rewards/rejected": 0.6906987428665161, "step": 360 }, { "epoch": 2.0, "step": 368, "total_flos": 0.0, "train_loss": 2.18696221968402, "train_runtime": 9935.8575, "train_samples_per_second": 9.48, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }