{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6710.683257043726, "learning_rate": 2.702702702702703e-10, "logits/chosen": -1.3332719802856445, "logits/rejected": -1.246394395828247, "logps/chosen": -286.9539794921875, "logps/rejected": -263.3782958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 8627.07198024136, "learning_rate": 2.702702702702703e-09, "logits/chosen": -1.6131304502487183, "logits/rejected": -1.3906824588775635, "logps/chosen": -342.4801025390625, "logps/rejected": -294.5570068359375, "loss": 1.5693, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.2733865976333618, "rewards/margins": 0.37529557943344116, "rewards/rejected": -0.10190902650356293, "step": 10 }, { "epoch": 0.11, "grad_norm": 7866.070005970393, "learning_rate": 5.405405405405406e-09, "logits/chosen": -1.4882968664169312, "logits/rejected": -1.310754418373108, "logps/chosen": -314.77069091796875, "logps/rejected": -279.344970703125, "loss": 1.5992, "rewards/accuracies": 0.53125, "rewards/chosen": -0.023275405168533325, "rewards/margins": 0.17254583537578583, "rewards/rejected": -0.19582125544548035, "step": 20 }, { "epoch": 0.16, "grad_norm": 9136.009378303972, "learning_rate": 8.108108108108109e-09, "logits/chosen": -1.5432701110839844, "logits/rejected": -1.3749229907989502, "logps/chosen": -324.8939208984375, "logps/rejected": -286.3031005859375, "loss": 1.7105, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.060706138610839844, "rewards/margins": 0.18149089813232422, "rewards/rejected": -0.12078475952148438, "step": 30 }, { "epoch": 0.22, "grad_norm": 8253.74733424752, "learning_rate": 9.997973265157192e-09, "logits/chosen": -1.530524730682373, "logits/rejected": -1.3523082733154297, "logps/chosen": -325.4083557128906, "logps/rejected": -285.64794921875, "loss": 1.688, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.07491829246282578, "rewards/margins": -0.08290421962738037, "rewards/rejected": 0.007985919713973999, "step": 40 }, { "epoch": 0.27, "grad_norm": 7989.013903323287, "learning_rate": 9.961988113473708e-09, "logits/chosen": -1.537957787513733, "logits/rejected": -1.390915870666504, "logps/chosen": -336.9923400878906, "logps/rejected": -297.2927551269531, "loss": 1.5308, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.14283373951911926, "rewards/margins": 0.05056797340512276, "rewards/rejected": 0.09226574748754501, "step": 50 }, { "epoch": 0.33, "grad_norm": 7487.424543983984, "learning_rate": 9.881337335184878e-09, "logits/chosen": -1.5855934619903564, "logits/rejected": -1.4371620416641235, "logps/chosen": -319.7807922363281, "logps/rejected": -284.994140625, "loss": 1.4965, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.3105394244194031, "rewards/margins": 0.8686850666999817, "rewards/rejected": -0.5581457018852234, "step": 60 }, { "epoch": 0.38, "grad_norm": 8830.269579570355, "learning_rate": 9.756746912994832e-09, "logits/chosen": -1.5150383710861206, "logits/rejected": -1.354196310043335, "logps/chosen": -312.10052490234375, "logps/rejected": -275.0794982910156, "loss": 1.409, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 0.03486788272857666, "rewards/margins": 0.5497146844863892, "rewards/rejected": -0.5148467421531677, "step": 70 }, { "epoch": 0.43, "grad_norm": 7046.728190784739, "learning_rate": 9.589338354885628e-09, "logits/chosen": -1.593528389930725, "logits/rejected": -1.4396848678588867, "logps/chosen": -323.3490295410156, "logps/rejected": -288.1017150878906, "loss": 1.4073, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1715652495622635, "rewards/margins": 0.8374398946762085, "rewards/rejected": -0.6658747792243958, "step": 80 }, { "epoch": 0.49, "grad_norm": 6929.551893900645, "learning_rate": 9.380618598797472e-09, "logits/chosen": -1.6050440073013306, "logits/rejected": -1.4080655574798584, "logps/chosen": -319.95452880859375, "logps/rejected": -281.72833251953125, "loss": 1.3787, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.6286121606826782, "rewards/margins": 1.2058827877044678, "rewards/rejected": -0.5772705078125, "step": 90 }, { "epoch": 0.54, "grad_norm": 6275.615806226439, "learning_rate": 9.132466447838596e-09, "logits/chosen": -1.5423895120620728, "logits/rejected": -1.3670966625213623, "logps/chosen": -321.89300537109375, "logps/rejected": -282.581787109375, "loss": 1.2362, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1018999814987183, "rewards/margins": 1.560381531715393, "rewards/rejected": -0.45848163962364197, "step": 100 }, { "epoch": 0.6, "grad_norm": 6651.119283999509, "learning_rate": 8.847115658129039e-09, "logits/chosen": -1.512652039527893, "logits/rejected": -1.3851889371871948, "logps/chosen": -318.17474365234375, "logps/rejected": -287.15032958984375, "loss": 1.2039, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.8460339307785034, "rewards/margins": 1.2877717018127441, "rewards/rejected": -0.44173774123191833, "step": 110 }, { "epoch": 0.65, "grad_norm": 6722.274057952871, "learning_rate": 8.527134831514116e-09, "logits/chosen": -1.5804601907730103, "logits/rejected": -1.4257571697235107, "logps/chosen": -331.40728759765625, "logps/rejected": -297.8313903808594, "loss": 1.2367, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.094892978668213, "rewards/margins": 1.075182318687439, "rewards/rejected": 0.019710630178451538, "step": 120 }, { "epoch": 0.71, "grad_norm": 5891.724908819839, "learning_rate": 8.175404294144481e-09, "logits/chosen": -1.6209514141082764, "logits/rejected": -1.4345628023147583, "logps/chosen": -317.20465087890625, "logps/rejected": -271.5218811035156, "loss": 1.1341, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.5204100608825684, "rewards/margins": 1.7521209716796875, "rewards/rejected": -0.23171091079711914, "step": 130 }, { "epoch": 0.76, "grad_norm": 6773.700242649663, "learning_rate": 7.79509016905158e-09, "logits/chosen": -1.5730321407318115, "logits/rejected": -1.4252164363861084, "logps/chosen": -331.2398681640625, "logps/rejected": -294.24530029296875, "loss": 1.2492, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 1.7606897354125977, "rewards/margins": 1.846710443496704, "rewards/rejected": -0.08602098375558853, "step": 140 }, { "epoch": 0.82, "grad_norm": 6236.159184960732, "learning_rate": 7.389615876105773e-09, "logits/chosen": -1.551709532737732, "logits/rejected": -1.4236745834350586, "logps/chosen": -314.64178466796875, "logps/rejected": -291.77093505859375, "loss": 1.1633, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 2.1191415786743164, "rewards/margins": 1.7847158908843994, "rewards/rejected": 0.33442583680152893, "step": 150 }, { "epoch": 0.87, "grad_norm": 6133.347836072311, "learning_rate": 6.962631315901861e-09, "logits/chosen": -1.526906967163086, "logits/rejected": -1.411745548248291, "logps/chosen": -318.1049499511719, "logps/rejected": -290.96685791015625, "loss": 1.1827, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.0592172145843506, "rewards/margins": 1.2991172075271606, "rewards/rejected": 0.7601001262664795, "step": 160 }, { "epoch": 0.92, "grad_norm": 5643.626720023826, "learning_rate": 6.517980014965139e-09, "logits/chosen": -1.599163293838501, "logits/rejected": -1.4116249084472656, "logps/chosen": -331.5318603515625, "logps/rejected": -289.4364929199219, "loss": 1.1298, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.287135601043701, "rewards/margins": 1.9960601329803467, "rewards/rejected": 0.29107528924942017, "step": 170 }, { "epoch": 0.98, "grad_norm": 5637.630920957477, "learning_rate": 6.059664528022266e-09, "logits/chosen": -1.5944277048110962, "logits/rejected": -1.4438059329986572, "logps/chosen": -315.2278747558594, "logps/rejected": -276.7435607910156, "loss": 1.0907, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 2.1914913654327393, "rewards/margins": 2.4074912071228027, "rewards/rejected": -0.21599988639354706, "step": 180 }, { "epoch": 1.03, "grad_norm": 6180.318875375917, "learning_rate": 5.591810408770492e-09, "logits/chosen": -1.5520689487457275, "logits/rejected": -1.3785512447357178, "logps/chosen": -315.73565673828125, "logps/rejected": -278.69659423828125, "loss": 1.0339, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 2.1935009956359863, "rewards/margins": 2.327456474304199, "rewards/rejected": -0.1339556723833084, "step": 190 }, { "epoch": 1.09, "grad_norm": 6463.364092385962, "learning_rate": 5.118629073464423e-09, "logits/chosen": -1.56984543800354, "logits/rejected": -1.3597148656845093, "logps/chosen": -326.0714416503906, "logps/rejected": -282.65838623046875, "loss": 1.0602, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.7320990562438965, "rewards/margins": 2.1956944465637207, "rewards/rejected": 0.5364044904708862, "step": 200 }, { "epoch": 1.14, "grad_norm": 5690.544515251583, "learning_rate": 4.644379891605983e-09, "logits/chosen": -1.6162197589874268, "logits/rejected": -1.4398715496063232, "logps/chosen": -324.8927001953125, "logps/rejected": -291.322998046875, "loss": 1.0255, "rewards/accuracies": 0.690625011920929, "rewards/chosen": 2.447072982788086, "rewards/margins": 2.282660961151123, "rewards/rejected": 0.1644122153520584, "step": 210 }, { "epoch": 1.2, "grad_norm": 6400.665884048493, "learning_rate": 4.173331844980362e-09, "logits/chosen": -1.5352128744125366, "logits/rejected": -1.4104220867156982, "logps/chosen": -324.0686340332031, "logps/rejected": -293.37646484375, "loss": 1.0832, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 2.6446428298950195, "rewards/margins": 2.2270336151123047, "rewards/rejected": 0.41760945320129395, "step": 220 }, { "epoch": 1.25, "grad_norm": 7190.025557983883, "learning_rate": 3.7097251001664824e-09, "logits/chosen": -1.5286749601364136, "logits/rejected": -1.3688592910766602, "logps/chosen": -324.03509521484375, "logps/rejected": -286.93658447265625, "loss": 0.9913, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 2.901033401489258, "rewards/margins": 2.64115834236145, "rewards/rejected": 0.25987523794174194, "step": 230 }, { "epoch": 1.3, "grad_norm": 6451.277776389874, "learning_rate": 3.2577328404292057e-09, "logits/chosen": -1.5419480800628662, "logits/rejected": -1.4117484092712402, "logps/chosen": -312.6917724609375, "logps/rejected": -286.0001220703125, "loss": 0.9838, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 2.7435173988342285, "rewards/margins": 2.642423629760742, "rewards/rejected": 0.101093590259552, "step": 240 }, { "epoch": 1.36, "grad_norm": 6314.813043200908, "learning_rate": 2.821423700565763e-09, "logits/chosen": -1.597633719444275, "logits/rejected": -1.419690728187561, "logps/chosen": -350.88946533203125, "logps/rejected": -306.6169128417969, "loss": 0.9826, "rewards/accuracies": 0.734375, "rewards/chosen": 3.2030205726623535, "rewards/margins": 3.1783461570739746, "rewards/rejected": 0.024674177169799805, "step": 250 }, { "epoch": 1.41, "grad_norm": 6095.456225806253, "learning_rate": 2.4047251428513483e-09, "logits/chosen": -1.6133763790130615, "logits/rejected": -1.4590764045715332, "logps/chosen": -325.47283935546875, "logps/rejected": -291.05609130859375, "loss": 1.0523, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 2.9614436626434326, "rewards/margins": 2.505828619003296, "rewards/rejected": 0.45561495423316956, "step": 260 }, { "epoch": 1.47, "grad_norm": 4454.374261141978, "learning_rate": 2.011388103757442e-09, "logits/chosen": -1.5234829187393188, "logits/rejected": -1.3793760538101196, "logps/chosen": -316.57562255859375, "logps/rejected": -285.84161376953125, "loss": 1.0123, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 3.164306640625, "rewards/margins": 3.0578055381774902, "rewards/rejected": 0.10650081932544708, "step": 270 }, { "epoch": 1.52, "grad_norm": 6732.22761580186, "learning_rate": 1.644953229677474e-09, "logits/chosen": -1.5963881015777588, "logits/rejected": -1.4128029346466064, "logps/chosen": -326.2952575683594, "logps/rejected": -284.8047790527344, "loss": 1.0398, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 3.0773532390594482, "rewards/margins": 2.8858485221862793, "rewards/rejected": 0.1915048062801361, "step": 280 }, { "epoch": 1.58, "grad_norm": 5790.846466596631, "learning_rate": 1.308719005590957e-09, "logits/chosen": -1.508599042892456, "logits/rejected": -1.3934084177017212, "logps/chosen": -318.65557861328125, "logps/rejected": -282.54144287109375, "loss": 1.0287, "rewards/accuracies": 0.703125, "rewards/chosen": 2.989049196243286, "rewards/margins": 2.957663059234619, "rewards/rejected": 0.031385958194732666, "step": 290 }, { "epoch": 1.63, "grad_norm": 5615.161677680287, "learning_rate": 1.005712063557776e-09, "logits/chosen": -1.6266453266143799, "logits/rejected": -1.4476075172424316, "logps/chosen": -324.32855224609375, "logps/rejected": -290.5617370605469, "loss": 1.0124, "rewards/accuracies": 0.6875, "rewards/chosen": 2.7768070697784424, "rewards/margins": 2.2968738079071045, "rewards/rejected": 0.47993311285972595, "step": 300 }, { "epoch": 1.68, "grad_norm": 5693.104301118471, "learning_rate": 7.386599383124321e-10, "logits/chosen": -1.5692319869995117, "logits/rejected": -1.3874424695968628, "logps/chosen": -322.054443359375, "logps/rejected": -285.78363037109375, "loss": 1.0301, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 2.796619176864624, "rewards/margins": 2.5291390419006348, "rewards/rejected": 0.26748019456863403, "step": 310 }, { "epoch": 1.74, "grad_norm": 5750.226909350172, "learning_rate": 5.099665152003929e-10, "logits/chosen": -1.5892612934112549, "logits/rejected": -1.377362608909607, "logps/chosen": -333.9684143066406, "logps/rejected": -289.87786865234375, "loss": 1.0507, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 3.231071949005127, "rewards/margins": 2.8270695209503174, "rewards/rejected": 0.40400204062461853, "step": 320 }, { "epoch": 1.79, "grad_norm": 5901.981652001965, "learning_rate": 3.216903914633745e-10, "logits/chosen": -1.5573378801345825, "logits/rejected": -1.4351418018341064, "logps/chosen": -325.4165954589844, "logps/rejected": -296.0995178222656, "loss": 1.0543, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 3.054168939590454, "rewards/margins": 2.6867241859436035, "rewards/rejected": 0.367445170879364, "step": 330 }, { "epoch": 1.85, "grad_norm": 5320.494017847622, "learning_rate": 1.7552634565570324e-10, "logits/chosen": -1.5548150539398193, "logits/rejected": -1.3869941234588623, "logps/chosen": -330.1687316894531, "logps/rejected": -292.9322509765625, "loss": 0.9934, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 3.3230159282684326, "rewards/margins": 3.269806385040283, "rewards/rejected": 0.053209513425827026, "step": 340 }, { "epoch": 1.9, "grad_norm": 5796.257294954858, "learning_rate": 7.279008199590543e-11, "logits/chosen": -1.5367571115493774, "logits/rejected": -1.3734115362167358, "logps/chosen": -326.6300048828125, "logps/rejected": -291.9888916015625, "loss": 1.0303, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 3.4221725463867188, "rewards/margins": 3.2483201026916504, "rewards/rejected": 0.1738525927066803, "step": 350 }, { "epoch": 1.96, "grad_norm": 5941.322275258092, "learning_rate": 1.4406386978128017e-11, "logits/chosen": -1.6203804016113281, "logits/rejected": -1.4243013858795166, "logps/chosen": -331.37005615234375, "logps/rejected": -291.6743469238281, "loss": 0.9963, "rewards/accuracies": 0.71875, "rewards/chosen": 3.5317084789276123, "rewards/margins": 2.9804892539978027, "rewards/rejected": 0.5512194633483887, "step": 360 }, { "epoch": 2.0, "step": 368, "total_flos": 0.0, "train_loss": 1.1851136749205382, "train_runtime": 9917.443, "train_samples_per_second": 9.497, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }