{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9980806142034548, "eval_steps": 10000000, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1135.2510024076782, "learning_rate": 1.282051282051282e-08, "logits/chosen": -2.5583817958831787, "logits/rejected": -2.4487552642822266, "logps/chosen": -258.1644592285156, "logps/rejected": -216.25729370117188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 1064.195577422658, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -2.606004476547241, "logits/rejected": -2.553109884262085, "logps/chosen": -267.5234680175781, "logps/rejected": -217.6415557861328, "loss": 0.7054, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.03280753642320633, "rewards/margins": 0.0353083573281765, "rewards/rejected": -0.002500815549865365, "step": 10 }, { "epoch": 0.05, "grad_norm": 736.2634036624544, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -2.630505323410034, "logits/rejected": -2.5676522254943848, "logps/chosen": -260.584716796875, "logps/rejected": -207.07144165039062, "loss": 0.5213, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5755742788314819, "rewards/margins": 0.5894275903701782, "rewards/rejected": -0.013853324577212334, "step": 20 }, { "epoch": 0.08, "grad_norm": 1076.3695793406284, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -2.6462178230285645, "logits/rejected": -2.571561336517334, "logps/chosen": -250.9139862060547, "logps/rejected": -198.4534912109375, "loss": 0.3324, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.3866074085235596, "rewards/margins": 3.0545947551727295, "rewards/rejected": 0.3320125639438629, "step": 30 }, { "epoch": 0.1, "grad_norm": 418.3228099023361, "learning_rate": 4.99989986344963e-07, "logits/chosen": -2.6392903327941895, "logits/rejected": -2.5602712631225586, "logps/chosen": -243.54013061523438, "logps/rejected": -192.9114227294922, "loss": 0.3161, "rewards/accuracies": 0.84375, "rewards/chosen": 5.447351455688477, "rewards/margins": 4.827452182769775, "rewards/rejected": 0.6198989748954773, "step": 40 }, { "epoch": 0.13, "grad_norm": 630.2703390024756, "learning_rate": 4.987893180827479e-07, "logits/chosen": -2.651214361190796, "logits/rejected": -2.57964825630188, "logps/chosen": -258.42962646484375, "logps/rejected": -203.57992553710938, "loss": 0.366, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 7.846033573150635, "rewards/margins": 6.590806007385254, "rewards/rejected": 1.255226731300354, "step": 50 }, { "epoch": 0.15, "grad_norm": 655.8352889546771, "learning_rate": 4.955969343539162e-07, "logits/chosen": -2.60957932472229, "logits/rejected": -2.5362067222595215, "logps/chosen": -262.3640441894531, "logps/rejected": -209.32199096679688, "loss": 0.3453, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 5.170942306518555, "rewards/margins": 6.18172550201416, "rewards/rejected": -1.0107834339141846, "step": 60 }, { "epoch": 0.18, "grad_norm": 456.9589116841801, "learning_rate": 4.90438392204474e-07, "logits/chosen": -2.5825228691101074, "logits/rejected": -2.5089833736419678, "logps/chosen": -291.7918395996094, "logps/rejected": -227.83432006835938, "loss": 0.3454, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.895013809204102, "rewards/margins": 7.00995397567749, "rewards/rejected": -2.1149401664733887, "step": 70 }, { "epoch": 0.2, "grad_norm": 816.8720109326792, "learning_rate": 4.83354989019146e-07, "logits/chosen": -2.5420753955841064, "logits/rejected": -2.467258930206299, "logps/chosen": -259.6270446777344, "logps/rejected": -204.15179443359375, "loss": 0.3311, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 6.344871997833252, "rewards/margins": 7.2052764892578125, "rewards/rejected": -0.860403835773468, "step": 80 }, { "epoch": 0.23, "grad_norm": 922.6738539012168, "learning_rate": 4.7440343190975353e-07, "logits/chosen": -2.5713560581207275, "logits/rejected": -2.513441801071167, "logps/chosen": -257.0751037597656, "logps/rejected": -217.1184844970703, "loss": 0.3343, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.73614239692688, "rewards/margins": 5.834546089172363, "rewards/rejected": -2.0984034538269043, "step": 90 }, { "epoch": 0.26, "grad_norm": 406.82707972381877, "learning_rate": 4.6365538373900506e-07, "logits/chosen": -2.6249356269836426, "logits/rejected": -2.5500850677490234, "logps/chosen": -236.4239501953125, "logps/rejected": -200.73150634765625, "loss": 0.5974, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.595959663391113, "rewards/margins": 6.244544506072998, "rewards/rejected": -1.648585557937622, "step": 100 }, { "epoch": 0.28, "grad_norm": 656.3071663391811, "learning_rate": 4.5119688941406386e-07, "logits/chosen": -2.618974208831787, "logits/rejected": -2.5380780696868896, "logps/chosen": -257.79248046875, "logps/rejected": -209.8715362548828, "loss": 0.4404, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 6.305555820465088, "rewards/margins": 7.463587760925293, "rewards/rejected": -1.158031940460205, "step": 110 }, { "epoch": 0.31, "grad_norm": 810.7648282749318, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -2.5895957946777344, "logits/rejected": -2.519530773162842, "logps/chosen": -262.7950134277344, "logps/rejected": -208.9604949951172, "loss": 0.438, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 5.333884239196777, "rewards/margins": 7.409175872802734, "rewards/rejected": -2.075291156768799, "step": 120 }, { "epoch": 0.33, "grad_norm": 513.4959841183485, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -2.5553436279296875, "logits/rejected": -2.487457752227783, "logps/chosen": -251.7507781982422, "logps/rejected": -197.44088745117188, "loss": 0.4027, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.441976070404053, "rewards/margins": 7.408116340637207, "rewards/rejected": -2.966140031814575, "step": 130 }, { "epoch": 0.36, "grad_norm": 650.5511601275197, "learning_rate": 4.046196825665637e-07, "logits/chosen": -2.5706536769866943, "logits/rejected": -2.500262498855591, "logps/chosen": -270.2043762207031, "logps/rejected": -217.0515594482422, "loss": 0.4293, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.977551221847534, "rewards/margins": 6.7731499671936035, "rewards/rejected": -2.7955987453460693, "step": 140 }, { "epoch": 0.38, "grad_norm": 530.5799871161138, "learning_rate": 3.864411275486261e-07, "logits/chosen": -2.5574281215667725, "logits/rejected": -2.488007068634033, "logps/chosen": -263.3489685058594, "logps/rejected": -212.54638671875, "loss": 0.4583, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 5.79421329498291, "rewards/margins": 7.515044212341309, "rewards/rejected": -1.720831274986267, "step": 150 }, { "epoch": 0.41, "grad_norm": 600.6086946072276, "learning_rate": 3.671702752161759e-07, "logits/chosen": -2.563870906829834, "logits/rejected": -2.493649482727051, "logps/chosen": -244.5281219482422, "logps/rejected": -198.3011474609375, "loss": 0.4465, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.527863025665283, "rewards/margins": 7.751578330993652, "rewards/rejected": -4.223715782165527, "step": 160 }, { "epoch": 0.44, "grad_norm": 753.6856997505446, "learning_rate": 3.4696140090121375e-07, "logits/chosen": -2.5673775672912598, "logits/rejected": -2.500842571258545, "logps/chosen": -265.5797119140625, "logps/rejected": -211.0306854248047, "loss": 0.3547, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 4.089644908905029, "rewards/margins": 7.812180519104004, "rewards/rejected": -3.7225348949432373, "step": 170 }, { "epoch": 0.46, "grad_norm": 645.3967547220625, "learning_rate": 3.259762893935617e-07, "logits/chosen": -2.6238903999328613, "logits/rejected": -2.534097194671631, "logps/chosen": -236.9849395751953, "logps/rejected": -186.6522674560547, "loss": 0.4499, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 4.038764953613281, "rewards/margins": 6.760235786437988, "rewards/rejected": -2.721470594406128, "step": 180 }, { "epoch": 0.49, "grad_norm": 707.705744532387, "learning_rate": 3.0438293975154184e-07, "logits/chosen": -2.582486867904663, "logits/rejected": -2.5034093856811523, "logps/chosen": -261.0556945800781, "logps/rejected": -205.6962890625, "loss": 0.3591, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.011924982070923, "rewards/margins": 8.104998588562012, "rewards/rejected": -5.093073844909668, "step": 190 }, { "epoch": 0.51, "grad_norm": 822.8629977119059, "learning_rate": 2.823542203635138e-07, "logits/chosen": -2.615396499633789, "logits/rejected": -2.5223731994628906, "logps/chosen": -277.3884582519531, "logps/rejected": -221.803466796875, "loss": 0.4468, "rewards/accuracies": 0.875, "rewards/chosen": 2.5077309608459473, "rewards/margins": 9.055838584899902, "rewards/rejected": -6.548108100891113, "step": 200 }, { "epoch": 0.54, "grad_norm": 954.7034527431528, "learning_rate": 2.600664850273538e-07, "logits/chosen": -2.603569269180298, "logits/rejected": -2.5283331871032715, "logps/chosen": -269.19873046875, "logps/rejected": -213.823974609375, "loss": 0.6013, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.4840681552886963, "rewards/margins": 7.454611778259277, "rewards/rejected": -5.97054386138916, "step": 210 }, { "epoch": 0.56, "grad_norm": 685.0955562473252, "learning_rate": 2.3769816112703045e-07, "logits/chosen": -2.6224589347839355, "logits/rejected": -2.55679988861084, "logps/chosen": -257.71661376953125, "logps/rejected": -214.28329467773438, "loss": 0.4806, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.168187141418457, "rewards/margins": 6.781345367431641, "rewards/rejected": -3.6131577491760254, "step": 220 }, { "epoch": 0.59, "grad_norm": 568.8894162951807, "learning_rate": 2.1542832120881677e-07, "logits/chosen": -2.664320945739746, "logits/rejected": -2.5764544010162354, "logps/chosen": -266.98114013671875, "logps/rejected": -216.44894409179688, "loss": 0.4149, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 5.137583255767822, "rewards/margins": 7.965329647064209, "rewards/rejected": -2.827746629714966, "step": 230 }, { "epoch": 0.61, "grad_norm": 923.147651672606, "learning_rate": 1.934352493925695e-07, "logits/chosen": -2.6468780040740967, "logits/rejected": -2.5980067253112793, "logps/chosen": -262.94610595703125, "logps/rejected": -220.69448852539062, "loss": 0.3991, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.828115463256836, "rewards/margins": 9.526643753051758, "rewards/rejected": -5.69852876663208, "step": 240 }, { "epoch": 0.64, "grad_norm": 615.4120078013015, "learning_rate": 1.7189501409486059e-07, "logits/chosen": -2.656362533569336, "logits/rejected": -2.584864616394043, "logps/chosen": -267.7325439453125, "logps/rejected": -222.2632293701172, "loss": 0.4004, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.2369320392608643, "rewards/margins": 7.942319393157959, "rewards/rejected": -4.705387115478516, "step": 250 }, { "epoch": 0.67, "grad_norm": 488.0068782741624, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -2.64605450630188, "logits/rejected": -2.586585283279419, "logps/chosen": -261.89093017578125, "logps/rejected": -208.77493286132812, "loss": 0.3817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.175231456756592, "rewards/margins": 7.630448818206787, "rewards/rejected": -4.455216884613037, "step": 260 }, { "epoch": 0.69, "grad_norm": 559.4430135222711, "learning_rate": 1.30857819994673e-07, "logits/chosen": -2.6208698749542236, "logits/rejected": -2.5371921062469482, "logps/chosen": -274.78753662109375, "logps/rejected": -230.4307861328125, "loss": 0.5355, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.437089204788208, "rewards/margins": 9.265036582946777, "rewards/rejected": -7.82794713973999, "step": 270 }, { "epoch": 0.72, "grad_norm": 432.8210354095987, "learning_rate": 1.116893898236716e-07, "logits/chosen": -2.654949426651001, "logits/rejected": -2.5985524654388428, "logps/chosen": -270.3836975097656, "logps/rejected": -219.8002471923828, "loss": 0.3718, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.6581013202667236, "rewards/margins": 8.142509460449219, "rewards/rejected": -5.484408378601074, "step": 280 }, { "epoch": 0.74, "grad_norm": 482.2442984028295, "learning_rate": 9.362822335518062e-08, "logits/chosen": -2.6166903972625732, "logits/rejected": -2.5696167945861816, "logps/chosen": -268.19140625, "logps/rejected": -216.9479522705078, "loss": 0.3568, "rewards/accuracies": 0.875, "rewards/chosen": 3.0037200450897217, "rewards/margins": 7.667593479156494, "rewards/rejected": -4.663873195648193, "step": 290 }, { "epoch": 0.77, "grad_norm": 492.9163861530474, "learning_rate": 7.681891162260015e-08, "logits/chosen": -2.636460781097412, "logits/rejected": -2.580770254135132, "logps/chosen": -274.6198425292969, "logps/rejected": -220.8531951904297, "loss": 0.3983, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.163914680480957, "rewards/margins": 7.829231262207031, "rewards/rejected": -4.665315628051758, "step": 300 }, { "epoch": 0.79, "grad_norm": 437.9917779014462, "learning_rate": 6.139602377230247e-08, "logits/chosen": -2.6010611057281494, "logits/rejected": -2.532543897628784, "logps/chosen": -278.3953552246094, "logps/rejected": -215.9014129638672, "loss": 0.4376, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 3.1028757095336914, "rewards/margins": 8.152434349060059, "rewards/rejected": -5.049559593200684, "step": 310 }, { "epoch": 0.82, "grad_norm": 649.8222699481745, "learning_rate": 4.748302975270837e-08, "logits/chosen": -2.6264309883117676, "logits/rejected": -2.5793588161468506, "logps/chosen": -261.37890625, "logps/rejected": -204.51773071289062, "loss": 0.405, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.8262996673583984, "rewards/margins": 7.375731468200684, "rewards/rejected": -4.549432277679443, "step": 320 }, { "epoch": 0.84, "grad_norm": 583.9617574483902, "learning_rate": 3.5191311859445795e-08, "logits/chosen": -2.6449975967407227, "logits/rejected": -2.586719512939453, "logps/chosen": -264.58428955078125, "logps/rejected": -217.4517364501953, "loss": 0.3924, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.091521263122559, "rewards/margins": 7.869417667388916, "rewards/rejected": -3.7778968811035156, "step": 330 }, { "epoch": 0.87, "grad_norm": 516.763098966226, "learning_rate": 2.4619273049795996e-08, "logits/chosen": -2.631946563720703, "logits/rejected": -2.5740180015563965, "logps/chosen": -260.0722961425781, "logps/rejected": -210.775146484375, "loss": 0.3558, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.791111469268799, "rewards/margins": 8.514566421508789, "rewards/rejected": -4.723455905914307, "step": 340 }, { "epoch": 0.9, "grad_norm": 434.316228593937, "learning_rate": 1.5851549164932115e-08, "logits/chosen": -2.641859531402588, "logits/rejected": -2.592379093170166, "logps/chosen": -269.5948181152344, "logps/rejected": -226.536865234375, "loss": 0.382, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.7248435020446777, "rewards/margins": 7.7656402587890625, "rewards/rejected": -4.040797233581543, "step": 350 }, { "epoch": 0.92, "grad_norm": 570.6334718025578, "learning_rate": 8.958331366609423e-09, "logits/chosen": -2.6432430744171143, "logits/rejected": -2.574936628341675, "logps/chosen": -275.0256652832031, "logps/rejected": -219.6584014892578, "loss": 0.4253, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.3530006408691406, "rewards/margins": 8.096589088439941, "rewards/rejected": -4.743588447570801, "step": 360 }, { "epoch": 0.95, "grad_norm": 877.4134874498682, "learning_rate": 3.994804212627461e-09, "logits/chosen": -2.6024394035339355, "logits/rejected": -2.5662083625793457, "logps/chosen": -273.9478454589844, "logps/rejected": -229.1957550048828, "loss": 0.4977, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 4.258389472961426, "rewards/margins": 7.956662178039551, "rewards/rejected": -3.698272705078125, "step": 370 }, { "epoch": 0.97, "grad_norm": 416.60583937652194, "learning_rate": 1.0007038696262516e-09, "logits/chosen": -2.651128053665161, "logits/rejected": -2.610159397125244, "logps/chosen": -263.07269287109375, "logps/rejected": -230.61502075195312, "loss": 0.3902, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.250136375427246, "rewards/margins": 8.099352836608887, "rewards/rejected": -3.8492164611816406, "step": 380 }, { "epoch": 1.0, "grad_norm": 678.8175373396961, "learning_rate": 0.0, "logits/chosen": -2.6594204902648926, "logits/rejected": -2.5979819297790527, "logps/chosen": -250.8957977294922, "logps/rejected": -210.31497192382812, "loss": 0.4132, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.278926134109497, "rewards/margins": 7.683538913726807, "rewards/rejected": -4.4046125411987305, "step": 390 }, { "epoch": 1.0, "step": 390, "total_flos": 0.0, "train_loss": 0.4220164916454217, "train_runtime": 5868.9984, "train_samples_per_second": 8.519, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }