{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9966722129783694, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1151.6184415193795, "learning_rate": 3.333333333333333e-10, "logits/chosen": -4.106247425079346, "logits/rejected": -4.200438499450684, "logps/chosen": -382.81439208984375, "logps/rejected": -357.65960693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 1153.8530059959064, "learning_rate": 3.3333333333333334e-09, "logits/chosen": -4.217168807983398, "logits/rejected": -4.321505069732666, "logps/chosen": -334.6739501953125, "logps/rejected": -313.41986083984375, "loss": 0.7234, "rewards/accuracies": 0.3784722089767456, "rewards/chosen": -0.04607396200299263, "rewards/margins": -0.04357295483350754, "rewards/rejected": -0.0025010076351463795, "step": 10 }, { "epoch": 0.13, "grad_norm": 1270.1091311718026, "learning_rate": 6.666666666666667e-09, "logits/chosen": -4.26615571975708, "logits/rejected": -4.41886043548584, "logps/chosen": -313.93829345703125, "logps/rejected": -288.78863525390625, "loss": 0.7299, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.012899210676550865, "rewards/margins": -0.021626513451337814, "rewards/rejected": 0.00872730277478695, "step": 20 }, { "epoch": 0.2, "grad_norm": 1195.4358872902142, "learning_rate": 1e-08, "logits/chosen": -4.3016462326049805, "logits/rejected": -4.365716457366943, "logps/chosen": -308.2979431152344, "logps/rejected": -285.63018798828125, "loss": 0.7311, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.010719490237534046, "rewards/margins": -0.010031750425696373, "rewards/rejected": -0.000687739229761064, "step": 30 }, { "epoch": 0.27, "grad_norm": 1144.0076561795222, "learning_rate": 9.966191788709716e-09, "logits/chosen": -4.187338829040527, "logits/rejected": -4.271176338195801, "logps/chosen": -332.39453125, "logps/rejected": -307.4621276855469, "loss": 0.7199, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.04900529980659485, "rewards/margins": 0.05206792429089546, "rewards/rejected": -0.0030626237858086824, "step": 40 }, { "epoch": 0.33, "grad_norm": 1135.1845096891816, "learning_rate": 9.86522435289912e-09, "logits/chosen": -4.197329044342041, "logits/rejected": -4.366620063781738, "logps/chosen": -333.64678955078125, "logps/rejected": -309.0525817871094, "loss": 0.7189, "rewards/accuracies": 0.515625, "rewards/chosen": 0.04922889173030853, "rewards/margins": 0.008506924845278263, "rewards/rejected": 0.040721967816352844, "step": 50 }, { "epoch": 0.4, "grad_norm": 1124.4015217482226, "learning_rate": 9.698463103929542e-09, "logits/chosen": -4.186924457550049, "logits/rejected": -4.269418239593506, "logps/chosen": -323.7723693847656, "logps/rejected": -308.26971435546875, "loss": 0.7082, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.11511299759149551, "rewards/margins": 0.052407026290893555, "rewards/rejected": 0.06270597130060196, "step": 60 }, { "epoch": 0.47, "grad_norm": 1133.1247984062034, "learning_rate": 9.468163201617062e-09, "logits/chosen": -4.147147178649902, "logits/rejected": -4.3019304275512695, "logps/chosen": -344.58563232421875, "logps/rejected": -314.4212951660156, "loss": 0.7005, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 0.23254117369651794, "rewards/margins": 0.13268980383872986, "rewards/rejected": 0.09985135495662689, "step": 70 }, { "epoch": 0.53, "grad_norm": 1057.1503651106625, "learning_rate": 9.177439057064682e-09, "logits/chosen": -4.223555564880371, "logits/rejected": -4.389444828033447, "logps/chosen": -334.3599853515625, "logps/rejected": -305.4422607421875, "loss": 0.6844, "rewards/accuracies": 0.59375, "rewards/chosen": 0.23962649703025818, "rewards/margins": 0.09747296571731567, "rewards/rejected": 0.1421535313129425, "step": 80 }, { "epoch": 0.6, "grad_norm": 1055.57995971272, "learning_rate": 8.830222215594889e-09, "logits/chosen": -4.2292633056640625, "logits/rejected": -4.349828243255615, "logps/chosen": -327.07330322265625, "logps/rejected": -307.81707763671875, "loss": 0.6772, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.2870681881904602, "rewards/margins": 0.06986425817012787, "rewards/rejected": 0.21720390021800995, "step": 90 }, { "epoch": 0.67, "grad_norm": 1124.231150209405, "learning_rate": 8.431208189343668e-09, "logits/chosen": -4.214221000671387, "logits/rejected": -4.424824237823486, "logps/chosen": -320.45928955078125, "logps/rejected": -289.71466064453125, "loss": 0.6738, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.3042110502719879, "rewards/margins": 0.12749677896499634, "rewards/rejected": 0.1767142415046692, "step": 100 }, { "epoch": 0.73, "grad_norm": 1045.9935050945592, "learning_rate": 7.98579295851393e-09, "logits/chosen": -4.233684062957764, "logits/rejected": -4.286379814147949, "logps/chosen": -314.39251708984375, "logps/rejected": -303.2556457519531, "loss": 0.6623, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.3848657011985779, "rewards/margins": 0.10032075643539429, "rewards/rejected": 0.2845449447631836, "step": 110 }, { "epoch": 0.8, "grad_norm": 1107.1293654591734, "learning_rate": 7.500000000000001e-09, "logits/chosen": -4.151052951812744, "logits/rejected": -4.307383060455322, "logps/chosen": -338.46759033203125, "logps/rejected": -313.0832214355469, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": 0.49397316575050354, "rewards/margins": 0.20816774666309357, "rewards/rejected": 0.2858053743839264, "step": 120 }, { "epoch": 0.87, "grad_norm": 1113.1665283582252, "learning_rate": 6.980398830195784e-09, "logits/chosen": -4.174288272857666, "logits/rejected": -4.36756706237793, "logps/chosen": -323.51800537109375, "logps/rejected": -304.7021484375, "loss": 0.6392, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5511430501937866, "rewards/margins": 0.23519937694072723, "rewards/rejected": 0.31594371795654297, "step": 130 }, { "epoch": 0.93, "grad_norm": 1022.5201031232197, "learning_rate": 6.434016163555451e-09, "logits/chosen": -4.292388916015625, "logits/rejected": -4.358359336853027, "logps/chosen": -307.1917419433594, "logps/rejected": -296.78192138671875, "loss": 0.6402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5292906761169434, "rewards/margins": 0.16237936913967133, "rewards/rejected": 0.36691129207611084, "step": 140 }, { "epoch": 1.0, "grad_norm": 1051.3521300860295, "learning_rate": 5.868240888334653e-09, "logits/chosen": -4.260380268096924, "logits/rejected": -4.333888053894043, "logps/chosen": -311.844482421875, "logps/rejected": -299.74261474609375, "loss": 0.6387, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.5666605830192566, "rewards/margins": 0.1995116025209427, "rewards/rejected": 0.3671489655971527, "step": 150 }, { "epoch": 1.06, "grad_norm": 1029.1548288848146, "learning_rate": 5.290724144552379e-09, "logits/chosen": -4.191515922546387, "logits/rejected": -4.348960876464844, "logps/chosen": -326.90582275390625, "logps/rejected": -305.37689208984375, "loss": 0.6032, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.6744669079780579, "rewards/margins": 0.37094420194625854, "rewards/rejected": 0.3035227358341217, "step": 160 }, { "epoch": 1.13, "grad_norm": 991.2604083787021, "learning_rate": 4.709275855447621e-09, "logits/chosen": -4.241927623748779, "logits/rejected": -4.339847564697266, "logps/chosen": -314.4874572753906, "logps/rejected": -295.49151611328125, "loss": 0.6151, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.6504887938499451, "rewards/margins": 0.3018999397754669, "rewards/rejected": 0.3485889136791229, "step": 170 }, { "epoch": 1.2, "grad_norm": 976.0494957890444, "learning_rate": 4.131759111665349e-09, "logits/chosen": -4.174123287200928, "logits/rejected": -4.313396453857422, "logps/chosen": -325.78045654296875, "logps/rejected": -304.82989501953125, "loss": 0.6147, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.6252425909042358, "rewards/margins": 0.27976280450820923, "rewards/rejected": 0.3454797863960266, "step": 180 }, { "epoch": 1.26, "grad_norm": 949.77837430784, "learning_rate": 3.56598383644455e-09, "logits/chosen": -4.231461524963379, "logits/rejected": -4.367356777191162, "logps/chosen": -326.8230895996094, "logps/rejected": -299.7027587890625, "loss": 0.5969, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.7501948475837708, "rewards/margins": 0.35921385884284973, "rewards/rejected": 0.39098095893859863, "step": 190 }, { "epoch": 1.33, "grad_norm": 963.9694294192556, "learning_rate": 3.0196011698042157e-09, "logits/chosen": -4.249307155609131, "logits/rejected": -4.317980766296387, "logps/chosen": -313.9615783691406, "logps/rejected": -293.1615905761719, "loss": 0.5922, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.7425140142440796, "rewards/margins": 0.3114904761314392, "rewards/rejected": 0.4310235381126404, "step": 200 }, { "epoch": 1.4, "grad_norm": 1038.4899227109647, "learning_rate": 2.5000000000000013e-09, "logits/chosen": -4.174517631530762, "logits/rejected": -4.288783073425293, "logps/chosen": -327.461181640625, "logps/rejected": -304.9690246582031, "loss": 0.5952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7787834405899048, "rewards/margins": 0.3926675319671631, "rewards/rejected": 0.3861159086227417, "step": 210 }, { "epoch": 1.46, "grad_norm": 992.6867793357102, "learning_rate": 2.0142070414860704e-09, "logits/chosen": -4.165514945983887, "logits/rejected": -4.316833019256592, "logps/chosen": -358.1416931152344, "logps/rejected": -323.22998046875, "loss": 0.5994, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.8851108551025391, "rewards/margins": 0.35754603147506714, "rewards/rejected": 0.5275647640228271, "step": 220 }, { "epoch": 1.53, "grad_norm": 918.0076197376386, "learning_rate": 1.5687918106563326e-09, "logits/chosen": -4.234222412109375, "logits/rejected": -4.369565486907959, "logps/chosen": -328.3675537109375, "logps/rejected": -303.726806640625, "loss": 0.5919, "rewards/accuracies": 0.65625, "rewards/chosen": 0.8034540414810181, "rewards/margins": 0.2974655032157898, "rewards/rejected": 0.5059884786605835, "step": 230 }, { "epoch": 1.6, "grad_norm": 944.3107996293463, "learning_rate": 1.1697777844051105e-09, "logits/chosen": -4.202380180358887, "logits/rejected": -4.353602409362793, "logps/chosen": -326.75604248046875, "logps/rejected": -297.03619384765625, "loss": 0.588, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.8500032424926758, "rewards/margins": 0.3546527922153473, "rewards/rejected": 0.4953504502773285, "step": 240 }, { "epoch": 1.66, "grad_norm": 944.2690060176395, "learning_rate": 8.225609429353187e-10, "logits/chosen": -4.173297882080078, "logits/rejected": -4.368635654449463, "logps/chosen": -321.2724914550781, "logps/rejected": -293.50311279296875, "loss": 0.5856, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.8207529783248901, "rewards/margins": 0.40889453887939453, "rewards/rejected": 0.4118584990501404, "step": 250 }, { "epoch": 1.73, "grad_norm": 957.8767493880096, "learning_rate": 5.318367983829391e-10, "logits/chosen": -4.18589448928833, "logits/rejected": -4.284361839294434, "logps/chosen": -348.6470031738281, "logps/rejected": -326.4360046386719, "loss": 0.5895, "rewards/accuracies": 0.690625011920929, "rewards/chosen": 0.8914071917533875, "rewards/margins": 0.37524908781051636, "rewards/rejected": 0.5161582231521606, "step": 260 }, { "epoch": 1.8, "grad_norm": 949.249423709826, "learning_rate": 3.015368960704584e-10, "logits/chosen": -4.271857738494873, "logits/rejected": -4.4089555740356445, "logps/chosen": -325.19049072265625, "logps/rejected": -302.7436828613281, "loss": 0.5938, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.9044697880744934, "rewards/margins": 0.38229116797447205, "rewards/rejected": 0.522178590297699, "step": 270 }, { "epoch": 1.86, "grad_norm": 932.1311904270524, "learning_rate": 1.3477564710088098e-10, "logits/chosen": -4.163296222686768, "logits/rejected": -4.306557655334473, "logps/chosen": -327.57257080078125, "logps/rejected": -306.2264099121094, "loss": 0.5865, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.8713304400444031, "rewards/margins": 0.4181024134159088, "rewards/rejected": 0.45322805643081665, "step": 280 }, { "epoch": 1.93, "grad_norm": 884.8124763448137, "learning_rate": 3.380821129028488e-11, "logits/chosen": -4.1040120124816895, "logits/rejected": -4.260913372039795, "logps/chosen": -334.97686767578125, "logps/rejected": -312.01617431640625, "loss": 0.5924, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.9016163945198059, "rewards/margins": 0.4117993414402008, "rewards/rejected": 0.48981720209121704, "step": 290 }, { "epoch": 2.0, "grad_norm": 959.7419745329076, "learning_rate": 0.0, "logits/chosen": -4.31270170211792, "logits/rejected": -4.468858242034912, "logps/chosen": -314.4400939941406, "logps/rejected": -284.8276672363281, "loss": 0.5825, "rewards/accuracies": 0.6875, "rewards/chosen": 0.82757169008255, "rewards/margins": 0.36657077074050903, "rewards/rejected": 0.4610009789466858, "step": 300 }, { "epoch": 2.0, "step": 300, "total_flos": 0.0, "train_loss": 0.6411590957641602, "train_runtime": 8942.072, "train_samples_per_second": 8.599, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }