{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986824769433466, "eval_steps": 100, "global_step": 379, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 34.01362102288599, "learning_rate": 1.3157894736842104e-08, "logits/chosen": -4.685327529907227, "logits/rejected": -4.87608528137207, "logps/chosen": -207.7137451171875, "logps/rejected": -145.5098114013672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 33.835137410082986, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -4.499408721923828, "logits/rejected": -4.84108829498291, "logps/chosen": -223.5843048095703, "logps/rejected": -160.73016357421875, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0008713232818990946, "rewards/margins": 3.9665635995334014e-05, "rewards/rejected": 0.000831657787784934, "step": 10 }, { "epoch": 0.05, "grad_norm": 32.9122864599857, "learning_rate": 2.631578947368421e-07, "logits/chosen": -4.521907806396484, "logits/rejected": -4.8204779624938965, "logps/chosen": -220.248779296875, "logps/rejected": -173.30508422851562, "loss": 0.6818, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.01710951328277588, "rewards/margins": 0.023774703964591026, "rewards/rejected": -0.006665193475782871, "step": 20 }, { "epoch": 0.08, "grad_norm": 32.91997500077958, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -4.713895320892334, "logits/rejected": -5.012445449829102, "logps/chosen": -223.3787841796875, "logps/rejected": -201.9126434326172, "loss": 0.6294, "rewards/accuracies": 0.8125, "rewards/chosen": -0.047253355383872986, "rewards/margins": 0.1270204484462738, "rewards/rejected": -0.1742737889289856, "step": 30 }, { "epoch": 0.11, "grad_norm": 44.205273980146465, "learning_rate": 4.999575626062319e-07, "logits/chosen": -4.862967491149902, "logits/rejected": -5.199351787567139, "logps/chosen": -284.15850830078125, "logps/rejected": -260.1568298339844, "loss": 0.5878, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5039564371109009, "rewards/margins": 0.3390721082687378, "rewards/rejected": -0.8430284261703491, "step": 40 }, { "epoch": 0.13, "grad_norm": 33.79573625552293, "learning_rate": 4.984737660598186e-07, "logits/chosen": -4.860326766967773, "logits/rejected": -5.1770920753479, "logps/chosen": -287.49212646484375, "logps/rejected": -275.1466979980469, "loss": 0.544, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6591841578483582, "rewards/margins": 0.3825286030769348, "rewards/rejected": -1.0417125225067139, "step": 50 }, { "epoch": 0.16, "grad_norm": 37.5548220875479, "learning_rate": 4.948824853131236e-07, "logits/chosen": -5.250467777252197, "logits/rejected": -5.646960258483887, "logps/chosen": -355.8065185546875, "logps/rejected": -370.16064453125, "loss": 0.4873, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3389923572540283, "rewards/margins": 0.6164921522140503, "rewards/rejected": -1.955484390258789, "step": 60 }, { "epoch": 0.18, "grad_norm": 40.80845104625175, "learning_rate": 4.892141805936084e-07, "logits/chosen": -5.506089687347412, "logits/rejected": -5.9388532638549805, "logps/chosen": -373.0067443847656, "logps/rejected": -409.2164001464844, "loss": 0.4619, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.4274970293045044, "rewards/margins": 0.8630696535110474, "rewards/rejected": -2.290566921234131, "step": 70 }, { "epoch": 0.21, "grad_norm": 62.51614299706751, "learning_rate": 4.81516928858564e-07, "logits/chosen": -5.957489967346191, "logits/rejected": -6.362034797668457, "logps/chosen": -431.8949279785156, "logps/rejected": -484.7469177246094, "loss": 0.4382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0529427528381348, "rewards/margins": 1.0657070875167847, "rewards/rejected": -3.118650197982788, "step": 80 }, { "epoch": 0.24, "grad_norm": 42.64563417225672, "learning_rate": 4.7185601601995784e-07, "logits/chosen": -5.974350929260254, "logits/rejected": -6.67104959487915, "logps/chosen": -387.2274475097656, "logps/rejected": -464.65826416015625, "loss": 0.4236, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6722873449325562, "rewards/margins": 1.3039813041687012, "rewards/rejected": -2.976268768310547, "step": 90 }, { "epoch": 0.26, "grad_norm": 49.657859383445725, "learning_rate": 4.603133832077953e-07, "logits/chosen": -6.59436559677124, "logits/rejected": -7.055686950683594, "logps/chosen": -443.9141540527344, "logps/rejected": -517.8334350585938, "loss": 0.4015, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.304152011871338, "rewards/margins": 1.163777232170105, "rewards/rejected": -3.4679291248321533, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -6.410887718200684, "eval_logits/rejected": -6.793334007263184, "eval_logps/chosen": -826.5081176757812, "eval_logps/rejected": -987.8357543945312, "eval_loss": 0.9856035113334656, "eval_rewards/accuracies": 0.52734375, "eval_rewards/chosen": -4.360336780548096, "eval_rewards/margins": 0.3341439962387085, "eval_rewards/rejected": -4.694480895996094, "eval_runtime": 97.5721, "eval_samples_per_second": 20.498, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.29, "grad_norm": 59.660602657641974, "learning_rate": 4.4698693176863316e-07, "logits/chosen": -6.508014678955078, "logits/rejected": -6.995993137359619, "logps/chosen": -467.217041015625, "logps/rejected": -571.2991943359375, "loss": 0.3652, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5293219089508057, "rewards/margins": 1.5391861200332642, "rewards/rejected": -4.068508148193359, "step": 110 }, { "epoch": 0.32, "grad_norm": 48.65436374701769, "learning_rate": 4.319896928940505e-07, "logits/chosen": -6.631407737731934, "logits/rejected": -7.224958896636963, "logps/chosen": -449.9879455566406, "logps/rejected": -551.552734375, "loss": 0.3856, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.370181083679199, "rewards/margins": 1.3927323818206787, "rewards/rejected": -3.762913465499878, "step": 120 }, { "epoch": 0.34, "grad_norm": 41.65961698696447, "learning_rate": 4.1544886892205354e-07, "logits/chosen": -6.516014099121094, "logits/rejected": -7.032981872558594, "logps/chosen": -490.3934020996094, "logps/rejected": -610.3671264648438, "loss": 0.3736, "rewards/accuracies": 0.78125, "rewards/chosen": -2.659696578979492, "rewards/margins": 1.6181414127349854, "rewards/rejected": -4.277838230133057, "step": 130 }, { "epoch": 0.37, "grad_norm": 61.060769075088814, "learning_rate": 3.975047544428254e-07, "logits/chosen": -6.8892974853515625, "logits/rejected": -7.428150177001953, "logps/chosen": -479.0992736816406, "logps/rejected": -578.5732421875, "loss": 0.3918, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.728147506713867, "rewards/margins": 1.3954027891159058, "rewards/rejected": -4.123549938201904, "step": 140 }, { "epoch": 0.4, "grad_norm": 40.0921041292044, "learning_rate": 3.78309546359696e-07, "logits/chosen": -7.085695743560791, "logits/rejected": -7.593710422515869, "logps/chosen": -485.8174743652344, "logps/rejected": -611.1568603515625, "loss": 0.3821, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8462297916412354, "rewards/margins": 1.527190089225769, "rewards/rejected": -4.373419761657715, "step": 150 }, { "epoch": 0.42, "grad_norm": 45.94809301945095, "learning_rate": 3.580260529980584e-07, "logits/chosen": -6.79840087890625, "logits/rejected": -7.57172155380249, "logps/chosen": -476.08197021484375, "logps/rejected": -613.65576171875, "loss": 0.3533, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6100144386291504, "rewards/margins": 1.7850373983383179, "rewards/rejected": -4.3950514793396, "step": 160 }, { "epoch": 0.45, "grad_norm": 49.925985141071024, "learning_rate": 3.36826313211205e-07, "logits/chosen": -7.27915096282959, "logits/rejected": -7.999810695648193, "logps/chosen": -464.27606201171875, "logps/rejected": -618.3897705078125, "loss": 0.3586, "rewards/accuracies": 0.8125, "rewards/chosen": -2.638946056365967, "rewards/margins": 1.9340379238128662, "rewards/rejected": -4.572983741760254, "step": 170 }, { "epoch": 0.47, "grad_norm": 49.77263247276722, "learning_rate": 3.14890137195437e-07, "logits/chosen": -7.062252998352051, "logits/rejected": -7.757845878601074, "logps/chosen": -499.0604553222656, "logps/rejected": -631.4205322265625, "loss": 0.35, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.841132640838623, "rewards/margins": 1.6729885339736938, "rewards/rejected": -4.514122009277344, "step": 180 }, { "epoch": 0.5, "grad_norm": 47.32333832777734, "learning_rate": 2.9240358139084013e-07, "logits/chosen": -7.432755947113037, "logits/rejected": -8.177480697631836, "logps/chosen": -571.7742919921875, "logps/rejected": -753.3114013671875, "loss": 0.3584, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.5975048542022705, "rewards/margins": 2.1865601539611816, "rewards/rejected": -5.784065246582031, "step": 190 }, { "epoch": 0.53, "grad_norm": 47.64189353399637, "learning_rate": 2.695573704031885e-07, "logits/chosen": -6.939781188964844, "logits/rejected": -7.71111536026001, "logps/chosen": -500.54449462890625, "logps/rejected": -651.9711303710938, "loss": 0.3649, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.752312421798706, "rewards/margins": 1.9267911911010742, "rewards/rejected": -4.679104328155518, "step": 200 }, { "epoch": 0.53, "eval_logits/chosen": -7.152472496032715, "eval_logits/rejected": -7.637792110443115, "eval_logps/chosen": -878.0755615234375, "eval_logps/rejected": -1032.680908203125, "eval_loss": 1.1239182949066162, "eval_rewards/accuracies": 0.48828125, "eval_rewards/chosen": -4.876009941101074, "eval_rewards/margins": 0.26692283153533936, "eval_rewards/rejected": -5.142932891845703, "eval_runtime": 97.9349, "eval_samples_per_second": 20.422, "eval_steps_per_second": 0.327, "step": 200 }, { "epoch": 0.55, "grad_norm": 54.30113996467665, "learning_rate": 2.465452793317865e-07, "logits/chosen": -6.918679237365723, "logits/rejected": -7.81919002532959, "logps/chosen": -490.4891662597656, "logps/rejected": -645.8287353515625, "loss": 0.351, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.56154727935791, "rewards/margins": 1.9791193008422852, "rewards/rejected": -4.5406670570373535, "step": 210 }, { "epoch": 0.58, "grad_norm": 48.87773515687126, "learning_rate": 2.2356249022388789e-07, "logits/chosen": -7.1236982345581055, "logits/rejected": -8.037015914916992, "logps/chosen": -493.3589782714844, "logps/rejected": -644.3180541992188, "loss": 0.3503, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.7893471717834473, "rewards/margins": 2.016085386276245, "rewards/rejected": -4.805432319641113, "step": 220 }, { "epoch": 0.61, "grad_norm": 42.654084351174774, "learning_rate": 2.0080393659578038e-07, "logits/chosen": -7.341279029846191, "logits/rejected": -8.349458694458008, "logps/chosen": -546.9940795898438, "logps/rejected": -735.5299072265625, "loss": 0.3336, "rewards/accuracies": 0.875, "rewards/chosen": -3.2596652507781982, "rewards/margins": 2.3632309436798096, "rewards/rejected": -5.622895240783691, "step": 230 }, { "epoch": 0.63, "grad_norm": 48.836991831948275, "learning_rate": 1.7846265006183976e-07, "logits/chosen": -7.382364749908447, "logits/rejected": -8.096385955810547, "logps/chosen": -559.1055908203125, "logps/rejected": -690.3636474609375, "loss": 0.3239, "rewards/accuracies": 0.78125, "rewards/chosen": -3.478092670440674, "rewards/margins": 1.7500022649765015, "rewards/rejected": -5.228094577789307, "step": 240 }, { "epoch": 0.66, "grad_norm": 41.88272757196894, "learning_rate": 1.5672812309497722e-07, "logits/chosen": -7.3642730712890625, "logits/rejected": -8.137662887573242, "logps/chosen": -481.7484436035156, "logps/rejected": -658.2447509765625, "loss": 0.3435, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.742130756378174, "rewards/margins": 2.204786777496338, "rewards/rejected": -4.946917533874512, "step": 250 }, { "epoch": 0.69, "grad_norm": 42.22161557256122, "learning_rate": 1.357847018050843e-07, "logits/chosen": -7.285035133361816, "logits/rejected": -8.04423999786377, "logps/chosen": -495.21929931640625, "logps/rejected": -637.4512939453125, "loss": 0.3575, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5677530765533447, "rewards/margins": 1.859423041343689, "rewards/rejected": -4.427175521850586, "step": 260 }, { "epoch": 0.71, "grad_norm": 36.62148112990462, "learning_rate": 1.1581002236747328e-07, "logits/chosen": -7.350560188293457, "logits/rejected": -8.334383964538574, "logps/chosen": -470.9756774902344, "logps/rejected": -644.9085083007812, "loss": 0.3419, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.7616236209869385, "rewards/margins": 2.1581368446350098, "rewards/rejected": -4.919760704040527, "step": 270 }, { "epoch": 0.74, "grad_norm": 42.46981787888043, "learning_rate": 9.697350436308427e-08, "logits/chosen": -7.353733062744141, "logits/rejected": -8.123218536376953, "logps/chosen": -547.2511596679688, "logps/rejected": -669.4581298828125, "loss": 0.3277, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1215298175811768, "rewards/margins": 1.7408149242401123, "rewards/rejected": -4.862344264984131, "step": 280 }, { "epoch": 0.76, "grad_norm": 45.06982020026593, "learning_rate": 7.943491380952188e-08, "logits/chosen": -7.6978254318237305, "logits/rejected": -8.468889236450195, "logps/chosen": -488.8667907714844, "logps/rejected": -652.3041381835938, "loss": 0.3357, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.835703134536743, "rewards/margins": 2.056410789489746, "rewards/rejected": -4.89211368560791, "step": 290 }, { "epoch": 0.79, "grad_norm": 50.46942762667779, "learning_rate": 6.334300807088508e-08, "logits/chosen": -7.3240485191345215, "logits/rejected": -8.329621315002441, "logps/chosen": -509.5638122558594, "logps/rejected": -673.7567749023438, "loss": 0.3506, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0604350566864014, "rewards/margins": 2.1173126697540283, "rewards/rejected": -5.1777472496032715, "step": 300 }, { "epoch": 0.79, "eval_logits/chosen": -7.444587707519531, "eval_logits/rejected": -7.9900360107421875, "eval_logps/chosen": -951.4573974609375, "eval_logps/rejected": -1114.78076171875, "eval_loss": 1.1842519044876099, "eval_rewards/accuracies": 0.51171875, "eval_rewards/chosen": -5.609828948974609, "eval_rewards/margins": 0.35410135984420776, "eval_rewards/rejected": -5.963930606842041, "eval_runtime": 97.9073, "eval_samples_per_second": 20.427, "eval_steps_per_second": 0.327, "step": 300 }, { "epoch": 0.82, "grad_norm": 45.51913834484837, "learning_rate": 4.8834274139883084e-08, "logits/chosen": -7.380696773529053, "logits/rejected": -8.350247383117676, "logps/chosen": -504.37054443359375, "logps/rejected": -678.6265869140625, "loss": 0.3248, "rewards/accuracies": 0.84375, "rewards/chosen": -2.946779727935791, "rewards/margins": 2.2168285846710205, "rewards/rejected": -5.163609027862549, "step": 310 }, { "epoch": 0.84, "grad_norm": 53.031113784661194, "learning_rate": 3.60317709937693e-08, "logits/chosen": -7.483295440673828, "logits/rejected": -8.33633804321289, "logps/chosen": -519.6754150390625, "logps/rejected": -667.0064697265625, "loss": 0.334, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.902660369873047, "rewards/margins": 2.0172414779663086, "rewards/rejected": -4.919901371002197, "step": 320 }, { "epoch": 0.87, "grad_norm": 41.68995945520798, "learning_rate": 2.5044085842905683e-08, "logits/chosen": -7.596086025238037, "logits/rejected": -8.42108154296875, "logps/chosen": -542.1593627929688, "logps/rejected": -761.1915893554688, "loss": 0.3439, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.237351894378662, "rewards/margins": 2.630371332168579, "rewards/rejected": -5.867722988128662, "step": 330 }, { "epoch": 0.9, "grad_norm": 44.38600213815557, "learning_rate": 1.5964413124758493e-08, "logits/chosen": -7.379315376281738, "logits/rejected": -8.234747886657715, "logps/chosen": -516.7742919921875, "logps/rejected": -690.0197143554688, "loss": 0.3394, "rewards/accuracies": 0.875, "rewards/chosen": -2.9615204334259033, "rewards/margins": 2.160275936126709, "rewards/rejected": -5.121796607971191, "step": 340 }, { "epoch": 0.92, "grad_norm": 49.890826040889124, "learning_rate": 8.869764055041501e-09, "logits/chosen": -7.525488376617432, "logits/rejected": -8.198974609375, "logps/chosen": -535.8231201171875, "logps/rejected": -737.1505737304688, "loss": 0.3403, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1437525749206543, "rewards/margins": 2.3052563667297363, "rewards/rejected": -5.449008941650391, "step": 350 }, { "epoch": 0.95, "grad_norm": 41.28866479778979, "learning_rate": 3.82031344036729e-09, "logits/chosen": -7.295458793640137, "logits/rejected": -8.147361755371094, "logps/chosen": -541.268798828125, "logps/rejected": -692.4953002929688, "loss": 0.3191, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.197694778442383, "rewards/margins": 1.9286365509033203, "rewards/rejected": -5.126331329345703, "step": 360 }, { "epoch": 0.97, "grad_norm": 56.54843526627352, "learning_rate": 8.588892925590063e-10, "logits/chosen": -7.414445400238037, "logits/rejected": -8.447690963745117, "logps/chosen": -499.84197998046875, "logps/rejected": -670.2615966796875, "loss": 0.3342, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.8063206672668457, "rewards/margins": 2.2892918586730957, "rewards/rejected": -5.0956130027771, "step": 370 }, { "epoch": 1.0, "step": 379, "total_flos": 0.0, "train_loss": 0.39733357479830217, "train_runtime": 5845.1464, "train_samples_per_second": 8.303, "train_steps_per_second": 0.065 } ], "logging_steps": 10, "max_steps": 379, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }