{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 422.289956912934, "learning_rate": 1.25e-09, "logits/chosen": -4.623842239379883, "logits/rejected": -4.85917854309082, "logps/chosen": -239.31422424316406, "logps/rejected": -207.56365966796875, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 397.335507450448, "learning_rate": 1.25e-08, "logits/chosen": -4.334493160247803, "logits/rejected": -4.643917560577393, "logps/chosen": -265.1265563964844, "logps/rejected": -215.76223754882812, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.0019649432506412268, "rewards/margins": 0.0034448718652129173, "rewards/rejected": -0.005409814417362213, "step": 10 }, { "epoch": 0.05, "grad_norm": 411.8624893441942, "learning_rate": 2.5e-08, "logits/chosen": -4.506826400756836, "logits/rejected": -4.740732192993164, "logps/chosen": -267.86932373046875, "logps/rejected": -216.64578247070312, "loss": 0.6921, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.007061410695314407, "rewards/margins": 0.010701502673327923, "rewards/rejected": -0.003640091512352228, "step": 20 }, { "epoch": 0.08, "grad_norm": 426.4548766919431, "learning_rate": 3.75e-08, "logits/chosen": -4.585576057434082, "logits/rejected": -4.762608528137207, "logps/chosen": -258.25, "logps/rejected": -214.71231079101562, "loss": 0.6756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.028281379491090775, "rewards/margins": 0.04623327776789665, "rewards/rejected": -0.017951902002096176, "step": 30 }, { "epoch": 0.1, "grad_norm": 368.66232685986097, "learning_rate": 5e-08, "logits/chosen": -4.62213659286499, "logits/rejected": -4.706842422485352, "logps/chosen": -252.6122283935547, "logps/rejected": -220.41427612304688, "loss": 0.6177, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0986957773566246, "rewards/margins": 0.1730131059885025, "rewards/rejected": -0.07431730628013611, "step": 40 }, { "epoch": 0.13, "grad_norm": 294.81806277707193, "learning_rate": 4.989992961303737e-08, "logits/chosen": -4.516692161560059, "logits/rejected": -4.714283466339111, "logps/chosen": -269.63470458984375, "logps/rejected": -227.5962371826172, "loss": 0.5368, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.2334650307893753, "rewards/margins": 0.4059367775917053, "rewards/rejected": -0.17247170209884644, "step": 50 }, { "epoch": 0.15, "grad_norm": 263.2732793061953, "learning_rate": 4.960051957873725e-08, "logits/chosen": -4.610293388366699, "logits/rejected": -4.734058856964111, "logps/chosen": -237.3783721923828, "logps/rejected": -218.6456298828125, "loss": 0.4381, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.3744103014469147, "rewards/margins": 0.7368327975273132, "rewards/rejected": -0.3624224364757538, "step": 60 }, { "epoch": 0.18, "grad_norm": 243.25478495437125, "learning_rate": 4.910416686333906e-08, "logits/chosen": -4.496267795562744, "logits/rejected": -4.724743843078613, "logps/chosen": -248.19540405273438, "logps/rejected": -222.69140625, "loss": 0.4014, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.49465441703796387, "rewards/margins": 0.9875491261482239, "rewards/rejected": -0.49289458990097046, "step": 70 }, { "epoch": 0.2, "grad_norm": 203.40516001004536, "learning_rate": 4.841484508350678e-08, "logits/chosen": -4.488529682159424, "logits/rejected": -4.711686611175537, "logps/chosen": -254.6974639892578, "logps/rejected": -235.7099151611328, "loss": 0.3429, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.652252197265625, "rewards/margins": 1.1913691759109497, "rewards/rejected": -0.5391170978546143, "step": 80 }, { "epoch": 0.23, "grad_norm": 237.79736326938396, "learning_rate": 4.7538072695020406e-08, "logits/chosen": -4.58192777633667, "logits/rejected": -4.797459602355957, "logps/chosen": -243.06143188476562, "logps/rejected": -210.63308715820312, "loss": 0.3085, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.8151354789733887, "rewards/margins": 1.595442533493042, "rewards/rejected": -0.7803069353103638, "step": 90 }, { "epoch": 0.26, "grad_norm": 222.07031689896021, "learning_rate": 4.6480868814055416e-08, "logits/chosen": -4.459914207458496, "logits/rejected": -4.747165679931641, "logps/chosen": -247.2704620361328, "logps/rejected": -236.6487274169922, "loss": 0.2984, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.9447771906852722, "rewards/margins": 1.8987438678741455, "rewards/rejected": -0.953966498374939, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -4.55070686340332, "eval_logits/rejected": -4.762002944946289, "eval_logps/chosen": -390.7516174316406, "eval_logps/rejected": -515.337158203125, "eval_loss": 0.9402573704719543, "eval_rewards/accuracies": 0.40625, "eval_rewards/chosen": -0.027714576572179794, "eval_rewards/margins": -0.3327641487121582, "eval_rewards/rejected": 0.3050495684146881, "eval_runtime": 97.8238, "eval_samples_per_second": 20.445, "eval_steps_per_second": 0.327, "step": 100 }, { "epoch": 0.28, "grad_norm": 179.38114597248955, "learning_rate": 4.525169702472916e-08, "logits/chosen": -4.51773738861084, "logits/rejected": -4.777291297912598, "logps/chosen": -238.6410369873047, "logps/rejected": -227.3874053955078, "loss": 0.2711, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.0544707775115967, "rewards/margins": 2.1246237754821777, "rewards/rejected": -1.070152997970581, "step": 110 }, { "epoch": 0.31, "grad_norm": 163.17108435846185, "learning_rate": 4.386039762276975e-08, "logits/chosen": -4.462746620178223, "logits/rejected": -4.7056145668029785, "logps/chosen": -262.86920166015625, "logps/rejected": -238.99801635742188, "loss": 0.267, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3120397329330444, "rewards/margins": 2.3509771823883057, "rewards/rejected": -1.0389372110366821, "step": 120 }, { "epoch": 0.33, "grad_norm": 229.70994586547334, "learning_rate": 4.231810883773999e-08, "logits/chosen": -4.532160758972168, "logits/rejected": -4.803128719329834, "logps/chosen": -246.3385009765625, "logps/rejected": -227.83792114257812, "loss": 0.2577, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2105365991592407, "rewards/margins": 2.461458921432495, "rewards/rejected": -1.2509223222732544, "step": 130 }, { "epoch": 0.36, "grad_norm": 152.87001146329087, "learning_rate": 4.063717766448194e-08, "logits/chosen": -4.559675216674805, "logits/rejected": -4.842075824737549, "logps/chosen": -272.3346252441406, "logps/rejected": -246.54464721679688, "loss": 0.2453, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1860424280166626, "rewards/margins": 2.6231608390808105, "rewards/rejected": -1.4371181726455688, "step": 140 }, { "epoch": 0.38, "grad_norm": 176.7311824941399, "learning_rate": 3.8831061017632845e-08, "logits/chosen": -4.557957172393799, "logits/rejected": -4.812293529510498, "logps/chosen": -239.32144165039062, "logps/rejected": -232.82479858398438, "loss": 0.2425, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3096827268600464, "rewards/margins": 2.680234432220459, "rewards/rejected": -1.3705517053604126, "step": 150 }, { "epoch": 0.41, "grad_norm": 179.5862213559593, "learning_rate": 3.691421800053269e-08, "logits/chosen": -4.614952564239502, "logits/rejected": -4.799678802490234, "logps/chosen": -238.4506378173828, "logps/rejected": -229.4785614013672, "loss": 0.2216, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.300586462020874, "rewards/margins": 2.687537908554077, "rewards/rejected": -1.3869514465332031, "step": 160 }, { "epoch": 0.43, "grad_norm": 160.70849517962517, "learning_rate": 3.490199415097892e-08, "logits/chosen": -4.499081611633301, "logits/rejected": -4.779529571533203, "logps/chosen": -244.9915771484375, "logps/rejected": -235.46743774414062, "loss": 0.2108, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6082652807235718, "rewards/margins": 3.174710988998413, "rewards/rejected": -1.5664453506469727, "step": 170 }, { "epoch": 0.46, "grad_norm": 149.11333079529007, "learning_rate": 3.2810498590513937e-08, "logits/chosen": -4.59390926361084, "logits/rejected": -4.832152366638184, "logps/chosen": -222.95986938476562, "logps/rejected": -211.53585815429688, "loss": 0.2274, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3903782367706299, "rewards/margins": 2.9419426918029785, "rewards/rejected": -1.5515644550323486, "step": 180 }, { "epoch": 0.49, "grad_norm": 171.77003361632143, "learning_rate": 3.065647506074306e-08, "logits/chosen": -4.56182861328125, "logits/rejected": -4.7075724601745605, "logps/chosen": -245.95556640625, "logps/rejected": -247.3394012451172, "loss": 0.2299, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.4937294721603394, "rewards/margins": 2.711061477661133, "rewards/rejected": -1.2173320055007935, "step": 190 }, { "epoch": 0.51, "grad_norm": 152.3542212939215, "learning_rate": 2.8457167879118325e-08, "logits/chosen": -4.556639194488525, "logits/rejected": -4.735670566558838, "logps/chosen": -237.8849639892578, "logps/rejected": -229.5240020751953, "loss": 0.2338, "rewards/accuracies": 0.875, "rewards/chosen": 1.4254279136657715, "rewards/margins": 2.843172073364258, "rewards/rejected": -1.4177442789077759, "step": 200 }, { "epoch": 0.51, "eval_logits/chosen": -4.538640975952148, "eval_logits/rejected": -4.758352279663086, "eval_logps/chosen": -391.07916259765625, "eval_logps/rejected": -514.2457885742188, "eval_loss": 1.0996507406234741, "eval_rewards/accuracies": 0.38671875, "eval_rewards/chosen": -0.06046929210424423, "eval_rewards/margins": -0.4746614694595337, "eval_rewards/rejected": 0.41419219970703125, "eval_runtime": 98.0841, "eval_samples_per_second": 20.391, "eval_steps_per_second": 0.326, "step": 200 }, { "epoch": 0.54, "grad_norm": 139.65082338502864, "learning_rate": 2.6230183887296952e-08, "logits/chosen": -4.619709014892578, "logits/rejected": -4.859663963317871, "logps/chosen": -249.32113647460938, "logps/rejected": -246.33468627929688, "loss": 0.1967, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.6563961505889893, "rewards/margins": 3.425267457962036, "rewards/rejected": -1.768871545791626, "step": 210 }, { "epoch": 0.56, "grad_norm": 147.42119588032455, "learning_rate": 2.3993351497264626e-08, "logits/chosen": -4.466108798980713, "logits/rejected": -4.793113708496094, "logps/chosen": -247.28756713867188, "logps/rejected": -244.1797637939453, "loss": 0.2159, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.5523773431777954, "rewards/margins": 3.5030410289764404, "rewards/rejected": -1.9506635665893555, "step": 220 }, { "epoch": 0.59, "grad_norm": 158.88183877851495, "learning_rate": 2.1764577963648613e-08, "logits/chosen": -4.541924476623535, "logits/rejected": -4.858447074890137, "logps/chosen": -245.6726531982422, "logps/rejected": -243.28677368164062, "loss": 0.2197, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.5296446084976196, "rewards/margins": 3.487210750579834, "rewards/rejected": -1.957566499710083, "step": 230 }, { "epoch": 0.61, "grad_norm": 157.203527489415, "learning_rate": 1.9561706024845818e-08, "logits/chosen": -4.5143561363220215, "logits/rejected": -4.772491455078125, "logps/chosen": -256.7393798828125, "logps/rejected": -240.91226196289062, "loss": 0.2123, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6083800792694092, "rewards/margins": 3.60624623298645, "rewards/rejected": -1.9978656768798828, "step": 240 }, { "epoch": 0.64, "grad_norm": 159.9556615524972, "learning_rate": 1.740237106064383e-08, "logits/chosen": -4.641883850097656, "logits/rejected": -4.893360614776611, "logps/chosen": -234.55264282226562, "logps/rejected": -224.25631713867188, "loss": 0.2024, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.414884090423584, "rewards/margins": 3.079598903656006, "rewards/rejected": -1.6647151708602905, "step": 250 }, { "epoch": 0.66, "grad_norm": 198.60636587673295, "learning_rate": 1.530385990987863e-08, "logits/chosen": -4.517378330230713, "logits/rejected": -4.799233913421631, "logps/chosen": -238.4044189453125, "logps/rejected": -242.4611358642578, "loss": 0.2025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5644124746322632, "rewards/margins": 3.8356785774230957, "rewards/rejected": -2.271266460418701, "step": 260 }, { "epoch": 0.69, "grad_norm": 186.10967020286805, "learning_rate": 1.3282972478382409e-08, "logits/chosen": -4.603947639465332, "logits/rejected": -4.826247215270996, "logps/chosen": -245.811279296875, "logps/rejected": -236.3955841064453, "loss": 0.2127, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.5221991539001465, "rewards/margins": 3.6592516899108887, "rewards/rejected": -2.137052536010742, "step": 270 }, { "epoch": 0.72, "grad_norm": 144.59147008918274, "learning_rate": 1.1355887245137383e-08, "logits/chosen": -4.557550430297852, "logits/rejected": -4.780216217041016, "logps/chosen": -248.1402587890625, "logps/rejected": -262.3576354980469, "loss": 0.1884, "rewards/accuracies": 0.96875, "rewards/chosen": 1.636661171913147, "rewards/margins": 3.578484296798706, "rewards/rejected": -1.9418232440948486, "step": 280 }, { "epoch": 0.74, "grad_norm": 331.8489693457681, "learning_rate": 9.538031743343628e-09, "logits/chosen": -4.426544666290283, "logits/rejected": -4.761611461639404, "logps/chosen": -255.4153289794922, "logps/rejected": -234.84487915039062, "loss": 0.1966, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.7193052768707275, "rewards/margins": 3.6408512592315674, "rewards/rejected": -1.9215457439422607, "step": 290 }, { "epoch": 0.77, "grad_norm": 177.15049850318, "learning_rate": 7.843959053281662e-09, "logits/chosen": -4.541173934936523, "logits/rejected": -4.741909503936768, "logps/chosen": -236.41854858398438, "logps/rejected": -236.483642578125, "loss": 0.2158, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.6989631652832031, "rewards/margins": 3.7507786750793457, "rewards/rejected": -2.0518155097961426, "step": 300 }, { "epoch": 0.77, "eval_logits/chosen": -4.539734840393066, "eval_logits/rejected": -4.759631633758545, "eval_logps/chosen": -392.2895812988281, "eval_logps/rejected": -515.406494140625, "eval_loss": 1.1359957456588745, "eval_rewards/accuracies": 0.41015625, "eval_rewards/chosen": -0.18151262402534485, "eval_rewards/margins": -0.47963014245033264, "eval_rewards/rejected": 0.2981175184249878, "eval_runtime": 97.905, "eval_samples_per_second": 20.428, "eval_steps_per_second": 0.327, "step": 300 }, { "epoch": 0.79, "grad_norm": 159.8961908197972, "learning_rate": 6.28723129572247e-09, "logits/chosen": -4.461672782897949, "logits/rejected": -4.776505470275879, "logps/chosen": -244.0063934326172, "logps/rejected": -244.4510955810547, "loss": 0.2028, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.7133222818374634, "rewards/margins": 3.9386374950408936, "rewards/rejected": -2.2253153324127197, "step": 310 }, { "epoch": 0.82, "grad_norm": 185.55678167306448, "learning_rate": 4.880311058593617e-09, "logits/chosen": -4.521292209625244, "logits/rejected": -4.848372936248779, "logps/chosen": -230.04397583007812, "logps/rejected": -226.9331817626953, "loss": 0.2095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5120347738265991, "rewards/margins": 3.560204267501831, "rewards/rejected": -2.0481698513031006, "step": 320 }, { "epoch": 0.84, "grad_norm": 155.26912676521727, "learning_rate": 3.6344616260994942e-09, "logits/chosen": -4.473151683807373, "logits/rejected": -4.766911029815674, "logps/chosen": -251.7646484375, "logps/rejected": -254.7379913330078, "loss": 0.1928, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.638421654701233, "rewards/margins": 3.8422539234161377, "rewards/rejected": -2.2038321495056152, "step": 330 }, { "epoch": 0.87, "grad_norm": 151.70242269299357, "learning_rate": 2.5596568090246547e-09, "logits/chosen": -4.4894232749938965, "logits/rejected": -4.816695213317871, "logps/chosen": -255.334716796875, "logps/rejected": -228.4678192138672, "loss": 0.1963, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.8811309337615967, "rewards/margins": 4.049709320068359, "rewards/rejected": -2.1685783863067627, "step": 340 }, { "epoch": 0.9, "grad_norm": 150.47860807724385, "learning_rate": 1.6645010980854079e-09, "logits/chosen": -4.505433559417725, "logits/rejected": -4.675290584564209, "logps/chosen": -240.54714965820312, "logps/rejected": -246.03665161132812, "loss": 0.2168, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.4924026727676392, "rewards/margins": 3.3190674781799316, "rewards/rejected": -1.8266645669937134, "step": 350 }, { "epoch": 0.92, "grad_norm": 165.46679700251414, "learning_rate": 9.561607795526006e-10, "logits/chosen": -4.49678373336792, "logits/rejected": -4.711674690246582, "logps/chosen": -246.95388793945312, "logps/rejected": -247.2928009033203, "loss": 0.2121, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.468100905418396, "rewards/margins": 3.451129913330078, "rewards/rejected": -1.9830286502838135, "step": 360 }, { "epoch": 0.95, "grad_norm": 149.25359071163066, "learning_rate": 4.403065646083809e-10, "logits/chosen": -4.518364906311035, "logits/rejected": -4.680220603942871, "logps/chosen": -241.2300262451172, "logps/rejected": -255.6038818359375, "loss": 0.2103, "rewards/accuracies": 0.875, "rewards/chosen": 1.5591168403625488, "rewards/margins": 3.434800386428833, "rewards/rejected": -1.8756835460662842, "step": 370 }, { "epoch": 0.97, "grad_norm": 160.57572024314433, "learning_rate": 1.2106819172520434e-10, "logits/chosen": -4.618407726287842, "logits/rejected": -4.8883843421936035, "logps/chosen": -246.15853881835938, "logps/rejected": -243.1090087890625, "loss": 0.2034, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5924150943756104, "rewards/margins": 3.8508572578430176, "rewards/rejected": -2.2584421634674072, "step": 380 }, { "epoch": 1.0, "grad_norm": 235.40189038757265, "learning_rate": 1.0013655036916758e-12, "logits/chosen": -4.611303329467773, "logits/rejected": -4.857443809509277, "logps/chosen": -242.1800994873047, "logps/rejected": -241.1270294189453, "loss": 0.199, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.7492481470108032, "rewards/margins": 3.8104407787323, "rewards/rejected": -2.0611929893493652, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 0.0, "train_loss": 0.29024992444935965, "train_runtime": 6148.7126, "train_samples_per_second": 8.132, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }