{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006279434850863423, "grad_norm": 10.430633053065636, "learning_rate": 1.875e-08, "logits/chosen": 1.9384945631027222, "logits/rejected": 1.9276118278503418, "logps/chosen": -271.40283203125, "logps/pi_response": -164.53562927246094, "logps/ref_response": -164.53562927246094, "logps/rejected": -331.295166015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06279434850863422, "grad_norm": 12.398250917152115, "learning_rate": 1.875e-07, "logits/chosen": 1.375727891921997, "logits/rejected": 1.5644251108169556, "logps/chosen": -274.7219543457031, "logps/pi_response": -147.36016845703125, "logps/ref_response": -147.31085205078125, "logps/rejected": -435.1158142089844, "loss": 0.6938, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.0004548828292172402, "rewards/margins": -0.00019649550085887313, "rewards/rejected": -0.0002583875320851803, "step": 10 }, { "epoch": 0.12558869701726844, "grad_norm": 8.66500301618874, "learning_rate": 2.9942119880575817e-07, "logits/chosen": 1.4651533365249634, "logits/rejected": 1.655652403831482, "logps/chosen": -306.84954833984375, "logps/pi_response": -169.71615600585938, "logps/ref_response": -169.712646484375, "logps/rejected": -432.33612060546875, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007200081832706928, "rewards/margins": 0.005233117379248142, "rewards/rejected": -0.01243319921195507, "step": 20 }, { "epoch": 0.18838304552590268, "grad_norm": 9.072183489589987, "learning_rate": 2.929608750821129e-07, "logits/chosen": 1.4678703546524048, "logits/rejected": 1.6495119333267212, "logps/chosen": -303.84332275390625, "logps/pi_response": -163.2515106201172, "logps/ref_response": -162.7336883544922, "logps/rejected": -418.14385986328125, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": -0.04476435109972954, "rewards/margins": 0.027997547760605812, "rewards/rejected": -0.0727619007229805, "step": 30 }, { "epoch": 0.25117739403453687, "grad_norm": 7.9558717739641445, "learning_rate": 2.7962832564252725e-07, "logits/chosen": 1.2683569192886353, "logits/rejected": 1.6301815509796143, "logps/chosen": -296.2575988769531, "logps/pi_response": -164.4340362548828, "logps/ref_response": -164.83663940429688, "logps/rejected": -465.8512268066406, "loss": 0.6494, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.08111439645290375, "rewards/margins": 0.10455696284770966, "rewards/rejected": -0.1856713443994522, "step": 40 }, { "epoch": 0.3139717425431711, "grad_norm": 9.575559868471615, "learning_rate": 2.6006445513357056e-07, "logits/chosen": 1.1666990518569946, "logits/rejected": 1.442095398902893, "logps/chosen": -279.1966857910156, "logps/pi_response": -159.5543670654297, "logps/ref_response": -159.3308563232422, "logps/rejected": -451.89306640625, "loss": 0.6245, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1434028297662735, "rewards/margins": 0.18818414211273193, "rewards/rejected": -0.33158695697784424, "step": 50 }, { "epoch": 0.37676609105180536, "grad_norm": 7.235393720583013, "learning_rate": 2.3520971200967334e-07, "logits/chosen": 1.2313123941421509, "logits/rejected": 1.3668440580368042, "logps/chosen": -317.02679443359375, "logps/pi_response": -161.31985473632812, "logps/ref_response": -161.15023803710938, "logps/rejected": -474.43475341796875, "loss": 0.6077, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21095111966133118, "rewards/margins": 0.2169308215379715, "rewards/rejected": -0.4278818964958191, "step": 60 }, { "epoch": 0.43956043956043955, "grad_norm": 7.493835961341088, "learning_rate": 2.0625888054143427e-07, "logits/chosen": 1.0015811920166016, "logits/rejected": 1.1100066900253296, "logps/chosen": -305.7568359375, "logps/pi_response": -157.75637817382812, "logps/ref_response": -160.76815795898438, "logps/rejected": -470.54522705078125, "loss": 0.6033, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1870805323123932, "rewards/margins": 0.23715010285377502, "rewards/rejected": -0.42423057556152344, "step": 70 }, { "epoch": 0.5023547880690737, "grad_norm": 6.931068106744734, "learning_rate": 1.7460364672965327e-07, "logits/chosen": 0.8727623224258423, "logits/rejected": 1.101215124130249, "logps/chosen": -268.4558410644531, "logps/pi_response": -142.99215698242188, "logps/ref_response": -145.04220581054688, "logps/rejected": -471.4241638183594, "loss": 0.5909, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.18771064281463623, "rewards/margins": 0.33435240387916565, "rewards/rejected": -0.5220630764961243, "step": 80 }, { "epoch": 0.565149136577708, "grad_norm": 7.075043680291521, "learning_rate": 1.4176569902035086e-07, "logits/chosen": 0.7811240553855896, "logits/rejected": 1.0503191947937012, "logps/chosen": -308.6497497558594, "logps/pi_response": -152.72732543945312, "logps/ref_response": -156.76785278320312, "logps/rejected": -511.48516845703125, "loss": 0.5858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2674116790294647, "rewards/margins": 0.39314374327659607, "rewards/rejected": -0.6605554819107056, "step": 90 }, { "epoch": 0.6279434850863422, "grad_norm": 7.416871526087097, "learning_rate": 1.0932357971453743e-07, "logits/chosen": 0.6527765393257141, "logits/rejected": 0.881365180015564, "logps/chosen": -290.30291748046875, "logps/pi_response": -137.30520629882812, "logps/ref_response": -141.33499145507812, "logps/rejected": -528.6287231445312, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": -0.2469521462917328, "rewards/margins": 0.4689961075782776, "rewards/rejected": -0.7159483432769775, "step": 100 }, { "epoch": 0.6907378335949764, "grad_norm": 6.868702818175434, "learning_rate": 7.883680337481599e-08, "logits/chosen": 0.5740480422973633, "logits/rejected": 0.858615517616272, "logps/chosen": -303.41119384765625, "logps/pi_response": -136.2499542236328, "logps/ref_response": -139.31204223632812, "logps/rejected": -475.9146423339844, "loss": 0.5688, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3227171003818512, "rewards/margins": 0.34727370738983154, "rewards/rejected": -0.6699907183647156, "step": 110 }, { "epoch": 0.7535321821036107, "grad_norm": 7.117305529201249, "learning_rate": 5.177088990820725e-08, "logits/chosen": 0.8573010563850403, "logits/rejected": 1.0616093873977661, "logps/chosen": -343.8194885253906, "logps/pi_response": -163.02926635742188, "logps/ref_response": -168.28973388671875, "logps/rejected": -572.6094970703125, "loss": 0.5672, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.34128373861312866, "rewards/margins": 0.48513007164001465, "rewards/rejected": -0.8264138102531433, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 7.6342489936056275, "learning_rate": 2.942691603548416e-08, "logits/chosen": 0.6449292302131653, "logits/rejected": 0.8830472826957703, "logps/chosen": -324.7782287597656, "logps/pi_response": -156.79000854492188, "logps/ref_response": -161.82154846191406, "logps/rejected": -524.2703857421875, "loss": 0.5844, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3194042146205902, "rewards/margins": 0.4711576998233795, "rewards/rejected": -0.7905619740486145, "step": 130 }, { "epoch": 0.8791208791208791, "grad_norm": 6.752884100895449, "learning_rate": 1.2878971655412513e-08, "logits/chosen": 0.8525398373603821, "logits/rejected": 1.0120213031768799, "logps/chosen": -330.0039978027344, "logps/pi_response": -156.17630004882812, "logps/ref_response": -161.65866088867188, "logps/rejected": -559.6900634765625, "loss": 0.5577, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2980651557445526, "rewards/margins": 0.5507915616035461, "rewards/rejected": -0.8488567471504211, "step": 140 }, { "epoch": 0.9419152276295133, "grad_norm": 7.563586487187802, "learning_rate": 2.922527618666465e-09, "logits/chosen": 0.5885689854621887, "logits/rejected": 0.828190803527832, "logps/chosen": -297.84844970703125, "logps/pi_response": -152.97872924804688, "logps/ref_response": -157.01953125, "logps/rejected": -508.4051818847656, "loss": 0.5623, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2831505537033081, "rewards/margins": 0.48666033148765564, "rewards/rejected": -0.7698109149932861, "step": 150 }, { "epoch": 0.9984301412872841, "step": 159, "total_flos": 0.0, "train_loss": 0.6040852474716475, "train_runtime": 3000.2234, "train_samples_per_second": 6.792, "train_steps_per_second": 0.053 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }