{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006279434850863423, "grad_norm": 5.557445139643298, "learning_rate": 3.125e-08, "logits/chosen": 0.18015038967132568, "logits/rejected": 0.2519298493862152, "logps/chosen": -297.10906982421875, "logps/pi_response": -130.58929443359375, "logps/ref_response": -130.58929443359375, "logps/rejected": -316.44769287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06279434850863422, "grad_norm": 5.880023460230945, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.16638809442520142, "logits/rejected": 0.3159521222114563, "logps/chosen": -243.77159118652344, "logps/pi_response": -120.18633270263672, "logps/ref_response": -120.15902709960938, "logps/rejected": -281.09716796875, "loss": 0.6928, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.0007160517852753401, "rewards/margins": 0.00040519109461456537, "rewards/rejected": -0.0011212429963052273, "step": 10 }, { "epoch": 0.12558869701726844, "grad_norm": 6.011032264913819, "learning_rate": 4.990353313429303e-07, "logits/chosen": 0.1316775530576706, "logits/rejected": 0.32217010855674744, "logps/chosen": -244.0759735107422, "logps/pi_response": -121.6043701171875, "logps/ref_response": -121.85536193847656, "logps/rejected": -266.6847229003906, "loss": 0.6884, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.011644470505416393, "rewards/margins": 0.011915634386241436, "rewards/rejected": -0.02356010302901268, "step": 20 }, { "epoch": 0.18838304552590268, "grad_norm": 6.296068063682766, "learning_rate": 4.882681251368548e-07, "logits/chosen": 0.17213600873947144, "logits/rejected": 0.3042981028556824, "logps/chosen": -244.4438018798828, "logps/pi_response": -109.73341369628906, "logps/ref_response": -110.8894271850586, "logps/rejected": -290.1441650390625, "loss": 0.6685, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.043184880167245865, "rewards/margins": 0.054663728922605515, "rewards/rejected": -0.09784860908985138, "step": 30 }, { "epoch": 0.25117739403453687, "grad_norm": 5.762840753548806, "learning_rate": 4.6604720940421207e-07, "logits/chosen": 0.2107941210269928, "logits/rejected": 0.39838385581970215, "logps/chosen": -287.46002197265625, "logps/pi_response": -125.36665344238281, "logps/ref_response": -129.86325073242188, "logps/rejected": -316.40423583984375, "loss": 0.6349, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08985555917024612, "rewards/margins": 0.1335289627313614, "rewards/rejected": -0.22338449954986572, "step": 40 }, { "epoch": 0.3139717425431711, "grad_norm": 5.876442100853928, "learning_rate": 4.3344075855595097e-07, "logits/chosen": 0.3732234835624695, "logits/rejected": 0.5105798840522766, "logps/chosen": -247.15914916992188, "logps/pi_response": -109.07597351074219, "logps/ref_response": -116.5090560913086, "logps/rejected": -310.7102966308594, "loss": 0.606, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11287806183099747, "rewards/margins": 0.24187734723091125, "rewards/rejected": -0.3547554314136505, "step": 50 }, { "epoch": 0.37676609105180536, "grad_norm": 9.315356312481537, "learning_rate": 3.920161866827889e-07, "logits/chosen": 0.5162631273269653, "logits/rejected": 0.6798213124275208, "logps/chosen": -268.0201721191406, "logps/pi_response": -116.46971130371094, "logps/ref_response": -119.4989242553711, "logps/rejected": -347.75079345703125, "loss": 0.5814, "rewards/accuracies": 0.75, "rewards/chosen": -0.20100387930870056, "rewards/margins": 0.41021671891212463, "rewards/rejected": -0.6112205386161804, "step": 60 }, { "epoch": 0.43956043956043955, "grad_norm": 7.144146562974543, "learning_rate": 3.4376480090239047e-07, "logits/chosen": 0.6009117960929871, "logits/rejected": 0.767359733581543, "logps/chosen": -236.7908172607422, "logps/pi_response": -114.82955169677734, "logps/ref_response": -116.70068359375, "logps/rejected": -368.27850341796875, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2643741965293884, "rewards/margins": 0.5534776449203491, "rewards/rejected": -0.8178518414497375, "step": 70 }, { "epoch": 0.5023547880690737, "grad_norm": 7.105190568030611, "learning_rate": 2.910060778827554e-07, "logits/chosen": 0.47289925813674927, "logits/rejected": 0.8142975568771362, "logps/chosen": -325.69622802734375, "logps/pi_response": -129.21812438964844, "logps/ref_response": -127.53900146484375, "logps/rejected": -348.5531311035156, "loss": 0.5849, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3693317174911499, "rewards/margins": 0.37722334265708923, "rewards/rejected": -0.7465550303459167, "step": 80 }, { "epoch": 0.565149136577708, "grad_norm": 6.295924145315453, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 0.43639689683914185, "logits/rejected": 0.8138043284416199, "logps/chosen": -301.4584045410156, "logps/pi_response": -135.66107177734375, "logps/ref_response": -129.38760375976562, "logps/rejected": -421.0269470214844, "loss": 0.5432, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.29649025201797485, "rewards/margins": 0.6172370910644531, "rewards/rejected": -0.9137271642684937, "step": 90 }, { "epoch": 0.6279434850863422, "grad_norm": 7.528550263483296, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 0.6732058525085449, "logits/rejected": 0.8459190130233765, "logps/chosen": -283.48883056640625, "logps/pi_response": -121.66845703125, "logps/ref_response": -114.0061264038086, "logps/rejected": -304.6356201171875, "loss": 0.5426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30830463767051697, "rewards/margins": 0.3349376320838928, "rewards/rejected": -0.6432422995567322, "step": 100 }, { "epoch": 0.6907378335949764, "grad_norm": 7.109656663434607, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 0.6394161581993103, "logits/rejected": 0.9435567855834961, "logps/chosen": -298.015869140625, "logps/pi_response": -136.70449829101562, "logps/ref_response": -125.8144760131836, "logps/rejected": -383.9209899902344, "loss": 0.5314, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.33838868141174316, "rewards/margins": 0.5101521015167236, "rewards/rejected": -0.8485407829284668, "step": 110 }, { "epoch": 0.7535321821036107, "grad_norm": 7.158165163184529, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.5716279745101929, "logits/rejected": 0.8841035962104797, "logps/chosen": -320.6312255859375, "logps/pi_response": -131.89306640625, "logps/ref_response": -120.58707427978516, "logps/rejected": -369.5261535644531, "loss": 0.5541, "rewards/accuracies": 0.75, "rewards/chosen": -0.4013099670410156, "rewards/margins": 0.5214625597000122, "rewards/rejected": -0.9227724075317383, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 6.942114883986432, "learning_rate": 4.904486005914027e-08, "logits/chosen": 0.5996646881103516, "logits/rejected": 0.8270283937454224, "logps/chosen": -277.0096130371094, "logps/pi_response": -136.15554809570312, "logps/ref_response": -123.1449966430664, "logps/rejected": -372.5554504394531, "loss": 0.5271, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3302404284477234, "rewards/margins": 0.5928428173065186, "rewards/rejected": -0.9230831861495972, "step": 130 }, { "epoch": 0.8791208791208791, "grad_norm": 7.301927243964412, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 0.5675501823425293, "logits/rejected": 0.8693594932556152, "logps/chosen": -284.0587158203125, "logps/pi_response": -131.0335235595703, "logps/ref_response": -121.63087463378906, "logps/rejected": -404.29278564453125, "loss": 0.53, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3401293456554413, "rewards/margins": 0.748315155506134, "rewards/rejected": -1.088444471359253, "step": 140 }, { "epoch": 0.9419152276295133, "grad_norm": 8.46328336009499, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 0.5060345530509949, "logits/rejected": 0.8951950073242188, "logps/chosen": -317.99163818359375, "logps/pi_response": -145.62229919433594, "logps/ref_response": -132.86119079589844, "logps/rejected": -398.7874755859375, "loss": 0.5193, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.39088305830955505, "rewards/margins": 0.5848211050033569, "rewards/rejected": -0.9757040739059448, "step": 150 }, { "epoch": 0.9984301412872841, "step": 159, "total_flos": 0.0, "train_loss": 0.5789602717513558, "train_runtime": 4365.2801, "train_samples_per_second": 4.668, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }