{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990817263544536, "eval_steps": 100, "global_step": 204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.3809523809523811e-07, "logits/chosen": -2.5948691368103027, "logits/rejected": -2.452101707458496, "logps/chosen": -288.6771240234375, "logps/rejected": -270.8803405761719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": -2.4060661792755127, "logits/rejected": -2.3895983695983887, "logps/chosen": -248.78944396972656, "logps/rejected": -272.0387878417969, "loss": 0.6902, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": -0.012765132822096348, "rewards/margins": 0.0059960586950182915, "rewards/rejected": -0.01876119151711464, "step": 10 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": -2.352856159210205, "logits/rejected": -2.2108333110809326, "logps/chosen": -330.4653625488281, "logps/rejected": -309.5657043457031, "loss": 0.6543, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.05031045526266098, "rewards/margins": 0.09622685611248016, "rewards/rejected": -0.14653730392456055, "step": 20 }, { "epoch": 0.15, "learning_rate": 4.970219740227693e-06, "logits/chosen": -1.890295386314392, "logits/rejected": -1.7602672576904297, "logps/chosen": -304.2027282714844, "logps/rejected": -310.0560607910156, "loss": 0.6122, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.05764678865671158, "rewards/margins": 0.20967629551887512, "rewards/rejected": -0.2673230767250061, "step": 30 }, { "epoch": 0.2, "learning_rate": 4.868186180746792e-06, "logits/chosen": -1.6625322103500366, "logits/rejected": -1.408319115638733, "logps/chosen": -332.21380615234375, "logps/rejected": -359.38226318359375, "loss": 0.5921, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0873696357011795, "rewards/margins": 0.3436659276485443, "rewards/rejected": -0.431035578250885, "step": 40 }, { "epoch": 0.24, "learning_rate": 4.696530612642871e-06, "logits/chosen": -1.2149860858917236, "logits/rejected": -0.9494598507881165, "logps/chosen": -344.01007080078125, "logps/rejected": -387.1136779785156, "loss": 0.5646, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.3123381733894348, "rewards/margins": 0.41091781854629517, "rewards/rejected": -0.72325599193573, "step": 50 }, { "epoch": 0.29, "learning_rate": 4.460299516441777e-06, "logits/chosen": -1.3099194765090942, "logits/rejected": -1.0444400310516357, "logps/chosen": -308.3590393066406, "logps/rejected": -356.48309326171875, "loss": 0.5712, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.07990916073322296, "rewards/margins": 0.42140746116638184, "rewards/rejected": -0.5013166666030884, "step": 60 }, { "epoch": 0.34, "learning_rate": 4.1664378205239085e-06, "logits/chosen": -1.1891834735870361, "logits/rejected": -0.8295331001281738, "logps/chosen": -328.58770751953125, "logps/rejected": -367.3239440917969, "loss": 0.565, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.2833566665649414, "rewards/margins": 0.43687087297439575, "rewards/rejected": -0.7202275395393372, "step": 70 }, { "epoch": 0.39, "learning_rate": 3.8235847280454626e-06, "logits/chosen": -1.118951439857483, "logits/rejected": -0.7426460385322571, "logps/chosen": -330.37640380859375, "logps/rejected": -368.56915283203125, "loss": 0.5259, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24417515099048615, "rewards/margins": 0.4794433116912842, "rewards/rejected": -0.7236183881759644, "step": 80 }, { "epoch": 0.44, "learning_rate": 3.441819734087963e-06, "logits/chosen": -1.1813547611236572, "logits/rejected": -0.7541359663009644, "logps/chosen": -337.6168518066406, "logps/rejected": -367.4094543457031, "loss": 0.5487, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11910835653543472, "rewards/margins": 0.5179867744445801, "rewards/rejected": -0.6370951533317566, "step": 90 }, { "epoch": 0.49, "learning_rate": 3.0323662998460396e-06, "logits/chosen": -1.077327013015747, "logits/rejected": -0.598007082939148, "logps/chosen": -346.1620788574219, "logps/rejected": -371.0578918457031, "loss": 0.5347, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2570808529853821, "rewards/margins": 0.4765000343322754, "rewards/rejected": -0.7335808873176575, "step": 100 }, { "epoch": 0.49, "eval_logits/chosen": -0.9797883033752441, "eval_logits/rejected": -0.5705389380455017, "eval_logps/chosen": -348.43701171875, "eval_logps/rejected": -380.08721923828125, "eval_loss": 0.5570356845855713, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": -0.3268292248249054, "eval_rewards/margins": 0.49906718730926514, "eval_rewards/rejected": -0.8258963227272034, "eval_runtime": 384.1665, "eval_samples_per_second": 5.206, "eval_steps_per_second": 0.651, "step": 100 }, { "epoch": 0.54, "learning_rate": 2.6072618954988867e-06, "logits/chosen": -1.0047966241836548, "logits/rejected": -0.5113744735717773, "logps/chosen": -352.7798767089844, "logps/rejected": -395.8839416503906, "loss": 0.5652, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.37514516711235046, "rewards/margins": 0.49685588479042053, "rewards/rejected": -0.872001051902771, "step": 110 }, { "epoch": 0.59, "learning_rate": 2.1790041121336223e-06, "logits/chosen": -1.1386340856552124, "logits/rejected": -0.762003481388092, "logps/chosen": -347.37567138671875, "logps/rejected": -381.8182373046875, "loss": 0.5298, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.32593947649002075, "rewards/margins": 0.47125840187072754, "rewards/rejected": -0.7971979379653931, "step": 120 }, { "epoch": 0.64, "learning_rate": 1.760183246631777e-06, "logits/chosen": -0.9937411546707153, "logits/rejected": -0.5911135673522949, "logps/chosen": -341.10308837890625, "logps/rejected": -390.31622314453125, "loss": 0.5382, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22077293694019318, "rewards/margins": 0.6057422757148743, "rewards/rejected": -0.8265151977539062, "step": 130 }, { "epoch": 0.69, "learning_rate": 1.3631121611097364e-06, "logits/chosen": -0.8524423837661743, "logits/rejected": -0.5514906048774719, "logps/chosen": -347.8735046386719, "logps/rejected": -380.4819641113281, "loss": 0.5419, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3039913773536682, "rewards/margins": 0.49449166655540466, "rewards/rejected": -0.7984830141067505, "step": 140 }, { "epoch": 0.73, "learning_rate": 9.994642986290797e-07, "logits/chosen": -0.9539240002632141, "logits/rejected": -0.48703765869140625, "logps/chosen": -338.36041259765625, "logps/rejected": -382.77618408203125, "loss": 0.5449, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3446277379989624, "rewards/margins": 0.566214382648468, "rewards/rejected": -0.9108421206474304, "step": 150 }, { "epoch": 0.78, "learning_rate": 6.799304971075383e-07, "logits/chosen": -0.9565147161483765, "logits/rejected": -0.4371515214443207, "logps/chosen": -359.60650634765625, "logps/rejected": -392.94366455078125, "loss": 0.533, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.3234427571296692, "rewards/margins": 0.5231602787971497, "rewards/rejected": -0.8466030359268188, "step": 160 }, { "epoch": 0.83, "learning_rate": 4.1390469071538183e-07, "logits/chosen": -0.8408013582229614, "logits/rejected": -0.5169156789779663, "logps/chosen": -344.161376953125, "logps/rejected": -401.06927490234375, "loss": 0.5348, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2773435711860657, "rewards/margins": 0.7136318683624268, "rewards/rejected": -0.9909754991531372, "step": 170 }, { "epoch": 0.88, "learning_rate": 2.092077387824884e-07, "logits/chosen": -1.0086092948913574, "logits/rejected": -0.6254789233207703, "logps/chosen": -348.04437255859375, "logps/rejected": -398.12921142578125, "loss": 0.5478, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.273074209690094, "rewards/margins": 0.5786523818969727, "rewards/rejected": -0.8517265319824219, "step": 180 }, { "epoch": 0.93, "learning_rate": 7.185750133542168e-08, "logits/chosen": -0.9045581817626953, "logits/rejected": -0.6502401828765869, "logps/chosen": -323.7873840332031, "logps/rejected": -377.43109130859375, "loss": 0.5416, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.3004080057144165, "rewards/margins": 0.5847684741020203, "rewards/rejected": -0.8851765394210815, "step": 190 }, { "epoch": 0.98, "learning_rate": 5.891920784984184e-09, "logits/chosen": -1.004695177078247, "logits/rejected": -0.5996636748313904, "logps/chosen": -338.21856689453125, "logps/rejected": -390.23956298828125, "loss": 0.5154, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.263592392206192, "rewards/margins": 0.6431443691253662, "rewards/rejected": -0.9067367315292358, "step": 200 }, { "epoch": 0.98, "eval_logits/chosen": -0.9595882296562195, "eval_logits/rejected": -0.5385043621063232, "eval_logps/chosen": -346.7320556640625, "eval_logps/rejected": -380.200927734375, "eval_loss": 0.5470749735832214, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": -0.3097793161869049, "eval_rewards/margins": 0.5172543525695801, "eval_rewards/rejected": -0.8270336985588074, "eval_runtime": 384.197, "eval_samples_per_second": 5.206, "eval_steps_per_second": 0.651, "step": 200 }, { "epoch": 1.0, "step": 204, "total_flos": 0.0, "train_loss": 0.5623768936185276, "train_runtime": 9619.6269, "train_samples_per_second": 2.717, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }