{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.968, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 0.4729592502117157, "logits/rejected": 0.38554269075393677, "logps/chosen": -213.08737182617188, "logps/rejected": -203.01974487304688, "loss": 0.0016, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 0.15001463890075684, "logits/rejected": 0.20593884587287903, "logps/chosen": -161.55821228027344, "logps/rejected": -135.95449829101562, "loss": 0.0015, "rewards/accuracies": 0.3819444477558136, "rewards/chosen": 3.554481372702867e-05, "rewards/margins": 0.0017159796552732587, "rewards/rejected": -0.001680435030721128, "step": 10 }, { "epoch": 0.32, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.09242797642946243, "logits/rejected": 0.04898233711719513, "logps/chosen": -181.7728271484375, "logps/rejected": -151.2571563720703, "loss": 0.0017, "rewards/accuracies": 0.375, "rewards/chosen": -0.0012837719405069947, "rewards/margins": -0.0005185201880522072, "rewards/rejected": -0.0007652518688701093, "step": 20 }, { "epoch": 0.48, "learning_rate": 4.993800445762451e-06, "logits/chosen": 0.1974470317363739, "logits/rejected": 0.24875693023204803, "logps/chosen": -175.55596923828125, "logps/rejected": -143.77584838867188, "loss": 0.0019, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.003100383561104536, "rewards/margins": -0.0019143905956298113, "rewards/rejected": -0.0011859927326440811, "step": 30 }, { "epoch": 0.64, "learning_rate": 4.944388344834205e-06, "logits/chosen": 0.1500740945339203, "logits/rejected": 0.2011214792728424, "logps/chosen": -181.77581787109375, "logps/rejected": -164.48788452148438, "loss": 0.0016, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0005139185814186931, "rewards/margins": 0.0008601135341450572, "rewards/rejected": -0.0013740319991484284, "step": 40 }, { "epoch": 0.8, "learning_rate": 4.8465431931347904e-06, "logits/chosen": 0.08030920475721359, "logits/rejected": 0.15738067030906677, "logps/chosen": -169.9337158203125, "logps/rejected": -144.90228271484375, "loss": 0.0017, "rewards/accuracies": 0.375, "rewards/chosen": -0.0008664874476380646, "rewards/margins": -0.0005486059235408902, "rewards/rejected": -0.00031788164051249623, "step": 50 }, { "epoch": 0.96, "learning_rate": 4.702203692102539e-06, "logits/chosen": 0.1897524893283844, "logits/rejected": 0.17828692495822906, "logps/chosen": -173.4358367919922, "logps/rejected": -145.06625366210938, "loss": 0.0016, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0021182517521083355, "rewards/margins": 0.0011023099068552256, "rewards/rejected": -0.0032205611933022738, "step": 60 }, { "epoch": 1.12, "learning_rate": 4.514229781074239e-06, "logits/chosen": 0.2607804834842682, "logits/rejected": 0.23584774136543274, "logps/chosen": -188.595947265625, "logps/rejected": -160.49171447753906, "loss": 0.0018, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0020389086566865444, "rewards/margins": -0.001390365301631391, "rewards/rejected": -0.0006485433550551534, "step": 70 }, { "epoch": 1.28, "learning_rate": 4.286345970517195e-06, "logits/chosen": 0.17552152276039124, "logits/rejected": 0.24562516808509827, "logps/chosen": -165.12591552734375, "logps/rejected": -137.75808715820312, "loss": 0.0016, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00046771267079748213, "rewards/margins": 0.0003761005064006895, "rewards/rejected": -0.0008438131771981716, "step": 80 }, { "epoch": 1.44, "learning_rate": 4.023067544670082e-06, "logits/chosen": 0.16295495629310608, "logits/rejected": 0.1336701214313507, "logps/chosen": -176.12887573242188, "logps/rejected": -145.37673950195312, "loss": 0.0016, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0007470982382073998, "rewards/margins": 0.0005909097963012755, "rewards/rejected": -0.001338008209131658, "step": 90 }, { "epoch": 1.6, "learning_rate": 3.7296110958116845e-06, "logits/chosen": 0.18624618649482727, "logits/rejected": 0.15213565528392792, "logps/chosen": -177.76405334472656, "logps/rejected": -152.70240783691406, "loss": 0.0016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.000859270163346082, "rewards/margins": 0.0012418597470968962, "rewards/rejected": -0.0003825899329967797, "step": 100 }, { "epoch": 1.6, "eval_logits/chosen": -0.004967109765857458, "eval_logits/rejected": 0.09325645118951797, "eval_logps/chosen": -306.3399353027344, "eval_logps/rejected": -278.6739501953125, "eval_loss": 0.0021029466297477484, "eval_rewards/accuracies": 0.4909999966621399, "eval_rewards/chosen": 0.0005270715337246656, "eval_rewards/margins": 0.00038576460792683065, "eval_rewards/rejected": 0.00014130691124591976, "eval_runtime": 412.0589, "eval_samples_per_second": 4.854, "eval_steps_per_second": 1.213, "step": 100 }, { "epoch": 1.76, "learning_rate": 3.4117911628292944e-06, "logits/chosen": 0.18669767677783966, "logits/rejected": 0.193131685256958, "logps/chosen": -186.2015380859375, "logps/rejected": -160.41812133789062, "loss": 0.0017, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.0022148210555315018, "rewards/margins": -0.0006464887410402298, "rewards/rejected": -0.0015683325473219156, "step": 110 }, { "epoch": 1.92, "learning_rate": 3.075905022087675e-06, "logits/chosen": 0.07856817543506622, "logits/rejected": 0.13352298736572266, "logps/chosen": -174.9341278076172, "logps/rejected": -154.233642578125, "loss": 0.0017, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0001242739672306925, "rewards/margins": 0.0005846145795658231, "rewards/rejected": -0.00046034049591980875, "step": 120 }, { "epoch": 2.08, "learning_rate": 2.728607913349464e-06, "logits/chosen": 0.19492605328559875, "logits/rejected": 0.16430191695690155, "logps/chosen": -174.35968017578125, "logps/rejected": -150.935302734375, "loss": 0.0016, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0011604964965954423, "rewards/margins": 0.0007132277823984623, "rewards/rejected": -0.0018737241625785828, "step": 130 }, { "epoch": 2.24, "learning_rate": 2.376781173017589e-06, "logits/chosen": 0.2628365159034729, "logits/rejected": 0.19500017166137695, "logps/chosen": -187.58023071289062, "logps/rejected": -166.26370239257812, "loss": 0.0017, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.0010837314184755087, "rewards/margins": -0.00033805653220042586, "rewards/rejected": -0.0007456748280674219, "step": 140 }, { "epoch": 2.4, "learning_rate": 2.0273958875043877e-06, "logits/chosen": 0.14645084738731384, "logits/rejected": 0.1681375354528427, "logps/chosen": -157.08926391601562, "logps/rejected": -131.46810913085938, "loss": 0.0016, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.002456915332004428, "rewards/margins": 0.0014492868212983012, "rewards/rejected": -0.0039062027353793383, "step": 150 }, { "epoch": 2.56, "learning_rate": 1.6873747682962393e-06, "logits/chosen": 0.16395077109336853, "logits/rejected": 0.14529384672641754, "logps/chosen": -188.40296936035156, "logps/rejected": -158.95834350585938, "loss": 0.0016, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00022082138457335532, "rewards/margins": 0.000981360673904419, "rewards/rejected": -0.0007605393184348941, "step": 160 }, { "epoch": 2.72, "learning_rate": 1.363454985517803e-06, "logits/chosen": 0.21595752239227295, "logits/rejected": 0.17811095714569092, "logps/chosen": -178.89500427246094, "logps/rejected": -150.37576293945312, "loss": 0.0016, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0017019693041220307, "rewards/margins": 0.0014590112259611487, "rewards/rejected": 0.00024295765615534037, "step": 170 }, { "epoch": 2.88, "learning_rate": 1.062054677808238e-06, "logits/chosen": 0.11008661985397339, "logits/rejected": 0.1656235158443451, "logps/chosen": -166.90554809570312, "logps/rejected": -149.75042724609375, "loss": 0.0017, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0004684976884163916, "rewards/margins": 0.0005998688284307718, "rewards/rejected": -0.0010683666914701462, "step": 180 }, { "epoch": 3.04, "learning_rate": 7.891457834794711e-07, "logits/chosen": 0.2820424437522888, "logits/rejected": 0.16322749853134155, "logps/chosen": -180.0753173828125, "logps/rejected": -143.658203125, "loss": 0.0016, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 1.5354249853771762e-06, "rewards/margins": 0.0018281619995832443, "rewards/rejected": -0.0018266268307343125, "step": 190 }, { "epoch": 3.2, "learning_rate": 5.501357126768117e-07, "logits/chosen": 0.15994948148727417, "logits/rejected": 0.14330127835273743, "logps/chosen": -183.79759216308594, "logps/rejected": -155.71310424804688, "loss": 0.0017, "rewards/accuracies": 0.4375, "rewards/chosen": 0.001569538377225399, "rewards/margins": 0.0009444955503568053, "rewards/rejected": 0.000625042652245611, "step": 200 }, { "epoch": 3.2, "eval_logits/chosen": -0.009462343528866768, "eval_logits/rejected": 0.08856771141290665, "eval_logps/chosen": -306.4609375, "eval_logps/rejected": -278.74566650390625, "eval_loss": 0.0021845391020178795, "eval_rewards/accuracies": 0.4950000047683716, "eval_rewards/chosen": -0.0006826075841672719, "eval_rewards/margins": -0.00010681045387173072, "eval_rewards/rejected": -0.0005757971666753292, "eval_runtime": 412.1683, "eval_samples_per_second": 4.852, "eval_steps_per_second": 1.213, "step": 200 }, { "epoch": 3.36, "learning_rate": 3.4976020508682345e-07, "logits/chosen": 0.1456301361322403, "logits/rejected": 0.2356918305158615, "logps/chosen": -183.83010864257812, "logps/rejected": -155.9720458984375, "loss": 0.0015, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0012806833256036043, "rewards/margins": 0.0026433460880070925, "rewards/rejected": -0.0013626629952341318, "step": 210 }, { "epoch": 3.52, "learning_rate": 1.9198949610721273e-07, "logits/chosen": 0.11089984327554703, "logits/rejected": 0.18088462948799133, "logps/chosen": -183.67636108398438, "logps/rejected": -146.19009399414062, "loss": 0.0015, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0020343190990388393, "rewards/margins": 0.0024411864578723907, "rewards/rejected": -0.00040686698048375547, "step": 220 }, { "epoch": 3.68, "learning_rate": 7.994965069994143e-08, "logits/chosen": 0.15200337767601013, "logits/rejected": 0.0883503332734108, "logps/chosen": -152.91159057617188, "logps/rejected": -137.67068481445312, "loss": 0.0017, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0002739583433140069, "rewards/margins": 0.00024719498469494283, "rewards/rejected": -0.0005211535608395934, "step": 230 }, { "epoch": 3.84, "learning_rate": 1.5860623616664183e-08, "logits/chosen": 0.21818551421165466, "logits/rejected": 0.26517254114151, "logps/chosen": -185.5134735107422, "logps/rejected": -156.71102905273438, "loss": 0.0018, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0007516987388953567, "rewards/margins": -0.001160649349913001, "rewards/rejected": 0.0004089508147444576, "step": 240 }, { "epoch": 3.97, "step": 248, "total_flos": 0.0, "train_loss": 0.0016499216942447088, "train_runtime": 2658.6406, "train_samples_per_second": 1.505, "train_steps_per_second": 0.093 } ], "logging_steps": 10, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }