{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.968, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 0.02732202410697937, "logits/rejected": 0.16736462712287903, "logps/chosen": -204.44515991210938, "logps/rejected": -186.30474853515625, "loss": 0.0014, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 0.11495557427406311, "logits/rejected": 0.14849303662776947, "logps/chosen": -174.2774658203125, "logps/rejected": -139.304443359375, "loss": 0.0014, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": 0.0010303932940587401, "rewards/margins": 0.0013937298208475113, "rewards/rejected": -0.0003633367014117539, "step": 10 }, { "epoch": 0.32, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.19859905540943146, "logits/rejected": 0.2755558490753174, "logps/chosen": -186.06753540039062, "logps/rejected": -150.23538208007812, "loss": 0.0014, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.0006829313351772726, "rewards/margins": 0.0010883348295465112, "rewards/rejected": -0.0017712658736854792, "step": 20 }, { "epoch": 0.48, "learning_rate": 4.993800445762451e-06, "logits/chosen": 0.10206829011440277, "logits/rejected": 0.09731761366128922, "logps/chosen": -189.70846557617188, "logps/rejected": -176.63827514648438, "loss": 0.0014, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0010909180855378509, "rewards/margins": 0.0010104707907885313, "rewards/rejected": 8.044719288591295e-05, "step": 30 }, { "epoch": 0.64, "learning_rate": 4.944388344834205e-06, "logits/chosen": 0.21991512179374695, "logits/rejected": 0.13409800827503204, "logps/chosen": -178.78292846679688, "logps/rejected": -151.7918243408203, "loss": 0.0016, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.0007118875510059297, "rewards/margins": -0.000794673920609057, "rewards/rejected": 8.278638415504247e-05, "step": 40 }, { "epoch": 0.8, "learning_rate": 4.8465431931347904e-06, "logits/chosen": 0.10882525146007538, "logits/rejected": 0.16875343024730682, "logps/chosen": -185.3433074951172, "logps/rejected": -174.74209594726562, "loss": 0.0015, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.0009307868895120919, "rewards/margins": 0.000729514576960355, "rewards/rejected": 0.00020127242896705866, "step": 50 }, { "epoch": 0.96, "learning_rate": 4.702203692102539e-06, "logits/chosen": 0.1939707249403, "logits/rejected": 0.18511822819709778, "logps/chosen": -194.64193725585938, "logps/rejected": -159.30172729492188, "loss": 0.0015, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0016695528756827116, "rewards/margins": 8.004475239431486e-05, "rewards/rejected": 0.0015895080287009478, "step": 60 }, { "epoch": 1.12, "learning_rate": 4.514229781074239e-06, "logits/chosen": 0.1889864206314087, "logits/rejected": 0.1525781750679016, "logps/chosen": -196.69769287109375, "logps/rejected": -172.0983428955078, "loss": 0.0015, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0013059942284598947, "rewards/margins": -0.0003628497361205518, "rewards/rejected": 0.0016688440227881074, "step": 70 }, { "epoch": 1.28, "learning_rate": 4.286345970517195e-06, "logits/chosen": 0.1526193767786026, "logits/rejected": 0.13564926385879517, "logps/chosen": -176.2852020263672, "logps/rejected": -149.50759887695312, "loss": 0.0014, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0020406683906912804, "rewards/margins": 0.0011222332250326872, "rewards/rejected": 0.0009184351074509323, "step": 80 }, { "epoch": 1.44, "learning_rate": 4.023067544670082e-06, "logits/chosen": 0.19515976309776306, "logits/rejected": 0.16219770908355713, "logps/chosen": -183.90406799316406, "logps/rejected": -157.0895538330078, "loss": 0.0017, "rewards/accuracies": 0.375, "rewards/chosen": -0.001846942352131009, "rewards/margins": -0.0017863952089101076, "rewards/rejected": -6.0547237808350474e-05, "step": 90 }, { "epoch": 1.6, "learning_rate": 3.7296110958116845e-06, "logits/chosen": 0.16685011982917786, "logits/rejected": 0.05337408185005188, "logps/chosen": -177.84762573242188, "logps/rejected": -152.65464782714844, "loss": 0.0015, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0007909245905466378, "rewards/margins": 0.0006749060703441501, "rewards/rejected": -0.001465830602683127, "step": 100 }, { "epoch": 1.6, "eval_logits/chosen": 0.0015224728267639875, "eval_logits/rejected": 0.09820695966482162, "eval_logps/chosen": -306.2193603515625, "eval_logps/rejected": -278.5382080078125, "eval_loss": 0.001858623931184411, "eval_rewards/accuracies": 0.5149999856948853, "eval_rewards/chosen": 0.0015068423235788941, "eval_rewards/margins": 0.0008032902260310948, "eval_rewards/rejected": 0.0007035521557554603, "eval_runtime": 420.6562, "eval_samples_per_second": 4.754, "eval_steps_per_second": 1.189, "step": 100 }, { "epoch": 1.76, "learning_rate": 3.4117911628292944e-06, "logits/chosen": 0.22267238795757294, "logits/rejected": 0.19200441241264343, "logps/chosen": -205.3648223876953, "logps/rejected": -172.17208862304688, "loss": 0.0015, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.002072205301374197, "rewards/margins": -0.0004376435244921595, "rewards/rejected": -0.0016345620388165116, "step": 110 }, { "epoch": 1.92, "learning_rate": 3.075905022087675e-06, "logits/chosen": 0.1819857358932495, "logits/rejected": 0.23743709921836853, "logps/chosen": -184.61007690429688, "logps/rejected": -160.8583221435547, "loss": 0.0013, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.0009784279391169548, "rewards/margins": 0.0018532350659370422, "rewards/rejected": -0.000874807417858392, "step": 120 }, { "epoch": 2.08, "learning_rate": 2.728607913349464e-06, "logits/chosen": 0.09743748605251312, "logits/rejected": 0.11662141233682632, "logps/chosen": -176.29867553710938, "logps/rejected": -140.44961547851562, "loss": 0.0014, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0014981284039095044, "rewards/margins": 0.0015437586698681116, "rewards/rejected": -4.563046604744159e-05, "step": 130 }, { "epoch": 2.24, "learning_rate": 2.376781173017589e-06, "logits/chosen": 0.03450363129377365, "logits/rejected": 0.09166844189167023, "logps/chosen": -191.46543884277344, "logps/rejected": -154.39138793945312, "loss": 0.0014, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.001418759347870946, "rewards/margins": 0.0005404851399362087, "rewards/rejected": 0.0008782741497270763, "step": 140 }, { "epoch": 2.4, "learning_rate": 2.0273958875043877e-06, "logits/chosen": 0.15574321150779724, "logits/rejected": 0.15428531169891357, "logps/chosen": -179.232421875, "logps/rejected": -154.8232421875, "loss": 0.0014, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0011932613560929894, "rewards/margins": 0.0013866318622604012, "rewards/rejected": -0.00019337031699251384, "step": 150 }, { "epoch": 2.56, "learning_rate": 1.6873747682962393e-06, "logits/chosen": 0.2517469525337219, "logits/rejected": 0.1588711440563202, "logps/chosen": -193.44821166992188, "logps/rejected": -170.86611938476562, "loss": 0.0015, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00030686514219269156, "rewards/margins": 0.0002994650858454406, "rewards/rejected": -0.0006063304608687758, "step": 160 }, { "epoch": 2.72, "learning_rate": 1.363454985517803e-06, "logits/chosen": 0.20987768471240997, "logits/rejected": 0.08343996852636337, "logps/chosen": -187.78610229492188, "logps/rejected": -167.81800842285156, "loss": 0.0014, "rewards/accuracies": 0.46875, "rewards/chosen": 0.002795418258756399, "rewards/margins": 0.0018930940423160791, "rewards/rejected": 0.0009023241582326591, "step": 170 }, { "epoch": 2.88, "learning_rate": 1.062054677808238e-06, "logits/chosen": 0.23503074049949646, "logits/rejected": 0.2113850861787796, "logps/chosen": -186.79861450195312, "logps/rejected": -162.50196838378906, "loss": 0.0014, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.0017884777626022696, "rewards/margins": 0.0021424146834760904, "rewards/rejected": -0.0003539369790814817, "step": 180 }, { "epoch": 3.04, "learning_rate": 7.891457834794711e-07, "logits/chosen": 0.138333261013031, "logits/rejected": 0.21794256567955017, "logps/chosen": -179.44418334960938, "logps/rejected": -158.67454528808594, "loss": 0.0014, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.0017921695252880454, "rewards/margins": 0.0020220079459249973, "rewards/rejected": -0.00022983844974078238, "step": 190 }, { "epoch": 3.2, "learning_rate": 5.501357126768117e-07, "logits/chosen": 0.2148284912109375, "logits/rejected": 0.1433248072862625, "logps/chosen": -186.4739532470703, "logps/rejected": -167.48558044433594, "loss": 0.0014, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.002506103366613388, "rewards/margins": 0.0016806632047519088, "rewards/rejected": 0.0008254402200691402, "step": 200 }, { "epoch": 3.2, "eval_logits/chosen": -0.00685009965673089, "eval_logits/rejected": 0.09018866717815399, "eval_logps/chosen": -306.28704833984375, "eval_logps/rejected": -278.6018981933594, "eval_loss": 0.0019364985637366772, "eval_rewards/accuracies": 0.5195000171661377, "eval_rewards/chosen": 0.0008296637679450214, "eval_rewards/margins": 0.0007632386405020952, "eval_rewards/rejected": 6.642500375164673e-05, "eval_runtime": 420.9095, "eval_samples_per_second": 4.752, "eval_steps_per_second": 1.188, "step": 200 }, { "epoch": 3.36, "learning_rate": 3.4976020508682345e-07, "logits/chosen": 0.10354860126972198, "logits/rejected": 0.1740628331899643, "logps/chosen": -187.9175262451172, "logps/rejected": -160.59580993652344, "loss": 0.0014, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0003708422009367496, "rewards/margins": 0.0009719420922920108, "rewards/rejected": -0.0006010999786667526, "step": 210 }, { "epoch": 3.52, "learning_rate": 1.9198949610721273e-07, "logits/chosen": 0.14361225068569183, "logits/rejected": 0.09037239849567413, "logps/chosen": -183.87716674804688, "logps/rejected": -157.98300170898438, "loss": 0.0013, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0009205196984112263, "rewards/margins": 0.0023204255849123, "rewards/rejected": -0.0013999061193317175, "step": 220 }, { "epoch": 3.68, "learning_rate": 7.994965069994143e-08, "logits/chosen": 0.20537514984607697, "logits/rejected": 0.18879783153533936, "logps/chosen": -193.15943908691406, "logps/rejected": -162.2894744873047, "loss": 0.0015, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.00020981582929380238, "rewards/margins": 0.0006767899030819535, "rewards/rejected": -0.0008866057032719254, "step": 230 }, { "epoch": 3.84, "learning_rate": 1.5860623616664183e-08, "logits/chosen": 0.13774822652339935, "logits/rejected": 0.09208400547504425, "logps/chosen": -189.96868896484375, "logps/rejected": -159.98782348632812, "loss": 0.0016, "rewards/accuracies": 0.34375, "rewards/chosen": -6.72071473672986e-06, "rewards/margins": -0.0008294621366076171, "rewards/rejected": 0.0008227415382862091, "step": 240 }, { "epoch": 3.97, "step": 248, "total_flos": 0.0, "train_loss": 0.0014411601897013643, "train_runtime": 2755.6406, "train_samples_per_second": 1.452, "train_steps_per_second": 0.09 } ], "logging_steps": 10, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }