{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.968, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 0.34545159339904785, "logits/rejected": 0.2957597076892853, "logps/chosen": -217.1103973388672, "logps/rejected": -154.90234375, "loss": 0.0009, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 0.24290476739406586, "logits/rejected": 0.1927487701177597, "logps/chosen": -189.50128173828125, "logps/rejected": -162.37692260742188, "loss": 0.0011, "rewards/accuracies": 0.3055555522441864, "rewards/chosen": -0.0005859931115992367, "rewards/margins": -0.001147971022874117, "rewards/rejected": 0.0005619778530672193, "step": 10 }, { "epoch": 0.32, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.1098150983452797, "logits/rejected": 0.06667135655879974, "logps/chosen": -170.3683319091797, "logps/rejected": -145.548095703125, "loss": 0.0011, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -2.7951715310337022e-05, "rewards/margins": -0.00022801189334131777, "rewards/rejected": 0.00020006010890938342, "step": 20 }, { "epoch": 0.48, "learning_rate": 4.993800445762451e-06, "logits/chosen": 0.15559187531471252, "logits/rejected": 0.15184751152992249, "logps/chosen": -184.87167358398438, "logps/rejected": -146.75131225585938, "loss": 0.0011, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.00036125333281233907, "rewards/margins": -0.0002040974359260872, "rewards/rejected": -0.00015715583867859095, "step": 30 }, { "epoch": 0.64, "learning_rate": 4.944388344834205e-06, "logits/chosen": 0.11296383291482925, "logits/rejected": 0.20522311329841614, "logps/chosen": -182.8056640625, "logps/rejected": -148.15756225585938, "loss": 0.0009, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00020849374413955957, "rewards/margins": 0.001540421275421977, "rewards/rejected": -0.0017489150632172823, "step": 40 }, { "epoch": 0.8, "learning_rate": 4.8465431931347904e-06, "logits/chosen": 0.2731459140777588, "logits/rejected": 0.17598305642604828, "logps/chosen": -196.51651000976562, "logps/rejected": -161.89797973632812, "loss": 0.001, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0016538338968530297, "rewards/margins": 0.000834259029943496, "rewards/rejected": -0.0024880929850041866, "step": 50 }, { "epoch": 0.96, "learning_rate": 4.702203692102539e-06, "logits/chosen": 0.19920073449611664, "logits/rejected": 0.17924004793167114, "logps/chosen": -182.0515594482422, "logps/rejected": -150.9364013671875, "loss": 0.001, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00011732898565242067, "rewards/margins": 0.0012116450816392899, "rewards/rejected": -0.0010943160159513354, "step": 60 }, { "epoch": 1.12, "learning_rate": 4.514229781074239e-06, "logits/chosen": 0.1093553751707077, "logits/rejected": 0.17700831592082977, "logps/chosen": -203.3074188232422, "logps/rejected": -173.57713317871094, "loss": 0.001, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.00015989062376320362, "rewards/margins": 0.0011065483558923006, "rewards/rejected": -0.0012664392124861479, "step": 70 }, { "epoch": 1.28, "learning_rate": 4.286345970517195e-06, "logits/chosen": 0.13181297481060028, "logits/rejected": 0.1530202180147171, "logps/chosen": -193.3525848388672, "logps/rejected": -161.02328491210938, "loss": 0.0011, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00105115189217031, "rewards/margins": 0.00026105757569894195, "rewards/rejected": 0.0007900940254330635, "step": 80 }, { "epoch": 1.44, "learning_rate": 4.023067544670082e-06, "logits/chosen": 0.10432066023349762, "logits/rejected": 0.11901885271072388, "logps/chosen": -184.4915771484375, "logps/rejected": -151.45025634765625, "loss": 0.0009, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0010537179186940193, "rewards/margins": 0.002086392603814602, "rewards/rejected": -0.0010326746851205826, "step": 90 }, { "epoch": 1.6, "learning_rate": 3.7296110958116845e-06, "logits/chosen": 0.17887040972709656, "logits/rejected": 0.13352127373218536, "logps/chosen": -176.19094848632812, "logps/rejected": -149.05015563964844, "loss": 0.001, "rewards/accuracies": 0.375, "rewards/chosen": 0.0006333684432320297, "rewards/margins": 0.00030291633447632194, "rewards/rejected": 0.00033045216696336865, "step": 100 }, { "epoch": 1.6, "eval_logits/chosen": -0.005087433848530054, "eval_logits/rejected": 0.09259650856256485, "eval_logps/chosen": -306.27325439453125, "eval_logps/rejected": -278.57037353515625, "eval_loss": 0.0014748616376891732, "eval_rewards/accuracies": 0.5040000081062317, "eval_rewards/chosen": 8.555292879464105e-06, "eval_rewards/margins": 0.00035174566437490284, "eval_rewards/rejected": -0.0003431903896853328, "eval_runtime": 421.2425, "eval_samples_per_second": 4.748, "eval_steps_per_second": 1.187, "step": 100 }, { "epoch": 1.76, "learning_rate": 3.4117911628292944e-06, "logits/chosen": 0.1393759548664093, "logits/rejected": 0.09525509178638458, "logps/chosen": -194.57229614257812, "logps/rejected": -159.84437561035156, "loss": 0.001, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0006214675377123058, "rewards/margins": 0.0007887079264037311, "rewards/rejected": -0.00016724050510674715, "step": 110 }, { "epoch": 1.92, "learning_rate": 3.075905022087675e-06, "logits/chosen": 0.19464930891990662, "logits/rejected": 0.26265355944633484, "logps/chosen": -182.62338256835938, "logps/rejected": -146.72181701660156, "loss": 0.001, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.00028288367320783436, "rewards/margins": 0.0012492609675973654, "rewards/rejected": -0.0009663773817010224, "step": 120 }, { "epoch": 2.08, "learning_rate": 2.728607913349464e-06, "logits/chosen": 0.2267303168773651, "logits/rejected": 0.21481864154338837, "logps/chosen": -175.65589904785156, "logps/rejected": -143.40615844726562, "loss": 0.0011, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0007823506603017449, "rewards/margins": 0.00022141262888908386, "rewards/rejected": 0.000560938089620322, "step": 130 }, { "epoch": 2.24, "learning_rate": 2.376781173017589e-06, "logits/chosen": 0.18638554215431213, "logits/rejected": 0.19504567980766296, "logps/chosen": -181.9698486328125, "logps/rejected": -152.50296020507812, "loss": 0.001, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.002239116234704852, "rewards/margins": 0.001197666977532208, "rewards/rejected": 0.0010414490243420005, "step": 140 }, { "epoch": 2.4, "learning_rate": 2.0273958875043877e-06, "logits/chosen": 0.17610232532024384, "logits/rejected": 0.1723514050245285, "logps/chosen": -199.57699584960938, "logps/rejected": -177.21139526367188, "loss": 0.001, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0006620158674195409, "rewards/margins": 0.0013073014561086893, "rewards/rejected": -0.0006452856468968093, "step": 150 }, { "epoch": 2.56, "learning_rate": 1.6873747682962393e-06, "logits/chosen": 0.07310830056667328, "logits/rejected": 0.17652472853660583, "logps/chosen": -183.4370880126953, "logps/rejected": -141.97218322753906, "loss": 0.001, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025453295093029737, "rewards/margins": 0.0023006144911050797, "rewards/rejected": 0.00024471539654769003, "step": 160 }, { "epoch": 2.72, "learning_rate": 1.363454985517803e-06, "logits/chosen": 0.10622234642505646, "logits/rejected": 0.20915362238883972, "logps/chosen": -185.3426971435547, "logps/rejected": -146.079345703125, "loss": 0.0009, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0036269682459533215, "rewards/margins": 0.002667922293767333, "rewards/rejected": 0.0009590461850166321, "step": 170 }, { "epoch": 2.88, "learning_rate": 1.062054677808238e-06, "logits/chosen": 0.19405516982078552, "logits/rejected": 0.16787810623645782, "logps/chosen": -199.97219848632812, "logps/rejected": -163.14398193359375, "loss": 0.001, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0023652021773159504, "rewards/margins": 0.0015264868270605803, "rewards/rejected": 0.000838715408463031, "step": 180 }, { "epoch": 3.04, "learning_rate": 7.891457834794711e-07, "logits/chosen": 0.21832183003425598, "logits/rejected": 0.20919294655323029, "logps/chosen": -167.15711975097656, "logps/rejected": -146.7445526123047, "loss": 0.001, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 8.343837544089183e-05, "rewards/margins": 0.0007264987798407674, "rewards/rejected": -0.0006430605426430702, "step": 190 }, { "epoch": 3.2, "learning_rate": 5.501357126768117e-07, "logits/chosen": 0.07451615482568741, "logits/rejected": 0.10825479030609131, "logps/chosen": -187.6416015625, "logps/rejected": -153.81216430664062, "loss": 0.001, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0008775632595643401, "rewards/margins": 0.0012995953438803554, "rewards/rejected": -0.00042203222983516753, "step": 200 }, { "epoch": 3.2, "eval_logits/chosen": -0.0019514876184985042, "eval_logits/rejected": 0.0977490246295929, "eval_logps/chosen": -306.2765197753906, "eval_logps/rejected": -278.5775146484375, "eval_loss": 0.0015296473866328597, "eval_rewards/accuracies": 0.4884999990463257, "eval_rewards/chosen": -2.441116339468863e-05, "eval_rewards/margins": 0.00038999062962830067, "eval_rewards/rejected": -0.000414401845773682, "eval_runtime": 420.5773, "eval_samples_per_second": 4.755, "eval_steps_per_second": 1.189, "step": 200 }, { "epoch": 3.36, "learning_rate": 3.4976020508682345e-07, "logits/chosen": 0.24440991878509521, "logits/rejected": 0.20419850945472717, "logps/chosen": -173.31942749023438, "logps/rejected": -149.56796264648438, "loss": 0.001, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0018821163102984428, "rewards/margins": 0.0013987896963953972, "rewards/rejected": 0.0004833267885260284, "step": 210 }, { "epoch": 3.52, "learning_rate": 1.9198949610721273e-07, "logits/chosen": 0.195010706782341, "logits/rejected": 0.1568932682275772, "logps/chosen": -196.238525390625, "logps/rejected": -168.1038055419922, "loss": 0.0009, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0010507756378501654, "rewards/margins": 0.0023238039575517178, "rewards/rejected": -0.001273027970455587, "step": 220 }, { "epoch": 3.68, "learning_rate": 7.994965069994143e-08, "logits/chosen": 0.1438043862581253, "logits/rejected": 0.13570797443389893, "logps/chosen": -193.8529815673828, "logps/rejected": -155.14317321777344, "loss": 0.001, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0007188455201685429, "rewards/margins": 0.0012139389291405678, "rewards/rejected": -0.0004950935835950077, "step": 230 }, { "epoch": 3.84, "learning_rate": 1.5860623616664183e-08, "logits/chosen": 0.1838085651397705, "logits/rejected": 0.06515821814537048, "logps/chosen": -176.7322235107422, "logps/rejected": -146.44021606445312, "loss": 0.0009, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.004661304876208305, "rewards/margins": 0.0025658137165009975, "rewards/rejected": 0.0020954906940460205, "step": 240 }, { "epoch": 3.97, "step": 248, "total_flos": 0.0, "train_loss": 0.001002159876318934, "train_runtime": 2735.225, "train_samples_per_second": 1.462, "train_steps_per_second": 0.091 } ], "logging_steps": 10, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }