{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9960474308300395, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 4737.779382861946, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -4.638427734375, "logits/rejected": -4.891327857971191, "logps/chosen": -198.52749633789062, "logps/rejected": -147.3392791748047, "loss": 2.1269, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 3019.619079402316, "learning_rate": 2.631578947368421e-07, "logits/chosen": -4.496801376342773, "logits/rejected": -4.816222190856934, "logps/chosen": -224.27357482910156, "logps/rejected": -168.04739379882812, "loss": 1.9212, "rewards/accuracies": 0.5381944179534912, "rewards/chosen": 0.3541475236415863, "rewards/margins": 0.37169286608695984, "rewards/rejected": -0.017545383423566818, "step": 10 }, { "epoch": 0.11, "grad_norm": 1863.4195630562826, "learning_rate": 4.999573126145131e-07, "logits/chosen": -4.533459663391113, "logits/rejected": -4.848563194274902, "logps/chosen": -220.4309539794922, "logps/rejected": -180.72413635253906, "loss": 1.1783, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": 5.777209281921387, "rewards/margins": 6.331713676452637, "rewards/rejected": -0.5545047521591187, "step": 20 }, { "epoch": 0.16, "grad_norm": 1882.3555396757283, "learning_rate": 4.948524419003415e-07, "logits/chosen": -4.54370641708374, "logits/rejected": -4.812285423278809, "logps/chosen": -213.49411010742188, "logps/rejected": -177.16848754882812, "loss": 1.3518, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": 8.2464017868042, "rewards/margins": 12.621076583862305, "rewards/rejected": -4.3746747970581055, "step": 30 }, { "epoch": 0.21, "grad_norm": 1979.651061288252, "learning_rate": 4.81409414945389e-07, "logits/chosen": -4.619187831878662, "logits/rejected": -4.8958845138549805, "logps/chosen": -221.00082397460938, "logps/rejected": -184.62203979492188, "loss": 1.4689, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": 7.427975654602051, "rewards/margins": 15.529205322265625, "rewards/rejected": -8.101228713989258, "step": 40 }, { "epoch": 0.26, "grad_norm": 2122.235483955093, "learning_rate": 4.6008601790947314e-07, "logits/chosen": -4.608691215515137, "logits/rejected": -4.925226211547852, "logps/chosen": -210.32058715820312, "logps/rejected": -179.0367431640625, "loss": 1.3821, "rewards/accuracies": 0.859375, "rewards/chosen": 6.428221225738525, "rewards/margins": 16.42898941040039, "rewards/rejected": -10.00076961517334, "step": 50 }, { "epoch": 0.32, "grad_norm": 1692.2277360562514, "learning_rate": 4.3160839350405605e-07, "logits/chosen": -4.665585994720459, "logits/rejected": -4.9272074699401855, "logps/chosen": -205.7926788330078, "logps/rejected": -178.56011962890625, "loss": 1.3465, "rewards/accuracies": 0.859375, "rewards/chosen": 7.834652900695801, "rewards/margins": 16.5399169921875, "rewards/rejected": -8.705263137817383, "step": 60 }, { "epoch": 0.37, "grad_norm": 1810.8868167381333, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -4.6464009284973145, "logits/rejected": -4.913968086242676, "logps/chosen": -207.1618194580078, "logps/rejected": -182.61012268066406, "loss": 1.3564, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 8.192334175109863, "rewards/margins": 16.401655197143555, "rewards/rejected": -8.209321975708008, "step": 70 }, { "epoch": 0.42, "grad_norm": 1395.6821844604426, "learning_rate": 3.572801521931522e-07, "logits/chosen": -4.674800395965576, "logits/rejected": -4.932587623596191, "logps/chosen": -202.7789764404297, "logps/rejected": -184.74395751953125, "loss": 1.312, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 5.3585076332092285, "rewards/margins": 16.119762420654297, "rewards/rejected": -10.761255264282227, "step": 80 }, { "epoch": 0.47, "grad_norm": 1761.719146022038, "learning_rate": 3.139606943986089e-07, "logits/chosen": -4.721759796142578, "logits/rejected": -4.953747272491455, "logps/chosen": -199.81448364257812, "logps/rejected": -178.44004821777344, "loss": 1.3425, "rewards/accuracies": 0.815625011920929, "rewards/chosen": 7.849789619445801, "rewards/margins": 15.739909172058105, "rewards/rejected": -7.890120029449463, "step": 90 }, { "epoch": 0.53, "grad_norm": 1641.4466240114464, "learning_rate": 2.684631318687185e-07, "logits/chosen": -4.7313385009765625, "logits/rejected": -4.984685897827148, "logps/chosen": -213.2564239501953, "logps/rejected": -190.69088745117188, "loss": 1.3623, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 5.4120659828186035, "rewards/margins": 16.620161056518555, "rewards/rejected": -11.208093643188477, "step": 100 }, { "epoch": 0.58, "grad_norm": 1346.9601711684072, "learning_rate": 2.2233682952712483e-07, "logits/chosen": -4.668034553527832, "logits/rejected": -4.953825950622559, "logps/chosen": -216.8499298095703, "logps/rejected": -186.10470581054688, "loss": 1.1234, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 7.529428005218506, "rewards/margins": 15.865753173828125, "rewards/rejected": -8.336324691772461, "step": 110 }, { "epoch": 0.63, "grad_norm": 1999.3900490416042, "learning_rate": 1.7715256327766884e-07, "logits/chosen": -4.720789909362793, "logits/rejected": -5.025943279266357, "logps/chosen": -207.7978973388672, "logps/rejected": -178.0445098876953, "loss": 1.1185, "rewards/accuracies": 0.859375, "rewards/chosen": 8.364091873168945, "rewards/margins": 16.011329650878906, "rewards/rejected": -7.647237300872803, "step": 120 }, { "epoch": 0.69, "grad_norm": 1568.9083661238265, "learning_rate": 1.3444902911492174e-07, "logits/chosen": -4.702408790588379, "logits/rejected": -4.98063325881958, "logps/chosen": -215.88174438476562, "logps/rejected": -188.39645385742188, "loss": 1.2748, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": 7.107934474945068, "rewards/margins": 16.361108779907227, "rewards/rejected": -9.253173828125, "step": 130 }, { "epoch": 0.74, "grad_norm": 1390.666498149427, "learning_rate": 9.56804446775518e-08, "logits/chosen": -4.6129560470581055, "logits/rejected": -4.923257350921631, "logps/chosen": -208.4700469970703, "logps/rejected": -178.78623962402344, "loss": 1.0987, "rewards/accuracies": 0.859375, "rewards/chosen": 6.485724449157715, "rewards/margins": 17.193899154663086, "rewards/rejected": -10.708174705505371, "step": 140 }, { "epoch": 0.79, "grad_norm": 1296.5821049110084, "learning_rate": 6.216702761078166e-08, "logits/chosen": -4.699868202209473, "logits/rejected": -4.9864583015441895, "logps/chosen": -196.650146484375, "logps/rejected": -168.93551635742188, "loss": 1.0856, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 7.758223056793213, "rewards/margins": 17.15636444091797, "rewards/rejected": -9.398139953613281, "step": 150 }, { "epoch": 0.84, "grad_norm": 1769.0071097352081, "learning_rate": 3.5050037137906885e-08, "logits/chosen": -4.634187698364258, "logits/rejected": -4.958773612976074, "logps/chosen": -211.03591918945312, "logps/rejected": -176.72067260742188, "loss": 1.1749, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": 8.515033721923828, "rewards/margins": 16.594724655151367, "rewards/rejected": -8.079689979553223, "step": 160 }, { "epoch": 0.9, "grad_norm": 1606.699013433802, "learning_rate": 1.5252909846235894e-08, "logits/chosen": -4.62954044342041, "logits/rejected": -4.913142204284668, "logps/chosen": -209.8083953857422, "logps/rejected": -184.52127075195312, "loss": 1.2059, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 9.714839935302734, "rewards/margins": 18.106443405151367, "rewards/rejected": -8.391606330871582, "step": 170 }, { "epoch": 0.95, "grad_norm": 1242.5060745172418, "learning_rate": 3.4498131616493565e-09, "logits/chosen": -4.616083145141602, "logits/rejected": -4.87780237197876, "logps/chosen": -215.27685546875, "logps/rejected": -193.55332946777344, "loss": 1.1132, "rewards/accuracies": 0.8125, "rewards/chosen": 6.19677209854126, "rewards/margins": 13.694157600402832, "rewards/rejected": -7.497385501861572, "step": 180 }, { "epoch": 1.0, "step": 189, "total_flos": 0.0, "train_loss": 1.2810401298381664, "train_runtime": 5417.1403, "train_samples_per_second": 8.959, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 189, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }