{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006279434850863423, "grad_norm": 44.82313301644843, "learning_rate": 6.25e-09, "logits/chosen": 0.8539759516716003, "logits/rejected": 1.0286259651184082, "logps/chosen": -335.678466796875, "logps/pi_response": -136.65570068359375, "logps/ref_response": -136.65570068359375, "logps/rejected": -607.512451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06279434850863422, "grad_norm": 37.3559401152222, "learning_rate": 6.25e-08, "logits/chosen": 0.5866143703460693, "logits/rejected": 0.9108358025550842, "logps/chosen": -331.2261657714844, "logps/pi_response": -115.99703216552734, "logps/ref_response": -116.07170867919922, "logps/rejected": -542.704833984375, "loss": 0.6928, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": -0.0002141093573300168, "rewards/margins": -9.146899537881836e-05, "rewards/rejected": -0.0001226404565386474, "step": 10 }, { "epoch": 0.12558869701726844, "grad_norm": 32.618665621131484, "learning_rate": 9.980706626858606e-08, "logits/chosen": 0.6913329362869263, "logits/rejected": 1.0521622896194458, "logps/chosen": -266.18450927734375, "logps/pi_response": -121.02943420410156, "logps/ref_response": -121.02423095703125, "logps/rejected": -550.6239624023438, "loss": 0.6868, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007379357703030109, "rewards/margins": 0.01433458924293518, "rewards/rejected": -0.021713946014642715, "step": 20 }, { "epoch": 0.18838304552590268, "grad_norm": 28.038145324089726, "learning_rate": 9.765362502737097e-08, "logits/chosen": 0.7383168935775757, "logits/rejected": 1.0771757364273071, "logps/chosen": -336.5181579589844, "logps/pi_response": -119.6182861328125, "logps/ref_response": -119.22322845458984, "logps/rejected": -517.7554321289062, "loss": 0.6645, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05802997946739197, "rewards/margins": 0.036242544651031494, "rewards/rejected": -0.09427253156900406, "step": 30 }, { "epoch": 0.25117739403453687, "grad_norm": 28.144482930637814, "learning_rate": 9.320944188084241e-08, "logits/chosen": 0.6981341242790222, "logits/rejected": 1.0143299102783203, "logps/chosen": -312.6896057128906, "logps/pi_response": -114.02241516113281, "logps/ref_response": -113.06685638427734, "logps/rejected": -570.7511596679688, "loss": 0.6223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1473049521446228, "rewards/margins": 0.1629556119441986, "rewards/rejected": -0.3102605938911438, "step": 40 }, { "epoch": 0.3139717425431711, "grad_norm": 24.826701070167935, "learning_rate": 8.668815171119019e-08, "logits/chosen": 0.7617167234420776, "logits/rejected": 1.1797912120819092, "logps/chosen": -300.81610107421875, "logps/pi_response": -123.53846740722656, "logps/ref_response": -122.34455871582031, "logps/rejected": -541.4984130859375, "loss": 0.5948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16112999618053436, "rewards/margins": 0.23305478692054749, "rewards/rejected": -0.39418473839759827, "step": 50 }, { "epoch": 0.37676609105180536, "grad_norm": 19.513203799747355, "learning_rate": 7.840323733655779e-08, "logits/chosen": 0.766559898853302, "logits/rejected": 1.1061646938323975, "logps/chosen": 
-339.07257080078125, "logps/pi_response": -115.84468841552734, "logps/ref_response": -112.74748229980469, "logps/rejected": -619.0446166992188, "loss": 0.5697, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.33040159940719604, "rewards/margins": 0.37622275948524475, "rewards/rejected": -0.7066243886947632, "step": 60 }, { "epoch": 0.43956043956043955, "grad_norm": 17.524688561630104, "learning_rate": 6.87529601804781e-08, "logits/chosen": 0.9022890329360962, "logits/rejected": 1.2815022468566895, "logps/chosen": -287.8564758300781, "logps/pi_response": -117.10661315917969, "logps/ref_response": -110.9754409790039, "logps/rejected": -641.4493408203125, "loss": 0.5495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3896563947200775, "rewards/margins": 0.6181625723838806, "rewards/rejected": -1.0078189373016357, "step": 70 }, { "epoch": 0.5023547880690737, "grad_norm": 22.238613223093857, "learning_rate": 5.8201215576551086e-08, "logits/chosen": 0.7839330434799194, "logits/rejected": 1.2842929363250732, "logps/chosen": -352.50201416015625, "logps/pi_response": -128.81784057617188, "logps/ref_response": -121.79289245605469, "logps/rejected": -704.5844116210938, "loss": 0.5423, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.48940858244895935, "rewards/margins": 0.7611938118934631, "rewards/rejected": -1.2506022453308105, "step": 80 }, { "epoch": 0.565149136577708, "grad_norm": 20.05399901390604, "learning_rate": 4.725523300678362e-08, "logits/chosen": 0.9467741250991821, "logits/rejected": 1.3314892053604126, "logps/chosen": -390.7951965332031, "logps/pi_response": -120.1943359375, "logps/ref_response": -111.21858978271484, "logps/rejected": -724.9862060546875, "loss": 0.5435, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5949150919914246, "rewards/margins": 0.7683295011520386, "rewards/rejected": -1.363244652748108, "step": 90 }, { "epoch": 0.6279434850863422, "grad_norm": 15.488053965952703, "learning_rate": 3.644119323817915e-08, "logits/chosen": 0.9162198901176453, "logits/rejected": 1.237339973449707, "logps/chosen": -372.86065673828125, "logps/pi_response": -127.0195083618164, "logps/ref_response": -119.08543395996094, "logps/rejected": -618.2140502929688, "loss": 0.5384, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5800319910049438, "rewards/margins": 0.5143734812736511, "rewards/rejected": -1.0944055318832397, "step": 100 }, { "epoch": 0.6907378335949764, "grad_norm": 19.97941872789063, "learning_rate": 2.6278934458271994e-08, "logits/chosen": 0.8850401043891907, "logits/rejected": 1.172430157661438, "logps/chosen": -350.63067626953125, "logps/pi_response": -121.9805908203125, "logps/ref_response": -114.52159118652344, "logps/rejected": -664.2578735351562, "loss": 0.5634, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5176302194595337, "rewards/margins": 0.6778607368469238, "rewards/rejected": -1.1954909563064575, "step": 110 }, { "epoch": 0.7535321821036107, "grad_norm": 18.5617857059109, "learning_rate": 1.725696330273575e-08, "logits/chosen": 0.8677975535392761, "logits/rejected": 1.1380926370620728, "logps/chosen": -314.06951904296875, "logps/pi_response": -123.45503234863281, "logps/ref_response": -116.88655853271484, "logps/rejected": -674.5084228515625, "loss": 0.5261, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.43471240997314453, "rewards/margins": 0.7433832287788391, "rewards/rejected": -1.1780955791473389, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 
22.823575286285628, "learning_rate": 9.808972011828054e-09, "logits/chosen": 0.7576395273208618, "logits/rejected": 1.283482313156128, "logps/chosen": -336.9859619140625, "logps/pi_response": -129.77108764648438, "logps/ref_response": -124.2276382446289, "logps/rejected": -663.2884521484375, "loss": 0.547, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.43467482924461365, "rewards/margins": 0.6862285733222961, "rewards/rejected": -1.1209033727645874, "step": 130 }, { "epoch": 0.8791208791208791, "grad_norm": 16.657603305669547, "learning_rate": 4.2929905518041705e-09, "logits/chosen": 0.8893574476242065, "logits/rejected": 1.3175649642944336, "logps/chosen": -363.91876220703125, "logps/pi_response": -123.624267578125, "logps/ref_response": -117.37095642089844, "logps/rejected": -586.3743896484375, "loss": 0.5436, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4913572371006012, "rewards/margins": 0.5228551030158997, "rewards/rejected": -1.0142122507095337, "step": 140 }, { "epoch": 0.9419152276295133, "grad_norm": 19.65034837624422, "learning_rate": 9.741758728888216e-10, "logits/chosen": 0.7108520269393921, "logits/rejected": 1.1452034711837769, "logps/chosen": -374.49493408203125, "logps/pi_response": -138.4927978515625, "logps/ref_response": -132.68472290039062, "logps/rejected": -669.5167236328125, "loss": 0.5336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.45191335678100586, "rewards/margins": 0.7034718990325928, "rewards/rejected": -1.1553852558135986, "step": 150 }, { "epoch": 0.9984301412872841, "step": 159, "total_flos": 0.0, "train_loss": 0.5769100609065602, "train_runtime": 4399.847, "train_samples_per_second": 4.632, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }