FactAlign-LLaMA-3-8B / trainer_state.json
chaoweihuang's picture
Upload folder using huggingface_hub
eae09c2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 400,
"global_step": 975,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"count/fg_chosen": 30.85714340209961,
"count/fg_rejected": 7.4285712242126465,
"epoch": 0.010256410256410256,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.948674201965332,
"fg_logps/policy_chosen": -6.262445449829102,
"fg_logps/policy_rejected": -8.74467945098877,
"fg_logps/reference_KL": -11.94157600402832,
"fg_logps/reference_chosen": -6.2594828605651855,
"fg_logps/reference_rejected": -8.742448806762695,
"fg_loss": 0.8008173704147339,
"fg_rewards/chosen_sum": -0.008917576633393764,
"fg_rewards/rejected_sum": -0.0010543327080085874,
"grad_norm": 70.97090228694296,
"kl": 0.15787295997142792,
"learning_rate": 2.5110157309792834e-07,
"logps/chosen": -366.76351768092104,
"logps/rejected": -369.69268508184524,
"loss": 0.6347,
"rewards/chosen": 0.014076207813463713,
"rewards/margins": -0.000743936476552097,
"rewards/rejected": 0.01482014429001581,
"step": 10
},
{
"count/fg_chosen": 26.352941513061523,
"count/fg_rejected": 6.058823585510254,
"epoch": 0.020512820512820513,
"fg_kl": NaN,
"fg_logps/policy_KL": -10.825294494628906,
"fg_logps/policy_chosen": -5.95189905166626,
"fg_logps/policy_rejected": -5.48292350769043,
"fg_logps/reference_KL": -10.80107307434082,
"fg_logps/reference_chosen": -5.9293718338012695,
"fg_logps/reference_rejected": -5.445353984832764,
"fg_loss": 0.7557108402252197,
"fg_rewards/chosen_sum": -0.05455803498625755,
"fg_rewards/rejected_sum": -0.025491168722510338,
"grad_norm": 76.27540979070403,
"kl": 0.05270981788635254,
"learning_rate": 3.2669067855881653e-07,
"logps/chosen": -385.705078125,
"logps/rejected": -347.460890436747,
"loss": 0.5881,
"rewards/chosen": 0.02315927016270625,
"rewards/margins": 0.05135368041786262,
"rewards/rejected": -0.028194410255156368,
"step": 20
},
{
"count/fg_chosen": 27.0,
"count/fg_rejected": 7.176470756530762,
"epoch": 0.03076923076923077,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.925114631652832,
"fg_logps/policy_chosen": -6.261083602905273,
"fg_logps/policy_rejected": -7.534660816192627,
"fg_logps/reference_KL": -12.83480167388916,
"fg_logps/reference_chosen": -6.232151985168457,
"fg_logps/reference_rejected": -7.494457244873047,
"fg_loss": 0.8402522206306458,
"fg_rewards/chosen_sum": -0.0777626782655716,
"fg_rewards/rejected_sum": -0.03122476302087307,
"grad_norm": 112.80411347358924,
"kl": 0.02636871300637722,
"learning_rate": 3.709074707164929e-07,
"logps/chosen": -380.789990234375,
"logps/rejected": -344.6407958984375,
"loss": 0.5949,
"rewards/chosen": 0.017523756623268126,
"rewards/margins": 0.0999738484621048,
"rewards/rejected": -0.08245009183883667,
"step": 30
},
{
"count/fg_chosen": 32.60869598388672,
"count/fg_rejected": 8.136363983154297,
"epoch": 0.041025641025641026,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.220458030700684,
"fg_logps/policy_chosen": -5.84191370010376,
"fg_logps/policy_rejected": -6.710254192352295,
"fg_logps/reference_KL": -11.054718971252441,
"fg_logps/reference_chosen": -5.7772111892700195,
"fg_logps/reference_rejected": -6.645753860473633,
"fg_loss": 0.808053195476532,
"fg_rewards/chosen_sum": -0.1892387568950653,
"fg_rewards/rejected_sum": -0.05010434612631798,
"grad_norm": 50.84808526748507,
"kl": 0.0,
"learning_rate": 4.022797840197047e-07,
"logps/chosen": -383.87660435267856,
"logps/rejected": -374.25223581414474,
"loss": 0.6121,
"rewards/chosen": 0.04867589473724365,
"rewards/margins": 0.2174466848373413,
"rewards/rejected": -0.16877079010009766,
"step": 40
},
{
"count/fg_chosen": 32.79999923706055,
"count/fg_rejected": 9.533333778381348,
"epoch": 0.05128205128205128,
"fg_kl": NaN,
"fg_logps/policy_KL": -10.373883247375488,
"fg_logps/policy_chosen": -5.664600849151611,
"fg_logps/policy_rejected": -5.962148189544678,
"fg_logps/reference_KL": -10.145182609558105,
"fg_logps/reference_chosen": -5.588218688964844,
"fg_logps/reference_rejected": -5.897520065307617,
"fg_loss": 0.8677656054496765,
"fg_rewards/chosen_sum": -0.25619086623191833,
"fg_rewards/rejected_sum": -0.07288213074207306,
"grad_norm": 64.57457570643511,
"kl": 0.009938049130141735,
"learning_rate": 4.2661404073496845e-07,
"logps/chosen": -346.134577371988,
"logps/rejected": -368.41335227272725,
"loss": 0.5682,
"rewards/chosen": 0.2387204227677311,
"rewards/margins": 0.3148248284061905,
"rewards/rejected": -0.07610440563845944,
"step": 50
},
{
"count/fg_chosen": 26.294116973876953,
"count/fg_rejected": 5.470588207244873,
"epoch": 0.06153846153846154,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.107590675354004,
"fg_logps/policy_chosen": -6.114685535430908,
"fg_logps/policy_rejected": -7.69861364364624,
"fg_logps/reference_KL": -11.640838623046875,
"fg_logps/reference_chosen": -6.00569486618042,
"fg_logps/reference_rejected": -7.617056369781494,
"fg_loss": 0.7082093954086304,
"fg_rewards/chosen_sum": -0.27744388580322266,
"fg_rewards/rejected_sum": -0.06493094563484192,
"grad_norm": 49.41694776530553,
"kl": 0.0,
"learning_rate": 4.4649657617738114e-07,
"logps/chosen": -353.69510690789474,
"logps/rejected": -366.0445033482143,
"loss": 0.5548,
"rewards/chosen": 0.17124160967375102,
"rewards/margins": 0.3800723295761529,
"rewards/rejected": -0.20883071990240187,
"step": 60
},
{
"count/fg_chosen": 32.19047546386719,
"count/fg_rejected": 6.238095283508301,
"epoch": 0.07179487179487179,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.570515632629395,
"fg_logps/policy_chosen": -6.555771827697754,
"fg_logps/policy_rejected": -8.589411735534668,
"fg_logps/reference_KL": -12.056818008422852,
"fg_logps/reference_chosen": -6.406361103057861,
"fg_logps/reference_rejected": -8.353743553161621,
"fg_loss": 0.716395378112793,
"fg_rewards/chosen_sum": -0.46484148502349854,
"fg_rewards/rejected_sum": -0.1304880827665329,
"grad_norm": 68.38293723456421,
"kl": 0.0,
"learning_rate": 4.633070203674842e-07,
"logps/chosen": -319.37548828125,
"logps/rejected": -425.4502418154762,
"loss": 0.6056,
"rewards/chosen": -0.016339432252080816,
"rewards/margins": 0.2762198054551481,
"rewards/rejected": -0.2925592377072289,
"step": 70
},
{
"count/fg_chosen": 34.77777862548828,
"count/fg_rejected": 9.470588684082031,
"epoch": 0.08205128205128205,
"fg_kl": NaN,
"fg_logps/policy_KL": -10.578932762145996,
"fg_logps/policy_chosen": -5.850771427154541,
"fg_logps/policy_rejected": -7.156460762023926,
"fg_logps/reference_KL": -10.092942237854004,
"fg_logps/reference_chosen": -5.712654113769531,
"fg_logps/reference_rejected": -6.944825172424316,
"fg_loss": 0.8019319176673889,
"fg_rewards/chosen_sum": -0.4671042263507843,
"fg_rewards/rejected_sum": -0.17894208431243896,
"grad_norm": 39.81642138358546,
"kl": 0.0,
"learning_rate": 4.77868889480593e-07,
"logps/chosen": -317.1737351190476,
"logps/rejected": -377.7265625,
"loss": 0.5778,
"rewards/chosen": 0.11412754512968518,
"rewards/margins": 0.5135003988605394,
"rewards/rejected": -0.39937285373085424,
"step": 80
},
{
"count/fg_chosen": 31.0625,
"count/fg_rejected": 6.25,
"epoch": 0.09230769230769231,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.664382934570312,
"fg_logps/policy_chosen": -6.444965839385986,
"fg_logps/policy_rejected": -8.933675765991211,
"fg_logps/reference_KL": -10.985946655273438,
"fg_logps/reference_chosen": -6.227105140686035,
"fg_logps/reference_rejected": -8.714274406433105,
"fg_loss": 0.6460863351821899,
"fg_rewards/chosen_sum": -0.5869948267936707,
"fg_rewards/rejected_sum": -0.14423823356628418,
"grad_norm": 55.62438439854401,
"kl": 0.0,
"learning_rate": 4.907133683350575e-07,
"logps/chosen": -404.47572544642856,
"logps/rejected": -416.0439453125,
"loss": 0.5292,
"rewards/chosen": 0.30132850011189777,
"rewards/margins": 0.755986305705288,
"rewards/rejected": -0.4546578055933902,
"step": 90
},
{
"count/fg_chosen": 28.772727966308594,
"count/fg_rejected": 6.333333492279053,
"epoch": 0.10256410256410256,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.864214897155762,
"fg_logps/policy_chosen": -6.919012546539307,
"fg_logps/policy_rejected": -9.152473449707031,
"fg_logps/reference_KL": -12.052042007446289,
"fg_logps/reference_chosen": -6.679240703582764,
"fg_logps/reference_rejected": -8.906697273254395,
"fg_loss": 0.7610839605331421,
"fg_rewards/chosen_sum": -0.674491822719574,
"fg_rewards/rejected_sum": -0.1936338096857071,
"grad_norm": 63.5056444865937,
"kl": 0.0,
"learning_rate": 4.994298745724059e-07,
"logps/chosen": -340.2066359747024,
"logps/rejected": -390.9894377055921,
"loss": 0.6206,
"rewards/chosen": 0.13452401615324475,
"rewards/margins": 0.2098735791997503,
"rewards/rejected": -0.07534956304650557,
"step": 100
},
{
"count/fg_chosen": 32.42856979370117,
"count/fg_rejected": 5.736842155456543,
"epoch": 0.11282051282051282,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.667691230773926,
"fg_logps/policy_chosen": -6.325592041015625,
"fg_logps/policy_rejected": -7.5075507164001465,
"fg_logps/reference_KL": -11.706281661987305,
"fg_logps/reference_chosen": -6.0957818031311035,
"fg_logps/reference_rejected": -7.153979778289795,
"fg_loss": 0.6360421180725098,
"fg_rewards/chosen_sum": -0.7651198506355286,
"fg_rewards/rejected_sum": -0.1911022961139679,
"grad_norm": 45.1771736885249,
"kl": 0.0,
"learning_rate": 4.937286202964652e-07,
"logps/chosen": -358.7214664152299,
"logps/rejected": -373.4951305650685,
"loss": 0.5458,
"rewards/chosen": 0.35928555192618533,
"rewards/margins": 0.8201791045234402,
"rewards/rejected": -0.4608935525972549,
"step": 110
},
{
"count/fg_chosen": 36.45000076293945,
"count/fg_rejected": 10.210526466369629,
"epoch": 0.12307692307692308,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.05976676940918,
"fg_logps/policy_chosen": -6.244026184082031,
"fg_logps/policy_rejected": -7.9034271240234375,
"fg_logps/reference_KL": -11.134529113769531,
"fg_logps/reference_chosen": -6.0136542320251465,
"fg_logps/reference_rejected": -7.628241062164307,
"fg_loss": 0.881417453289032,
"fg_rewards/chosen_sum": -0.7559553384780884,
"fg_rewards/rejected_sum": -0.3012525737285614,
"grad_norm": 50.09822696941802,
"kl": 0.015017986297607422,
"learning_rate": 4.880273660205244e-07,
"logps/chosen": -320.32579210069446,
"logps/rejected": -356.7809392755682,
"loss": 0.601,
"rewards/chosen": 0.6363146040174696,
"rewards/margins": 0.7909924068836252,
"rewards/rejected": -0.15467780286615546,
"step": 120
},
{
"count/fg_chosen": 36.400001525878906,
"count/fg_rejected": 7.933333396911621,
"epoch": 0.13333333333333333,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.468809127807617,
"fg_logps/policy_chosen": -6.23488712310791,
"fg_logps/policy_rejected": -6.54256010055542,
"fg_logps/reference_KL": -10.525192260742188,
"fg_logps/reference_chosen": -6.026561260223389,
"fg_logps/reference_rejected": -6.267870903015137,
"fg_loss": 0.8011055588722229,
"fg_rewards/chosen_sum": -0.647752583026886,
"fg_rewards/rejected_sum": -0.20049738883972168,
"grad_norm": 50.115512333731104,
"kl": 0.03811788558959961,
"learning_rate": 4.823261117445838e-07,
"logps/chosen": -450.6170099431818,
"logps/rejected": -392.42621527777777,
"loss": 0.5078,
"rewards/chosen": 1.0679140090942383,
"rewards/margins": 1.1960734128952026,
"rewards/rejected": -0.12815940380096436,
"step": 130
},
{
"count/fg_chosen": 31.4761905670166,
"count/fg_rejected": 8.699999809265137,
"epoch": 0.14358974358974358,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.193472862243652,
"fg_logps/policy_chosen": -6.388846397399902,
"fg_logps/policy_rejected": -7.679136753082275,
"fg_logps/reference_KL": -11.09471607208252,
"fg_logps/reference_chosen": -6.169702529907227,
"fg_logps/reference_rejected": -7.332272529602051,
"fg_loss": 0.7396747469902039,
"fg_rewards/chosen_sum": -0.5886417627334595,
"fg_rewards/rejected_sum": -0.24698862433433533,
"grad_norm": 48.94452126464587,
"kl": 0.03351273387670517,
"learning_rate": 4.766248574686431e-07,
"logps/chosen": -334.752628279321,
"logps/rejected": -386.6670292721519,
"loss": 0.5475,
"rewards/chosen": 0.8302505869924286,
"rewards/margins": 1.2393360176688526,
"rewards/rejected": -0.40908543067642406,
"step": 140
},
{
"count/fg_chosen": 26.105262756347656,
"count/fg_rejected": 4.526315689086914,
"epoch": 0.15384615384615385,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.575007438659668,
"fg_logps/policy_chosen": -7.0048089027404785,
"fg_logps/policy_rejected": -8.861261367797852,
"fg_logps/reference_KL": -12.408534049987793,
"fg_logps/reference_chosen": -6.816770076751709,
"fg_logps/reference_rejected": -8.450145721435547,
"fg_loss": 0.7104328870773315,
"fg_rewards/chosen_sum": -0.4196644723415375,
"fg_rewards/rejected_sum": -0.17191696166992188,
"grad_norm": 49.69486730354108,
"kl": 0.05726609379053116,
"learning_rate": 4.7092360319270236e-07,
"logps/chosen": -418.9573688271605,
"logps/rejected": -390.3833069620253,
"loss": 0.5064,
"rewards/chosen": 0.9700225076557677,
"rewards/margins": 1.2821282176491542,
"rewards/rejected": -0.31210570999338655,
"step": 150
},
{
"count/fg_chosen": 25.764705657958984,
"count/fg_rejected": 4.125,
"epoch": 0.1641025641025641,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.1893892288208,
"fg_logps/policy_chosen": -6.999147415161133,
"fg_logps/policy_rejected": -7.968658447265625,
"fg_logps/reference_KL": -12.764519691467285,
"fg_logps/reference_chosen": -6.616185188293457,
"fg_logps/reference_rejected": -7.533709526062012,
"fg_loss": 0.7398098707199097,
"fg_rewards/chosen_sum": -0.8501734733581543,
"fg_rewards/rejected_sum": -0.15099674463272095,
"grad_norm": 41.227539280479604,
"kl": 0.0,
"learning_rate": 4.652223489167617e-07,
"logps/chosen": -377.22486787683823,
"logps/rejected": -458.93584408967394,
"loss": 0.4883,
"rewards/chosen": 0.8492268955006319,
"rewards/margins": 1.2064810067491458,
"rewards/rejected": -0.3572541112485139,
"step": 160
},
{
"count/fg_chosen": 33.06666564941406,
"count/fg_rejected": 7.615384578704834,
"epoch": 0.17435897435897435,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.796510696411133,
"fg_logps/policy_chosen": -6.570502758026123,
"fg_logps/policy_rejected": -7.280531883239746,
"fg_logps/reference_KL": -11.486601829528809,
"fg_logps/reference_chosen": -6.288327693939209,
"fg_logps/reference_rejected": -6.919613361358643,
"fg_loss": 0.7529634237289429,
"fg_rewards/chosen_sum": -0.7710135579109192,
"fg_rewards/rejected_sum": -0.2511799931526184,
"grad_norm": 50.55336505699433,
"kl": 0.0,
"learning_rate": 4.5952109464082095e-07,
"logps/chosen": -339.1629430259146,
"logps/rejected": -407.0223607772436,
"loss": 0.4934,
"rewards/chosen": 0.810060268495141,
"rewards/margins": 1.2479861452103855,
"rewards/rejected": -0.4379258767152444,
"step": 170
},
{
"count/fg_chosen": 32.875,
"count/fg_rejected": 8.25,
"epoch": 0.18461538461538463,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.054594993591309,
"fg_logps/policy_chosen": -6.169870853424072,
"fg_logps/policy_rejected": -7.339913368225098,
"fg_logps/reference_KL": -10.784111022949219,
"fg_logps/reference_chosen": -5.852520942687988,
"fg_logps/reference_rejected": -6.8034443855285645,
"fg_loss": 0.7260686755180359,
"fg_rewards/chosen_sum": -0.8828132152557373,
"fg_rewards/rejected_sum": -0.5087793469429016,
"grad_norm": 79.2618606536439,
"kl": 0.0,
"learning_rate": 4.5381984036488027e-07,
"logps/chosen": -298.9371427210366,
"logps/rejected": -415.6966145833333,
"loss": 0.5143,
"rewards/chosen": 0.3949350496617759,
"rewards/margins": 1.4249423386679356,
"rewards/rejected": -1.0300072890061598,
"step": 180
},
{
"count/fg_chosen": 31.363636016845703,
"count/fg_rejected": 6.409090995788574,
"epoch": 0.19487179487179487,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.836668968200684,
"fg_logps/policy_chosen": -6.473412990570068,
"fg_logps/policy_rejected": -7.306280612945557,
"fg_logps/reference_KL": -10.574777603149414,
"fg_logps/reference_chosen": -6.141035079956055,
"fg_logps/reference_rejected": -7.045315265655518,
"fg_loss": 0.7152173519134521,
"fg_rewards/chosen_sum": -0.9164342880249023,
"fg_rewards/rejected_sum": -0.21355971693992615,
"grad_norm": 62.51457716342411,
"kl": 0.0,
"learning_rate": 4.4811858608893954e-07,
"logps/chosen": -378.7162252286585,
"logps/rejected": -380.27271133814105,
"loss": 0.482,
"rewards/chosen": 0.9614362949278297,
"rewards/margins": 1.7487119516035108,
"rewards/rejected": -0.7872756566756811,
"step": 190
},
{
"count/fg_chosen": 27.789474487304688,
"count/fg_rejected": 6.263157844543457,
"epoch": 0.20512820512820512,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.003227233886719,
"fg_logps/policy_chosen": -6.706553936004639,
"fg_logps/policy_rejected": -7.728447437286377,
"fg_logps/reference_KL": -12.440956115722656,
"fg_logps/reference_chosen": -6.388577938079834,
"fg_logps/reference_rejected": -7.369418621063232,
"fg_loss": 0.7191720008850098,
"fg_rewards/chosen_sum": -0.8144214749336243,
"fg_rewards/rejected_sum": -0.220667764544487,
"grad_norm": 53.07709071243361,
"kl": 0.016681909561157227,
"learning_rate": 4.4241733181299887e-07,
"logps/chosen": -376.6458753360215,
"logps/rejected": -420.5650652985075,
"loss": 0.5059,
"rewards/chosen": 0.6015197179650748,
"rewards/margins": 1.3459980290767528,
"rewards/rejected": -0.744478311111678,
"step": 200
},
{
"count/fg_chosen": 35.38461685180664,
"count/fg_rejected": 9.230769157409668,
"epoch": 0.2153846153846154,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.624795913696289,
"fg_logps/policy_chosen": -6.518624305725098,
"fg_logps/policy_rejected": -6.696259021759033,
"fg_logps/reference_KL": -12.152335166931152,
"fg_logps/reference_chosen": -6.259435653686523,
"fg_logps/reference_rejected": -6.192153453826904,
"fg_loss": 0.7262544631958008,
"fg_rewards/chosen_sum": -0.6904063820838928,
"fg_rewards/rejected_sum": -0.5146002173423767,
"grad_norm": 53.74268239721948,
"kl": 0.0,
"learning_rate": 4.3671607753705814e-07,
"logps/chosen": -386.4969911317568,
"logps/rejected": -467.1247274709302,
"loss": 0.4207,
"rewards/chosen": 1.3281025242161106,
"rewards/margins": 2.8707167735120773,
"rewards/rejected": -1.5426142492959665,
"step": 210
},
{
"count/fg_chosen": 24.214284896850586,
"count/fg_rejected": 7.5,
"epoch": 0.22564102564102564,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.594059944152832,
"fg_logps/policy_chosen": -6.371434688568115,
"fg_logps/policy_rejected": -7.165874004364014,
"fg_logps/reference_KL": -10.995756149291992,
"fg_logps/reference_chosen": -6.087092399597168,
"fg_logps/reference_rejected": -6.730601787567139,
"fg_loss": 0.7775211334228516,
"fg_rewards/chosen_sum": -0.6547192931175232,
"fg_rewards/rejected_sum": -0.29393261671066284,
"grad_norm": 46.801027582446935,
"kl": 0.0,
"learning_rate": 4.3101482326111746e-07,
"logps/chosen": -344.7091128700658,
"logps/rejected": -398.7763671875,
"loss": 0.4334,
"rewards/chosen": 0.5747735876786081,
"rewards/margins": 2.151842461492782,
"rewards/rejected": -1.577068873814174,
"step": 220
},
{
"count/fg_chosen": 29.272727966308594,
"count/fg_rejected": 6.55555534362793,
"epoch": 0.2358974358974359,
"fg_kl": NaN,
"fg_logps/policy_KL": -11.965290069580078,
"fg_logps/policy_chosen": -6.299896717071533,
"fg_logps/policy_rejected": -7.80208158493042,
"fg_logps/reference_KL": -10.39778995513916,
"fg_logps/reference_chosen": -5.838742256164551,
"fg_logps/reference_rejected": -7.279135704040527,
"fg_loss": 0.6748415231704712,
"fg_rewards/chosen_sum": -1.2193199396133423,
"fg_rewards/rejected_sum": -0.36254453659057617,
"grad_norm": 45.120354528449354,
"kl": 0.0,
"learning_rate": 4.2531356898517673e-07,
"logps/chosen": -358.53559470663265,
"logps/rejected": -378.2181829637097,
"loss": 0.4775,
"rewards/chosen": 0.5253227389588648,
"rewards/margins": 2.214199475507504,
"rewards/rejected": -1.688876736548639,
"step": 230
},
{
"count/fg_chosen": 36.1875,
"count/fg_rejected": 7.199999809265137,
"epoch": 0.24615384615384617,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.929790496826172,
"fg_logps/policy_chosen": -6.825524806976318,
"fg_logps/policy_rejected": -7.702876091003418,
"fg_logps/reference_KL": -11.064438819885254,
"fg_logps/reference_chosen": -6.324741363525391,
"fg_logps/reference_rejected": -7.179370403289795,
"fg_loss": 0.7855690717697144,
"fg_rewards/chosen_sum": -1.7501062154769897,
"fg_rewards/rejected_sum": -0.501429557800293,
"grad_norm": 41.89950021868993,
"kl": 0.26182326674461365,
"learning_rate": 4.1961231470923605e-07,
"logps/chosen": -347.2475725446429,
"logps/rejected": -382.114453125,
"loss": 0.4878,
"rewards/chosen": 1.19210935320173,
"rewards/margins": 1.8935284205845424,
"rewards/rejected": -0.7014190673828125,
"step": 240
},
{
"count/fg_chosen": 29.399999618530273,
"count/fg_rejected": 7.266666889190674,
"epoch": 0.2564102564102564,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.258040428161621,
"fg_logps/policy_chosen": -6.943975925445557,
"fg_logps/policy_rejected": -7.556437969207764,
"fg_logps/reference_KL": -11.820046424865723,
"fg_logps/reference_chosen": -6.191815376281738,
"fg_logps/reference_rejected": -6.7565178871154785,
"fg_loss": 0.8718132972717285,
"fg_rewards/chosen_sum": -1.6466922760009766,
"fg_rewards/rejected_sum": -0.6005190014839172,
"grad_norm": 41.93986307357718,
"kl": 0.0,
"learning_rate": 4.139110604332953e-07,
"logps/chosen": -327.5843017578125,
"logps/rejected": -385.35126953125,
"loss": 0.4012,
"rewards/chosen": 1.1960113525390625,
"rewards/margins": 2.935526466369629,
"rewards/rejected": -1.7395151138305665,
"step": 250
},
{
"count/fg_chosen": 33.38461685180664,
"count/fg_rejected": 7.692307472229004,
"epoch": 0.26666666666666666,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.9034423828125,
"fg_logps/policy_chosen": -6.196410655975342,
"fg_logps/policy_rejected": -7.9534759521484375,
"fg_logps/reference_KL": -10.658266067504883,
"fg_logps/reference_chosen": -5.453469276428223,
"fg_logps/reference_rejected": -7.014803409576416,
"fg_loss": 0.9994122982025146,
"fg_rewards/chosen_sum": -1.9320785999298096,
"fg_rewards/rejected_sum": -0.8277677893638611,
"grad_norm": 32.044513351466335,
"kl": 0.0,
"learning_rate": 4.0820980615735465e-07,
"logps/chosen": -334.1531723484849,
"logps/rejected": -391.7591838430851,
"loss": 0.4248,
"rewards/chosen": 0.9683192859996449,
"rewards/margins": 2.343336492719226,
"rewards/rejected": -1.3750172067195812,
"step": 260
},
{
"count/fg_chosen": 25.071428298950195,
"count/fg_rejected": 5.142857074737549,
"epoch": 0.27692307692307694,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.50527286529541,
"fg_logps/policy_chosen": -7.0650315284729,
"fg_logps/policy_rejected": -8.667463302612305,
"fg_logps/reference_KL": -12.12157154083252,
"fg_logps/reference_chosen": -6.389377593994141,
"fg_logps/reference_rejected": -7.870638370513916,
"fg_loss": 0.7973106503486633,
"fg_rewards/chosen_sum": -1.2882376909255981,
"fg_rewards/rejected_sum": -0.4027543365955353,
"grad_norm": 68.19722281582398,
"kl": 0.020351696759462357,
"learning_rate": 4.025085518814139e-07,
"logps/chosen": -405.6799411525974,
"logps/rejected": -402.68011106927713,
"loss": 0.4322,
"rewards/chosen": 0.5332601472928926,
"rewards/margins": 2.588090307778585,
"rewards/rejected": -2.0548301604856927,
"step": 270
},
{
"count/fg_chosen": 31.959999084472656,
"count/fg_rejected": 9.0,
"epoch": 0.28717948717948716,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.018957138061523,
"fg_logps/policy_chosen": -6.5620503425598145,
"fg_logps/policy_rejected": -8.468265533447266,
"fg_logps/reference_KL": -9.829309463500977,
"fg_logps/reference_chosen": -5.755721569061279,
"fg_logps/reference_rejected": -7.4278693199157715,
"fg_loss": 0.8410596251487732,
"fg_rewards/chosen_sum": -2.1979994773864746,
"fg_rewards/rejected_sum": -1.1406161785125732,
"grad_norm": 42.49380746961441,
"kl": 0.0,
"learning_rate": 3.9680729760547324e-07,
"logps/chosen": -333.30290316358025,
"logps/rejected": -438.11288568037975,
"loss": 0.5378,
"rewards/chosen": 0.4965087513864776,
"rewards/margins": 2.022118021313297,
"rewards/rejected": -1.5256092699268196,
"step": 280
},
{
"count/fg_chosen": 31.38888931274414,
"count/fg_rejected": 9.166666984558105,
"epoch": 0.29743589743589743,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.498015403747559,
"fg_logps/policy_chosen": -6.605287551879883,
"fg_logps/policy_rejected": -7.190271377563477,
"fg_logps/reference_KL": -11.353104591369629,
"fg_logps/reference_chosen": -6.0449957847595215,
"fg_logps/reference_rejected": -6.696670055389404,
"fg_loss": 0.9254876971244812,
"fg_rewards/chosen_sum": -1.749000072479248,
"fg_rewards/rejected_sum": -0.5272374153137207,
"grad_norm": 34.10871711923179,
"kl": 0.0,
"learning_rate": 3.9110604332953246e-07,
"logps/chosen": -374.3355087652439,
"logps/rejected": -443.0232371794872,
"loss": 0.4904,
"rewards/chosen": 0.7626230658554449,
"rewards/margins": 2.348516941368766,
"rewards/rejected": -1.5858938755133214,
"step": 290
},
{
"count/fg_chosen": 29.75,
"count/fg_rejected": 7.6315789222717285,
"epoch": 0.3076923076923077,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.35108757019043,
"fg_logps/policy_chosen": -6.358473300933838,
"fg_logps/policy_rejected": -7.603394508361816,
"fg_logps/reference_KL": -10.117968559265137,
"fg_logps/reference_chosen": -5.656960964202881,
"fg_logps/reference_rejected": -6.988058090209961,
"fg_loss": 0.687099277973175,
"fg_rewards/chosen_sum": -1.7474342584609985,
"fg_rewards/rejected_sum": -0.6550286412239075,
"grad_norm": 45.87667859445539,
"kl": 0.0,
"learning_rate": 3.854047890535917e-07,
"logps/chosen": -291.94080528846155,
"logps/rejected": -454.1446265243902,
"loss": 0.5046,
"rewards/chosen": -0.05552493608914889,
"rewards/margins": 2.302112236702867,
"rewards/rejected": -2.357637172792016,
"step": 300
},
{
"count/fg_chosen": 29.052631378173828,
"count/fg_rejected": 6.157894611358643,
"epoch": 0.31794871794871793,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.878127098083496,
"fg_logps/policy_chosen": -6.794922828674316,
"fg_logps/policy_rejected": -8.655734062194824,
"fg_logps/reference_KL": -11.557811737060547,
"fg_logps/reference_chosen": -6.259448528289795,
"fg_logps/reference_rejected": -7.914809226989746,
"fg_loss": 0.7258095145225525,
"fg_rewards/chosen_sum": -1.6086143255233765,
"fg_rewards/rejected_sum": -0.5489023327827454,
"grad_norm": 43.21903612458155,
"kl": 0.0,
"learning_rate": 3.7970353477765105e-07,
"logps/chosen": -361.2526117369186,
"logps/rejected": -382.5555320945946,
"loss": 0.4464,
"rewards/chosen": 0.8190518756245457,
"rewards/margins": 3.1284712498327383,
"rewards/rejected": -2.3094193742081925,
"step": 310
},
{
"count/fg_chosen": 27.66666603088379,
"count/fg_rejected": 7.05555534362793,
"epoch": 0.3282051282051282,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.007112503051758,
"fg_logps/policy_chosen": -7.029316425323486,
"fg_logps/policy_rejected": -8.816922187805176,
"fg_logps/reference_KL": -11.414546012878418,
"fg_logps/reference_chosen": -6.1291069984436035,
"fg_logps/reference_rejected": -8.072396278381348,
"fg_loss": 0.6982179880142212,
"fg_rewards/chosen_sum": -1.864067554473877,
"fg_rewards/rejected_sum": -0.625391960144043,
"grad_norm": 49.56835553157457,
"kl": 0.0,
"learning_rate": 3.740022805017103e-07,
"logps/chosen": -308.43726245777026,
"logps/rejected": -462.0808502906977,
"loss": 0.4611,
"rewards/chosen": 0.7936567358068518,
"rewards/margins": 2.709756816430035,
"rewards/rejected": -1.9161000806231832,
"step": 320
},
{
"count/fg_chosen": 29.941177368164062,
"count/fg_rejected": 6.125,
"epoch": 0.3384615384615385,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.976774215698242,
"fg_logps/policy_chosen": -6.938152313232422,
"fg_logps/policy_rejected": -7.871105670928955,
"fg_logps/reference_KL": -11.392000198364258,
"fg_logps/reference_chosen": -6.208968162536621,
"fg_logps/reference_rejected": -6.786693096160889,
"fg_loss": 0.8347401022911072,
"fg_rewards/chosen_sum": -2.1311469078063965,
"fg_rewards/rejected_sum": -0.6044603586196899,
"grad_norm": 23.82687521831931,
"kl": 0.0,
"learning_rate": 3.6830102622576964e-07,
"logps/chosen": -316.02463269589555,
"logps/rejected": -366.19430443548384,
"loss": 0.4596,
"rewards/chosen": 0.36891575713655844,
"rewards/margins": 2.952398068754502,
"rewards/rejected": -2.5834823116179435,
"step": 330
},
{
"count/fg_chosen": 31.647058486938477,
"count/fg_rejected": 5.0,
"epoch": 0.3487179487179487,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.484383583068848,
"fg_logps/policy_chosen": -6.541139602661133,
"fg_logps/policy_rejected": -8.020905494689941,
"fg_logps/reference_KL": -11.032403945922852,
"fg_logps/reference_chosen": -5.8485236167907715,
"fg_logps/reference_rejected": -7.10052490234375,
"fg_loss": 0.7300294041633606,
"fg_rewards/chosen_sum": -1.9582923650741577,
"fg_rewards/rejected_sum": -0.4550693929195404,
"grad_norm": 44.41360203093714,
"kl": 0.0,
"learning_rate": 3.625997719498289e-07,
"logps/chosen": -337.7124953497024,
"logps/rejected": -364.32930715460526,
"loss": 0.4522,
"rewards/chosen": 0.633344604855492,
"rewards/margins": 2.6144102545907923,
"rewards/rejected": -1.9810656497353,
"step": 340
},
{
"count/fg_chosen": 34.35293960571289,
"count/fg_rejected": 8.764705657958984,
"epoch": 0.358974358974359,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.230541229248047,
"fg_logps/policy_chosen": -7.154247760772705,
"fg_logps/policy_rejected": -7.5230560302734375,
"fg_logps/reference_KL": -11.367884635925293,
"fg_logps/reference_chosen": -6.23461389541626,
"fg_logps/reference_rejected": -6.495339870452881,
"fg_loss": 0.853543221950531,
"fg_rewards/chosen_sum": -2.904083490371704,
"fg_rewards/rejected_sum": -0.9132155179977417,
"grad_norm": 30.3057952063642,
"kl": 0.0,
"learning_rate": 3.5689851767388824e-07,
"logps/chosen": -394.6584884129214,
"logps/rejected": -393.08568992077466,
"loss": 0.4936,
"rewards/chosen": 0.6523181615250834,
"rewards/margins": 2.7101455334716062,
"rewards/rejected": -2.057827371946523,
"step": 350
},
{
"count/fg_chosen": 27.30769157409668,
"count/fg_rejected": 5.0,
"epoch": 0.36923076923076925,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.898691177368164,
"fg_logps/policy_chosen": -7.163309574127197,
"fg_logps/policy_rejected": -6.87624979019165,
"fg_logps/reference_KL": -10.903498649597168,
"fg_logps/reference_chosen": -6.516329288482666,
"fg_logps/reference_rejected": -6.057809829711914,
"fg_loss": 0.7118747234344482,
"fg_rewards/chosen_sum": -1.5648789405822754,
"fg_rewards/rejected_sum": -0.6483681201934814,
"grad_norm": 37.36414008433845,
"kl": 0.0,
"learning_rate": 3.511972633979475e-07,
"logps/chosen": -316.7720240542763,
"logps/rejected": -391.03125,
"loss": 0.4128,
"rewards/chosen": 1.343739258615594,
"rewards/margins": 2.428480033587692,
"rewards/rejected": -1.0847407749720983,
"step": 360
},
{
"count/fg_chosen": 31.190475463867188,
"count/fg_rejected": 7.050000190734863,
"epoch": 0.37948717948717947,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.466145515441895,
"fg_logps/policy_chosen": -6.301754951477051,
"fg_logps/policy_rejected": -8.657445907592773,
"fg_logps/reference_KL": -10.610649108886719,
"fg_logps/reference_chosen": -5.920670509338379,
"fg_logps/reference_rejected": -8.133522987365723,
"fg_loss": 0.7434370517730713,
"fg_rewards/chosen_sum": -0.8639131188392639,
"fg_rewards/rejected_sum": -0.45794281363487244,
"grad_norm": 28.069125042652725,
"kl": 0.0,
"learning_rate": 3.4549600912200683e-07,
"logps/chosen": -327.7210542485955,
"logps/rejected": -410.5584286971831,
"loss": 0.4659,
"rewards/chosen": 1.6249963996115695,
"rewards/margins": 2.5865509134622515,
"rewards/rejected": -0.9615545138506822,
"step": 370
},
{
"count/fg_chosen": 34.900001525878906,
"count/fg_rejected": 8.899999618530273,
"epoch": 0.38974358974358975,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.721631050109863,
"fg_logps/policy_chosen": -6.37019681930542,
"fg_logps/policy_rejected": -6.883843898773193,
"fg_logps/reference_KL": -11.473298072814941,
"fg_logps/reference_chosen": -6.092167854309082,
"fg_logps/reference_rejected": -6.5457611083984375,
"fg_loss": 0.8677409887313843,
"fg_rewards/chosen_sum": -0.9150064587593079,
"fg_rewards/rejected_sum": -0.41792982816696167,
"grad_norm": 42.9333549511222,
"kl": 0.0,
"learning_rate": 3.397947548460661e-07,
"logps/chosen": -340.5213176448171,
"logps/rejected": -451.7316706730769,
"loss": 0.4514,
"rewards/chosen": 1.505601836413872,
"rewards/margins": 2.4657741472674877,
"rewards/rejected": -0.9601723108536158,
"step": 380
},
{
"count/fg_chosen": 29.55555534362793,
"count/fg_rejected": 7.0,
"epoch": 0.4,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.328727722167969,
"fg_logps/policy_chosen": -6.023360729217529,
"fg_logps/policy_rejected": -7.300014972686768,
"fg_logps/reference_KL": -10.421500205993652,
"fg_logps/reference_chosen": -5.618011474609375,
"fg_logps/reference_rejected": -6.620323181152344,
"fg_loss": 0.7781895399093628,
"fg_rewards/chosen_sum": -0.9389697313308716,
"fg_rewards/rejected_sum": -0.5253291130065918,
"grad_norm": 42.84097066535792,
"kl": 0.0,
"learning_rate": 3.340935005701254e-07,
"logps/chosen": -364.2041149400685,
"logps/rejected": -441.3032956178161,
"loss": 0.4992,
"rewards/chosen": 1.0798116187526756,
"rewards/margins": 2.0560204480843476,
"rewards/rejected": -0.9762088293316721,
"step": 390
},
{
"count/fg_chosen": 26.72222137451172,
"count/fg_rejected": 6.647058963775635,
"epoch": 0.41025641025641024,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.881956100463867,
"fg_logps/policy_chosen": -6.5600266456604,
"fg_logps/policy_rejected": -9.088929176330566,
"fg_logps/reference_KL": -11.40665054321289,
"fg_logps/reference_chosen": -5.7827043533325195,
"fg_logps/reference_rejected": -8.174890518188477,
"fg_loss": 0.8777969479560852,
"fg_rewards/chosen_sum": -1.556259036064148,
"fg_rewards/rejected_sum": -0.7908374667167664,
"grad_norm": 37.72027450146743,
"kl": 0.0,
"learning_rate": 3.283922462941847e-07,
"logps/chosen": -365.9978794642857,
"logps/rejected": -436.88159722222224,
"loss": 0.4478,
"rewards/chosen": 1.2335292271205358,
"rewards/margins": 2.4691440885029143,
"rewards/rejected": -1.2356148613823785,
"step": 400
},
{
"epoch": 0.41025641025641024,
"eval_count/fg_chosen": 30.183246612548828,
"eval_count/fg_rejected": 6.92391300201416,
"eval_fg_kl": NaN,
"eval_fg_logps/policy_KL": -13.678318977355957,
"eval_fg_logps/policy_chosen": -6.628693580627441,
"eval_fg_logps/policy_rejected": -8.363188743591309,
"eval_fg_logps/reference_KL": -11.47359848022461,
"eval_fg_logps/reference_chosen": -6.041894912719727,
"eval_fg_logps/reference_rejected": -7.58065938949585,
"eval_fg_loss": 0.7654322385787964,
"eval_fg_rewards/chosen_sum": -1.3938791751861572,
"eval_fg_rewards/rejected_sum": -0.6767725944519043,
"eval_kl": 0.02797871269285679,
"eval_logps/chosen": -340.2313144329897,
"eval_logps/rejected": -400.85385283893396,
"eval_loss": 0.4325231909751892,
"eval_rewards/chosen": 1.316945568665879,
"eval_rewards/margins": 3.0533541780318263,
"eval_rewards/rejected": -1.7364086093659472,
"eval_runtime": 492.9712,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 0.88,
"step": 400
},
{
"count/fg_chosen": 26.549999237060547,
"count/fg_rejected": 6.25,
"epoch": 0.4205128205128205,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.237287521362305,
"fg_logps/policy_chosen": -6.820374488830566,
"fg_logps/policy_rejected": -8.927366256713867,
"fg_logps/reference_KL": -12.311280250549316,
"fg_logps/reference_chosen": -5.970030784606934,
"fg_logps/reference_rejected": -7.938845634460449,
"fg_loss": 0.8091492056846619,
"fg_rewards/chosen_sum": -1.5172061920166016,
"fg_rewards/rejected_sum": -0.7380185723304749,
"grad_norm": 53.93549024472509,
"kl": 0.0,
"learning_rate": 3.22690992018244e-07,
"logps/chosen": -324.6869419642857,
"logps/rejected": -405.54951054216866,
"loss": 0.4023,
"rewards/chosen": 1.265897478376116,
"rewards/margins": 3.5306600810328366,
"rewards/rejected": -2.2647626026567207,
"step": 410
},
{
"count/fg_chosen": 23.399999618530273,
"count/fg_rejected": 6.133333206176758,
"epoch": 0.4307692307692308,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.516840934753418,
"fg_logps/policy_chosen": -7.477798938751221,
"fg_logps/policy_rejected": -8.535691261291504,
"fg_logps/reference_KL": -11.834728240966797,
"fg_logps/reference_chosen": -6.342043876647949,
"fg_logps/reference_rejected": -7.10928201675415,
"fg_loss": 0.8881044387817383,
"fg_rewards/chosen_sum": -2.0730390548706055,
"fg_rewards/rejected_sum": -0.9379479289054871,
"grad_norm": 37.934930263204464,
"kl": 0.04062976688146591,
"learning_rate": 3.169897377423033e-07,
"logps/chosen": -352.2984280873494,
"logps/rejected": -437.3393871753247,
"loss": 0.4353,
"rewards/chosen": 0.722329794642437,
"rewards/margins": 3.3173880871799777,
"rewards/rejected": -2.5950582925375407,
"step": 420
},
{
"count/fg_chosen": 29.41176414489746,
"count/fg_rejected": 5.882352828979492,
"epoch": 0.441025641025641,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.61406421661377,
"fg_logps/policy_chosen": -6.908777713775635,
"fg_logps/policy_rejected": -9.259625434875488,
"fg_logps/reference_KL": -10.859848976135254,
"fg_logps/reference_chosen": -5.828268527984619,
"fg_logps/reference_rejected": -7.893514156341553,
"fg_loss": 0.7920488119125366,
"fg_rewards/chosen_sum": -2.7851388454437256,
"fg_rewards/rejected_sum": -0.8430763483047485,
"grad_norm": 31.263236198590103,
"kl": 0.20134501159191132,
"learning_rate": 3.112884834663626e-07,
"logps/chosen": -338.0028831845238,
"logps/rejected": -437.03207236842104,
"loss": 0.4237,
"rewards/chosen": 1.1830097380138578,
"rewards/margins": 3.346872267567723,
"rewards/rejected": -2.163862529553865,
"step": 430
},
{
"count/fg_chosen": 31.16666603088379,
"count/fg_rejected": 5.583333492279053,
"epoch": 0.4512820512820513,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.71406078338623,
"fg_logps/policy_chosen": -6.013169765472412,
"fg_logps/policy_rejected": -7.012132167816162,
"fg_logps/reference_KL": -10.454259872436523,
"fg_logps/reference_chosen": -5.288631439208984,
"fg_logps/reference_rejected": -6.411142826080322,
"fg_loss": 0.8265379071235657,
"fg_rewards/chosen_sum": -1.9277740716934204,
"fg_rewards/rejected_sum": -0.4081937372684479,
"grad_norm": 33.498855345311206,
"kl": 0.0,
"learning_rate": 3.055872291904219e-07,
"logps/chosen": -433.9810126582278,
"logps/rejected": -409.45997299382714,
"loss": 0.4124,
"rewards/chosen": 0.23994885215276404,
"rewards/margins": 2.5844254342442956,
"rewards/rejected": -2.3444765820915316,
"step": 440
},
{
"count/fg_chosen": 28.214284896850586,
"count/fg_rejected": 7.0714287757873535,
"epoch": 0.46153846153846156,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.437283515930176,
"fg_logps/policy_chosen": -7.329289436340332,
"fg_logps/policy_rejected": -7.971861839294434,
"fg_logps/reference_KL": -11.504508018493652,
"fg_logps/reference_chosen": -6.307824611663818,
"fg_logps/reference_rejected": -7.138981342315674,
"fg_loss": 0.8858200907707214,
"fg_rewards/chosen_sum": -2.4875144958496094,
"fg_rewards/rejected_sum": -0.8280299305915833,
"grad_norm": 31.189671182424355,
"kl": 0.0,
"learning_rate": 2.998859749144812e-07,
"logps/chosen": -298.6210195806962,
"logps/rejected": -419.6058545524691,
"loss": 0.4201,
"rewards/chosen": 0.8161652963372725,
"rewards/margins": 3.091305375788468,
"rewards/rejected": -2.275140079451196,
"step": 450
},
{
"count/fg_chosen": 32.3636360168457,
"count/fg_rejected": 4.7272725105285645,
"epoch": 0.4717948717948718,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.52069091796875,
"fg_logps/policy_chosen": -6.610226154327393,
"fg_logps/policy_rejected": -10.518632888793945,
"fg_logps/reference_KL": -10.879704475402832,
"fg_logps/reference_chosen": -5.88496732711792,
"fg_logps/reference_rejected": -9.407367706298828,
"fg_loss": 0.6551663279533386,
"fg_rewards/chosen_sum": -2.258774757385254,
"fg_rewards/rejected_sum": -0.5948446989059448,
"grad_norm": 33.51878345246348,
"kl": 0.029797697439789772,
"learning_rate": 2.941847206385404e-07,
"logps/chosen": -332.60402610085225,
"logps/rejected": -395.40771484375,
"loss": 0.4001,
"rewards/chosen": 1.4196222478693181,
"rewards/margins": 3.0095812864977907,
"rewards/rejected": -1.5899590386284723,
"step": 460
},
{
"count/fg_chosen": 31.5,
"count/fg_rejected": 5.800000190734863,
"epoch": 0.48205128205128206,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.285706520080566,
"fg_logps/policy_chosen": -6.048055648803711,
"fg_logps/policy_rejected": -8.306843757629395,
"fg_logps/reference_KL": -10.14644718170166,
"fg_logps/reference_chosen": -5.535238742828369,
"fg_logps/reference_rejected": -7.302800178527832,
"fg_loss": 0.6428090333938599,
"fg_rewards/chosen_sum": -1.36484956741333,
"fg_rewards/rejected_sum": -0.6093672513961792,
"grad_norm": 26.805133801472735,
"kl": 0.17673882842063904,
"learning_rate": 2.8848346636259974e-07,
"logps/chosen": -317.91790291432585,
"logps/rejected": -368.47114326584506,
"loss": 0.4744,
"rewards/chosen": 1.5331906093640273,
"rewards/margins": 2.4303819928332997,
"rewards/rejected": -0.8971913834692726,
"step": 470
},
{
"count/fg_chosen": 39.266666412353516,
"count/fg_rejected": 7.4666666984558105,
"epoch": 0.49230769230769234,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.738459587097168,
"fg_logps/policy_chosen": -6.287077903747559,
"fg_logps/policy_rejected": -6.787537097930908,
"fg_logps/reference_KL": -11.216691017150879,
"fg_logps/reference_chosen": -5.826966762542725,
"fg_logps/reference_rejected": -6.379599571228027,
"fg_loss": 0.7447641491889954,
"fg_rewards/chosen_sum": -0.980557382106781,
"fg_rewards/rejected_sum": -0.506367564201355,
"grad_norm": 39.97199247238054,
"kl": 0.0,
"learning_rate": 2.82782212086659e-07,
"logps/chosen": -401.7761665239726,
"logps/rejected": -363.2634698275862,
"loss": 0.5103,
"rewards/chosen": 1.34750000418049,
"rewards/margins": 1.8919780588848274,
"rewards/rejected": -0.5444780547043373,
"step": 480
},
{
"count/fg_chosen": 26.875,
"count/fg_rejected": 5.400000095367432,
"epoch": 0.5025641025641026,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.641780853271484,
"fg_logps/policy_chosen": -8.056283950805664,
"fg_logps/policy_rejected": -8.251357078552246,
"fg_logps/reference_KL": -13.59717845916748,
"fg_logps/reference_chosen": -7.162622928619385,
"fg_logps/reference_rejected": -7.323818683624268,
"fg_loss": 0.8411279916763306,
"fg_rewards/chosen_sum": -1.9043647050857544,
"fg_rewards/rejected_sum": -0.7839928269386292,
"grad_norm": 29.60091559742249,
"kl": 0.22372007369995117,
"learning_rate": 2.7708095781071834e-07,
"logps/chosen": -324.56468441611844,
"logps/rejected": -461.18638392857144,
"loss": 0.4437,
"rewards/chosen": 1.3998164126747532,
"rewards/margins": 2.890947968141178,
"rewards/rejected": -1.4911315554664248,
"step": 490
},
{
"count/fg_chosen": 27.3157901763916,
"count/fg_rejected": 5.294117450714111,
"epoch": 0.5128205128205128,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.019055366516113,
"fg_logps/policy_chosen": -6.862349033355713,
"fg_logps/policy_rejected": -8.392266273498535,
"fg_logps/reference_KL": -11.065834999084473,
"fg_logps/reference_chosen": -6.182069301605225,
"fg_logps/reference_rejected": -7.583798408508301,
"fg_loss": 0.8486608266830444,
"fg_rewards/chosen_sum": -1.7188913822174072,
"fg_rewards/rejected_sum": -0.4710962176322937,
"grad_norm": 37.93308715806046,
"kl": 0.0,
"learning_rate": 2.713797035347776e-07,
"logps/chosen": -336.33485504518075,
"logps/rejected": -412.9320211038961,
"loss": 0.4063,
"rewards/chosen": 1.6464567988751881,
"rewards/margins": 3.429747315121637,
"rewards/rejected": -1.783290516246449,
"step": 500
},
{
"count/fg_chosen": 32.52941131591797,
"count/fg_rejected": 6.1875,
"epoch": 0.5230769230769231,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.041118621826172,
"fg_logps/policy_chosen": -6.475778579711914,
"fg_logps/policy_rejected": -8.933878898620605,
"fg_logps/reference_KL": -11.386185646057129,
"fg_logps/reference_chosen": -6.088446617126465,
"fg_logps/reference_rejected": -8.2723970413208,
"fg_loss": 0.6639065742492676,
"fg_rewards/chosen_sum": -1.039247989654541,
"fg_rewards/rejected_sum": -0.31398898363113403,
"grad_norm": 51.003351409032156,
"kl": 0.0,
"learning_rate": 2.6567844925883693e-07,
"logps/chosen": -315.5387290396341,
"logps/rejected": -434.9411057692308,
"loss": 0.4328,
"rewards/chosen": 1.3493434626881669,
"rewards/margins": 2.8553199195503964,
"rewards/rejected": -1.5059764568622296,
"step": 510
},
{
"count/fg_chosen": 29.549999237060547,
"count/fg_rejected": 7.300000190734863,
"epoch": 0.5333333333333333,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.461477279663086,
"fg_logps/policy_chosen": -7.2638678550720215,
"fg_logps/policy_rejected": -8.44337272644043,
"fg_logps/reference_KL": -12.010942459106445,
"fg_logps/reference_chosen": -6.417178153991699,
"fg_logps/reference_rejected": -7.042668342590332,
"fg_loss": 0.8494647145271301,
"fg_rewards/chosen_sum": -2.0671496391296387,
"fg_rewards/rejected_sum": -0.9835360646247864,
"grad_norm": 41.33647069198984,
"kl": 0.0,
"learning_rate": 2.599771949828962e-07,
"logps/chosen": -332.418183117378,
"logps/rejected": -371.52498998397436,
"loss": 0.455,
"rewards/chosen": 1.320475787651248,
"rewards/margins": 3.076387394659962,
"rewards/rejected": -1.755911607008714,
"step": 520
},
{
"count/fg_chosen": 30.526315689086914,
"count/fg_rejected": 9.11111068725586,
"epoch": 0.5435897435897435,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.753694534301758,
"fg_logps/policy_chosen": -6.313483238220215,
"fg_logps/policy_rejected": -8.319221496582031,
"fg_logps/reference_KL": -10.8753080368042,
"fg_logps/reference_chosen": -5.6761603355407715,
"fg_logps/reference_rejected": -7.430839538574219,
"fg_loss": 0.7786957621574402,
"fg_rewards/chosen_sum": -1.3800252676010132,
"fg_rewards/rejected_sum": -1.1956380605697632,
"grad_norm": 34.93340197656234,
"kl": 0.0,
"learning_rate": 2.542759407069555e-07,
"logps/chosen": -311.92927758487656,
"logps/rejected": -372.22604331487344,
"loss": 0.448,
"rewards/chosen": 1.0696545824592496,
"rewards/margins": 3.1661021045864253,
"rewards/rejected": -2.0964475221271757,
"step": 530
},
{
"count/fg_chosen": 26.764705657958984,
"count/fg_rejected": 6.352941036224365,
"epoch": 0.5538461538461539,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.03756046295166,
"fg_logps/policy_chosen": -6.464415073394775,
"fg_logps/policy_rejected": -7.652871608734131,
"fg_logps/reference_KL": -10.35094165802002,
"fg_logps/reference_chosen": -5.7773261070251465,
"fg_logps/reference_rejected": -6.825320720672607,
"fg_loss": 0.8047051429748535,
"fg_rewards/chosen_sum": -1.6725194454193115,
"fg_rewards/rejected_sum": -0.7678513526916504,
"grad_norm": 41.240539642737886,
"kl": 0.0,
"learning_rate": 2.485746864310148e-07,
"logps/chosen": -345.1589215158046,
"logps/rejected": -397.16462435787673,
"loss": 0.4683,
"rewards/chosen": 1.1091806696749282,
"rewards/margins": 2.9815141440301294,
"rewards/rejected": -1.8723334743552011,
"step": 540
},
{
"count/fg_chosen": 36.0,
"count/fg_rejected": 7.176470756530762,
"epoch": 0.5641025641025641,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.34708023071289,
"fg_logps/policy_chosen": -6.144561767578125,
"fg_logps/policy_rejected": -7.657267093658447,
"fg_logps/reference_KL": -10.591930389404297,
"fg_logps/reference_chosen": -5.485869884490967,
"fg_logps/reference_rejected": -6.727408409118652,
"fg_loss": 0.7655860185623169,
"fg_rewards/chosen_sum": -1.8279484510421753,
"fg_rewards/rejected_sum": -0.5319306254386902,
"grad_norm": 34.418931407344175,
"kl": 0.0,
"learning_rate": 2.428734321550741e-07,
"logps/chosen": -332.24548669763516,
"logps/rejected": -384.9080214389535,
"loss": 0.4533,
"rewards/chosen": 1.4185297166978992,
"rewards/margins": 3.59055198175513,
"rewards/rejected": -2.172022265057231,
"step": 550
},
{
"count/fg_chosen": 30.6842098236084,
"count/fg_rejected": 5.842105388641357,
"epoch": 0.5743589743589743,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.044573783874512,
"fg_logps/policy_chosen": -6.35300874710083,
"fg_logps/policy_rejected": -8.149062156677246,
"fg_logps/reference_KL": -9.564573287963867,
"fg_logps/reference_chosen": -5.577574253082275,
"fg_logps/reference_rejected": -7.246463298797607,
"fg_loss": 0.7160053849220276,
"fg_rewards/chosen_sum": -1.8285123109817505,
"fg_rewards/rejected_sum": -0.428337961435318,
"grad_norm": 32.969063143022574,
"kl": 0.0,
"learning_rate": 2.371721778791334e-07,
"logps/chosen": -404.944683908046,
"logps/rejected": -394.23758561643837,
"loss": 0.4521,
"rewards/chosen": 1.1878378857141254,
"rewards/margins": 2.883744642389532,
"rewards/rejected": -1.6959067566754067,
"step": 560
},
{
"count/fg_chosen": 29.5,
"count/fg_rejected": 6.733333110809326,
"epoch": 0.5846153846153846,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.527681350708008,
"fg_logps/policy_chosen": -6.548896789550781,
"fg_logps/policy_rejected": -6.040011882781982,
"fg_logps/reference_KL": -9.865092277526855,
"fg_logps/reference_chosen": -5.893582344055176,
"fg_logps/reference_rejected": -5.497416019439697,
"fg_loss": 0.5755335092544556,
"fg_rewards/chosen_sum": -1.5131157636642456,
"fg_rewards/rejected_sum": -0.35849064588546753,
"grad_norm": 40.43474566516004,
"kl": 0.0,
"learning_rate": 2.314709236031927e-07,
"logps/chosen": -354.1348353794643,
"logps/rejected": -417.60911800986844,
"loss": 0.4105,
"rewards/chosen": 0.7818209330240885,
"rewards/margins": 3.425596471418414,
"rewards/rejected": -2.6437755383943258,
"step": 570
},
{
"count/fg_chosen": 31.428571701049805,
"count/fg_rejected": 7.599999904632568,
"epoch": 0.5948717948717949,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.412928581237793,
"fg_logps/policy_chosen": -6.371100902557373,
"fg_logps/policy_rejected": -7.6736016273498535,
"fg_logps/reference_KL": -10.896791458129883,
"fg_logps/reference_chosen": -5.65976095199585,
"fg_logps/reference_rejected": -6.982507228851318,
"fg_loss": 0.7268858551979065,
"fg_rewards/chosen_sum": -1.6253752708435059,
"fg_rewards/rejected_sum": -0.8460947871208191,
"grad_norm": 57.39186053322085,
"kl": 0.0,
"learning_rate": 2.2576966932725198e-07,
"logps/chosen": -291.66895736882714,
"logps/rejected": -384.9341623813291,
"loss": 0.4683,
"rewards/chosen": 1.6188120900848766,
"rewards/margins": 2.9102534159847977,
"rewards/rejected": -1.2914413258999209,
"step": 580
},
{
"count/fg_chosen": 28.904762268066406,
"count/fg_rejected": 6.526315689086914,
"epoch": 0.6051282051282051,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.970428466796875,
"fg_logps/policy_chosen": -6.568854808807373,
"fg_logps/policy_rejected": -8.221002578735352,
"fg_logps/reference_KL": -11.08731746673584,
"fg_logps/reference_chosen": -5.649308204650879,
"fg_logps/reference_rejected": -7.402557373046875,
"fg_loss": 0.705423891544342,
"fg_rewards/chosen_sum": -2.060758590698242,
"fg_rewards/rejected_sum": -0.8252547979354858,
"grad_norm": 35.44361919546434,
"kl": 0.0,
"learning_rate": 2.2006841505131128e-07,
"logps/chosen": -442.12862723214283,
"logps/rejected": -406.4741981907895,
"loss": 0.4365,
"rewards/chosen": 1.5486488342285156,
"rewards/margins": 3.623551418906764,
"rewards/rejected": -2.0749025846782483,
"step": 590
},
{
"count/fg_chosen": 27.83333396911621,
"count/fg_rejected": 6.2727274894714355,
"epoch": 0.6153846153846154,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.578009605407715,
"fg_logps/policy_chosen": -7.306002140045166,
"fg_logps/policy_rejected": -8.329404830932617,
"fg_logps/reference_KL": -11.412123680114746,
"fg_logps/reference_chosen": -6.204747676849365,
"fg_logps/reference_rejected": -6.661261081695557,
"fg_loss": 0.649915337562561,
"fg_rewards/chosen_sum": -2.443979263305664,
"fg_rewards/rejected_sum": -0.9118065237998962,
"grad_norm": 48.33661978525442,
"kl": 0.0,
"learning_rate": 2.1436716077537057e-07,
"logps/chosen": -353.3546720805921,
"logps/rejected": -475.7469773065476,
"loss": 0.4239,
"rewards/chosen": 1.3294219970703125,
"rewards/margins": 3.233419145856585,
"rewards/rejected": -1.9039971487862724,
"step": 600
},
{
"count/fg_chosen": 25.647058486938477,
"count/fg_rejected": 6.470588207244873,
"epoch": 0.6256410256410256,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.512959480285645,
"fg_logps/policy_chosen": -6.967007160186768,
"fg_logps/policy_rejected": -8.427515029907227,
"fg_logps/reference_KL": -10.780054092407227,
"fg_logps/reference_chosen": -6.145755290985107,
"fg_logps/reference_rejected": -7.377350807189941,
"fg_loss": 0.8846892714500427,
"fg_rewards/chosen_sum": -1.872863531112671,
"fg_rewards/rejected_sum": -0.8110222816467285,
"grad_norm": 27.94628622877484,
"kl": 0.0,
"learning_rate": 2.0866590649942987e-07,
"logps/chosen": -325.96284054487177,
"logps/rejected": -358.4112280868902,
"loss": 0.443,
"rewards/chosen": 1.2946516183706431,
"rewards/margins": 2.8088877894417656,
"rewards/rejected": -1.5142361710711223,
"step": 610
},
{
"count/fg_chosen": 34.238094329833984,
"count/fg_rejected": 6.949999809265137,
"epoch": 0.6358974358974359,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.278539657592773,
"fg_logps/policy_chosen": -6.847230434417725,
"fg_logps/policy_rejected": -9.253725051879883,
"fg_logps/reference_KL": -11.118898391723633,
"fg_logps/reference_chosen": -5.77498722076416,
"fg_logps/reference_rejected": -7.873915195465088,
"fg_loss": 0.7475059628486633,
"fg_rewards/chosen_sum": -2.7328929901123047,
"fg_rewards/rejected_sum": -1.1806801557540894,
"grad_norm": 37.21517260596127,
"kl": 0.0,
"learning_rate": 2.0296465222348917e-07,
"logps/chosen": -345.37012924382714,
"logps/rejected": -452.38132911392404,
"loss": 0.4182,
"rewards/chosen": 2.110634132667824,
"rewards/margins": 4.059031805296879,
"rewards/rejected": -1.9483976726290546,
"step": 620
},
{
"count/fg_chosen": 28.3125,
"count/fg_rejected": 7.4375,
"epoch": 0.6461538461538462,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.633122444152832,
"fg_logps/policy_chosen": -6.758295059204102,
"fg_logps/policy_rejected": -8.180941581726074,
"fg_logps/reference_KL": -12.073482513427734,
"fg_logps/reference_chosen": -5.663504600524902,
"fg_logps/reference_rejected": -6.7309250831604,
"fg_loss": 0.8814060091972351,
"fg_rewards/chosen_sum": -2.544795513153076,
"fg_rewards/rejected_sum": -1.284183144569397,
"grad_norm": 42.32817075648944,
"kl": 0.0,
"learning_rate": 1.9726339794754846e-07,
"logps/chosen": -351.775993441358,
"logps/rejected": -522.9557950949367,
"loss": 0.4343,
"rewards/chosen": 1.261369964222849,
"rewards/margins": 3.565065836083015,
"rewards/rejected": -2.303695871860166,
"step": 630
},
{
"count/fg_chosen": 33.64706039428711,
"count/fg_rejected": 8.058823585510254,
"epoch": 0.6564102564102564,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.795089721679688,
"fg_logps/policy_chosen": -6.418089389801025,
"fg_logps/policy_rejected": -8.086543083190918,
"fg_logps/reference_KL": -10.756563186645508,
"fg_logps/reference_chosen": -5.482754230499268,
"fg_logps/reference_rejected": -6.624981880187988,
"fg_loss": 0.8172480463981628,
"fg_rewards/chosen_sum": -2.2949178218841553,
"fg_rewards/rejected_sum": -1.4898707866668701,
"grad_norm": 49.06648139657291,
"kl": 0.0,
"learning_rate": 1.9156214367160776e-07,
"logps/chosen": -323.18095703125,
"logps/rejected": -457.679443359375,
"loss": 0.3975,
"rewards/chosen": 1.7368663787841796,
"rewards/margins": 4.7420207977294915,
"rewards/rejected": -3.0051544189453123,
"step": 640
},
{
"count/fg_chosen": 26.785715103149414,
"count/fg_rejected": 6.0,
"epoch": 0.6666666666666666,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.923561096191406,
"fg_logps/policy_chosen": -7.209476470947266,
"fg_logps/policy_rejected": -8.816498756408691,
"fg_logps/reference_KL": -12.868348121643066,
"fg_logps/reference_chosen": -5.93485164642334,
"fg_logps/reference_rejected": -7.260035991668701,
"fg_loss": 0.7890381217002869,
"fg_rewards/chosen_sum": -2.429896831512451,
"fg_rewards/rejected_sum": -0.9942983388900757,
"grad_norm": 42.684857833622196,
"kl": 0.0,
"learning_rate": 1.8586088939566706e-07,
"logps/chosen": -307.12525576636904,
"logps/rejected": -396.83095189144734,
"loss": 0.4325,
"rewards/chosen": 1.7140017918178014,
"rewards/margins": 3.020266568750367,
"rewards/rejected": -1.3062647769325657,
"step": 650
},
{
"count/fg_chosen": 32.578948974609375,
"count/fg_rejected": 5.5789475440979,
"epoch": 0.676923076923077,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.829089164733887,
"fg_logps/policy_chosen": -6.605353832244873,
"fg_logps/policy_rejected": -8.679577827453613,
"fg_logps/reference_KL": -11.020931243896484,
"fg_logps/reference_chosen": -5.843183517456055,
"fg_logps/reference_rejected": -7.641061305999756,
"fg_loss": 0.7101105451583862,
"fg_rewards/chosen_sum": -1.6802619695663452,
"fg_rewards/rejected_sum": -0.5640377998352051,
"grad_norm": 51.20082166149615,
"kl": 0.0,
"learning_rate": 1.8015963511972635e-07,
"logps/chosen": -326.6531840479651,
"logps/rejected": -467.38508234797297,
"loss": 0.3955,
"rewards/chosen": 1.7110205362009447,
"rewards/margins": 3.708857912151114,
"rewards/rejected": -1.9978373759501689,
"step": 660
},
{
"count/fg_chosen": 34.52941131591797,
"count/fg_rejected": 8.882352828979492,
"epoch": 0.6871794871794872,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.263018608093262,
"fg_logps/policy_chosen": -7.245337009429932,
"fg_logps/policy_rejected": -10.788996696472168,
"fg_logps/reference_KL": -11.636683464050293,
"fg_logps/reference_chosen": -6.2825164794921875,
"fg_logps/reference_rejected": -8.302498817443848,
"fg_loss": 0.9282689094543457,
"fg_rewards/chosen_sum": -2.5015087127685547,
"fg_rewards/rejected_sum": -1.4177420139312744,
"grad_norm": 43.77061774737123,
"kl": 0.16583053767681122,
"learning_rate": 1.7445838084378562e-07,
"logps/chosen": -347.34482020547944,
"logps/rejected": -440.41316451149424,
"loss": 0.4428,
"rewards/chosen": 1.4226540343402183,
"rewards/margins": 3.992055567094349,
"rewards/rejected": -2.5694015327541306,
"step": 670
},
{
"count/fg_chosen": 33.0625,
"count/fg_rejected": 6.1875,
"epoch": 0.6974358974358974,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.69076156616211,
"fg_logps/policy_chosen": -6.070934295654297,
"fg_logps/policy_rejected": -9.86292552947998,
"fg_logps/reference_KL": -10.596210479736328,
"fg_logps/reference_chosen": -5.411348342895508,
"fg_logps/reference_rejected": -8.194981575012207,
"fg_loss": 0.7803856730461121,
"fg_rewards/chosen_sum": -1.5216065645217896,
"fg_rewards/rejected_sum": -0.7260585427284241,
"grad_norm": 41.07172408412241,
"kl": 0.0,
"learning_rate": 1.6875712656784492e-07,
"logps/chosen": -321.7051943824405,
"logps/rejected": -402.75840357730266,
"loss": 0.431,
"rewards/chosen": 1.6818878537132627,
"rewards/margins": 2.9571292035860526,
"rewards/rejected": -1.27524134987279,
"step": 680
},
{
"count/fg_chosen": 33.04166793823242,
"count/fg_rejected": 7.956521511077881,
"epoch": 0.7076923076923077,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.681419372558594,
"fg_logps/policy_chosen": -6.311405181884766,
"fg_logps/policy_rejected": -8.458182334899902,
"fg_logps/reference_KL": -10.43921947479248,
"fg_logps/reference_chosen": -5.538773059844971,
"fg_logps/reference_rejected": -7.221550941467285,
"fg_loss": 0.774190366268158,
"fg_rewards/chosen_sum": -2.0744409561157227,
"fg_rewards/rejected_sum": -1.0601459741592407,
"grad_norm": 26.76398723713879,
"kl": 0.0,
"learning_rate": 1.6305587229190422e-07,
"logps/chosen": -337.47130408653845,
"logps/rejected": -426.1006573932927,
"loss": 0.4689,
"rewards/chosen": 1.7125216753054888,
"rewards/margins": 3.957819735280718,
"rewards/rejected": -2.245298059975229,
"step": 690
},
{
"count/fg_chosen": 29.933332443237305,
"count/fg_rejected": 7.5,
"epoch": 0.717948717948718,
"fg_kl": NaN,
"fg_logps/policy_KL": -18.62053871154785,
"fg_logps/policy_chosen": -6.723959445953369,
"fg_logps/policy_rejected": -8.809980392456055,
"fg_logps/reference_KL": -14.621339797973633,
"fg_logps/reference_chosen": -5.710059642791748,
"fg_logps/reference_rejected": -7.013789176940918,
"fg_loss": 0.7358676791191101,
"fg_rewards/chosen_sum": -2.391604423522949,
"fg_rewards/rejected_sum": -1.1176395416259766,
"grad_norm": 32.71207561201088,
"kl": 0.0,
"learning_rate": 1.573546180159635e-07,
"logps/chosen": -377.94707661290323,
"logps/rejected": -423.6705923507463,
"loss": 0.3983,
"rewards/chosen": 1.9157637729439685,
"rewards/margins": 3.7593996736406803,
"rewards/rejected": -1.8436359006967118,
"step": 700
},
{
"count/fg_chosen": 34.0,
"count/fg_rejected": 7.0,
"epoch": 0.7282051282051282,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.678239822387695,
"fg_logps/policy_chosen": -5.730792999267578,
"fg_logps/policy_rejected": -6.297366142272949,
"fg_logps/reference_KL": -10.122818946838379,
"fg_logps/reference_chosen": -5.305339813232422,
"fg_logps/reference_rejected": -5.903895378112793,
"fg_loss": 0.8335784673690796,
"fg_rewards/chosen_sum": -1.278421401977539,
"fg_rewards/rejected_sum": -0.4984094202518463,
"grad_norm": 30.093125040941494,
"kl": 0.0,
"learning_rate": 1.516533637400228e-07,
"logps/chosen": -311.1296672077922,
"logps/rejected": -378.9050263554217,
"loss": 0.4411,
"rewards/chosen": 1.7025673606178977,
"rewards/margins": 4.042804529998545,
"rewards/rejected": -2.3402371693806474,
"step": 710
},
{
"count/fg_chosen": 25.214284896850586,
"count/fg_rejected": 4.538461685180664,
"epoch": 0.7384615384615385,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.262101173400879,
"fg_logps/policy_chosen": -6.9626898765563965,
"fg_logps/policy_rejected": -9.714632034301758,
"fg_logps/reference_KL": -11.78027057647705,
"fg_logps/reference_chosen": -6.246325969696045,
"fg_logps/reference_rejected": -8.87566089630127,
"fg_loss": 0.7088484764099121,
"fg_rewards/chosen_sum": -1.5614358186721802,
"fg_rewards/rejected_sum": -0.42503052949905396,
"grad_norm": 41.939062686169684,
"kl": 0.0,
"learning_rate": 1.459521094640821e-07,
"logps/chosen": -357.324462890625,
"logps/rejected": -446.333984375,
"loss": 0.3344,
"rewards/chosen": 1.5038203239440917,
"rewards/margins": 4.055677318572998,
"rewards/rejected": -2.5518569946289062,
"step": 720
},
{
"count/fg_chosen": 34.75,
"count/fg_rejected": 8.470588684082031,
"epoch": 0.7487179487179487,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.44446086883545,
"fg_logps/policy_chosen": -6.577872276306152,
"fg_logps/policy_rejected": -8.81990909576416,
"fg_logps/reference_KL": -10.716168403625488,
"fg_logps/reference_chosen": -6.01681661605835,
"fg_logps/reference_rejected": -8.056208610534668,
"fg_loss": 0.7597875595092773,
"fg_rewards/chosen_sum": -1.6217119693756104,
"fg_rewards/rejected_sum": -1.0568969249725342,
"grad_norm": 29.217422708191275,
"kl": 0.0,
"learning_rate": 1.402508551881414e-07,
"logps/chosen": -325.07026041666666,
"logps/rejected": -434.04232536764704,
"loss": 0.3915,
"rewards/chosen": 2.13581298828125,
"rewards/margins": 5.36644473805147,
"rewards/rejected": -3.2306317497702204,
"step": 730
},
{
"count/fg_chosen": 32.095237731933594,
"count/fg_rejected": 8.7619047164917,
"epoch": 0.7589743589743589,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.643653869628906,
"fg_logps/policy_chosen": -7.633936405181885,
"fg_logps/policy_rejected": -8.764749526977539,
"fg_logps/reference_KL": -12.597454071044922,
"fg_logps/reference_chosen": -6.6152801513671875,
"fg_logps/reference_rejected": -7.493948936462402,
"fg_loss": 0.8724325895309448,
"fg_rewards/chosen_sum": -2.190063714981079,
"fg_rewards/rejected_sum": -1.0442728996276855,
"grad_norm": 35.508557979266605,
"kl": 0.0,
"learning_rate": 1.345496009122007e-07,
"logps/chosen": -323.4399604301948,
"logps/rejected": -447.21136106927713,
"loss": 0.4821,
"rewards/chosen": 1.2747364787312296,
"rewards/margins": 3.8379994181843546,
"rewards/rejected": -2.563262939453125,
"step": 740
},
{
"count/fg_chosen": 35.55555725097656,
"count/fg_rejected": 7.764705657958984,
"epoch": 0.7692307692307693,
"fg_kl": NaN,
"fg_logps/policy_KL": -12.14694881439209,
"fg_logps/policy_chosen": -5.938383102416992,
"fg_logps/policy_rejected": -6.999124526977539,
"fg_logps/reference_KL": -9.31165599822998,
"fg_logps/reference_chosen": -5.441190242767334,
"fg_logps/reference_rejected": -6.349566459655762,
"fg_loss": 0.7159730792045593,
"fg_rewards/chosen_sum": -1.1845917701721191,
"fg_rewards/rejected_sum": -0.6632856726646423,
"grad_norm": 29.30713270318622,
"kl": 0.0,
"learning_rate": 1.2884834663625997e-07,
"logps/chosen": -351.3742959665698,
"logps/rejected": -477.6983741554054,
"loss": 0.4247,
"rewards/chosen": 1.214830709058185,
"rewards/margins": 3.774690938550372,
"rewards/rejected": -2.5598602294921875,
"step": 750
},
{
"count/fg_chosen": 29.450000762939453,
"count/fg_rejected": 8.941176414489746,
"epoch": 0.7794871794871795,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.91418743133545,
"fg_logps/policy_chosen": -6.500802516937256,
"fg_logps/policy_rejected": -7.696014404296875,
"fg_logps/reference_KL": -10.872803688049316,
"fg_logps/reference_chosen": -6.060533046722412,
"fg_logps/reference_rejected": -6.9095940589904785,
"fg_loss": 0.8821809887886047,
"fg_rewards/chosen_sum": -1.222095012664795,
"fg_rewards/rejected_sum": -0.8427726030349731,
"grad_norm": 23.409794702847652,
"kl": 0.0,
"learning_rate": 1.2314709236031927e-07,
"logps/chosen": -350.1331449468085,
"logps/rejected": -399.28329190340907,
"loss": 0.4556,
"rewards/chosen": 1.079000432440575,
"rewards/margins": 3.400327179525684,
"rewards/rejected": -2.321326747085109,
"step": 760
},
{
"count/fg_chosen": 26.789474487304688,
"count/fg_rejected": 7.0,
"epoch": 0.7897435897435897,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.873922348022461,
"fg_logps/policy_chosen": -7.308094024658203,
"fg_logps/policy_rejected": -7.175009727478027,
"fg_logps/reference_KL": -11.33347225189209,
"fg_logps/reference_chosen": -6.172826766967773,
"fg_logps/reference_rejected": -6.013812065124512,
"fg_loss": 0.7144444584846497,
"fg_rewards/chosen_sum": -2.5322296619415283,
"fg_rewards/rejected_sum": -0.9527682065963745,
"grad_norm": 39.24437423103846,
"kl": 0.0,
"learning_rate": 1.1744583808437855e-07,
"logps/chosen": -331.80161458333333,
"logps/rejected": -438.73373161764704,
"loss": 0.4217,
"rewards/chosen": 1.7163297526041668,
"rewards/margins": 3.8777695360370714,
"rewards/rejected": -2.1614397834329044,
"step": 770
},
{
"count/fg_chosen": 32.1875,
"count/fg_rejected": 5.8125,
"epoch": 0.8,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.304938316345215,
"fg_logps/policy_chosen": -6.930516242980957,
"fg_logps/policy_rejected": -7.694925785064697,
"fg_logps/reference_KL": -10.620773315429688,
"fg_logps/reference_chosen": -6.272948265075684,
"fg_logps/reference_rejected": -6.997465133666992,
"fg_loss": 0.7136435508728027,
"fg_rewards/chosen_sum": -1.7926644086837769,
"fg_rewards/rejected_sum": -0.3232946991920471,
"grad_norm": 24.909110123805196,
"kl": 0.0,
"learning_rate": 1.1174458380843785e-07,
"logps/chosen": -303.2761627906977,
"logps/rejected": -392.0064400337838,
"loss": 0.4265,
"rewards/chosen": 1.246623904206032,
"rewards/margins": 3.1565479731724597,
"rewards/rejected": -1.9099240689664274,
"step": 780
},
{
"count/fg_chosen": 31.0,
"count/fg_rejected": 6.0,
"epoch": 0.8102564102564103,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.473762512207031,
"fg_logps/policy_chosen": -6.058125972747803,
"fg_logps/policy_rejected": -7.64716100692749,
"fg_logps/reference_KL": -10.569419860839844,
"fg_logps/reference_chosen": -5.636739730834961,
"fg_logps/reference_rejected": -6.955509662628174,
"fg_loss": 0.6918079257011414,
"fg_rewards/chosen_sum": -0.8551385998725891,
"fg_rewards/rejected_sum": -0.39899593591690063,
"grad_norm": 22.682895979877173,
"kl": 0.0,
"learning_rate": 1.0604332953249714e-07,
"logps/chosen": -329.2703077936747,
"logps/rejected": -374.5271154626623,
"loss": 0.4628,
"rewards/chosen": 1.6247121052569653,
"rewards/margins": 2.985569189456945,
"rewards/rejected": -1.3608570841999796,
"step": 790
},
{
"count/fg_chosen": 32.46666717529297,
"count/fg_rejected": 5.666666507720947,
"epoch": 0.8205128205128205,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.543877601623535,
"fg_logps/policy_chosen": -7.2420654296875,
"fg_logps/policy_rejected": -8.499801635742188,
"fg_logps/reference_KL": -11.280120849609375,
"fg_logps/reference_chosen": -6.594120502471924,
"fg_logps/reference_rejected": -7.241811275482178,
"fg_loss": 0.6977981925010681,
"fg_rewards/chosen_sum": -1.8107116222381592,
"fg_rewards/rejected_sum": -0.6922832727432251,
"grad_norm": 39.98049317260069,
"kl": 0.0,
"learning_rate": 1.0034207525655644e-07,
"logps/chosen": -359.1409722222222,
"logps/rejected": -515.9350446428572,
"loss": 0.4043,
"rewards/chosen": 1.5236521402994792,
"rewards/margins": 3.4145729428245906,
"rewards/rejected": -1.8909208025251116,
"step": 800
},
{
"epoch": 0.8205128205128205,
"eval_count/fg_chosen": 30.183246612548828,
"eval_count/fg_rejected": 6.92391300201416,
"eval_fg_kl": NaN,
"eval_fg_logps/policy_KL": -14.794645309448242,
"eval_fg_logps/policy_chosen": -6.733245849609375,
"eval_fg_logps/policy_rejected": -8.626864433288574,
"eval_fg_logps/reference_KL": -11.47359848022461,
"eval_fg_logps/reference_chosen": -6.041894912719727,
"eval_fg_logps/reference_rejected": -7.58065938949585,
"eval_fg_loss": 0.762517511844635,
"eval_fg_rewards/chosen_sum": -1.556026816368103,
"eval_fg_rewards/rejected_sum": -0.9032577276229858,
"eval_kl": 0.014131884090602398,
"eval_logps/chosen": -336.04120131729667,
"eval_logps/rejected": -406.1173232908459,
"eval_loss": 0.41103363037109375,
"eval_rewards/chosen": 1.7359535243503006,
"eval_rewards/margins": 3.998709942730949,
"eval_rewards/rejected": -2.262756418380649,
"eval_runtime": 462.7715,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.938,
"step": 800
},
{
"count/fg_chosen": 25.733333587646484,
"count/fg_rejected": 8.800000190734863,
"epoch": 0.8307692307692308,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.521686553955078,
"fg_logps/policy_chosen": -6.098317623138428,
"fg_logps/policy_rejected": -7.37031888961792,
"fg_logps/reference_KL": -11.138436317443848,
"fg_logps/reference_chosen": -5.529090881347656,
"fg_logps/reference_rejected": -6.567668437957764,
"fg_loss": 0.7037224173545837,
"fg_rewards/chosen_sum": -1.2442917823791504,
"fg_rewards/rejected_sum": -0.9032351970672607,
"grad_norm": 39.139843341578626,
"kl": 0.0,
"learning_rate": 9.464082098061574e-08,
"logps/chosen": -351.8864535108025,
"logps/rejected": -414.8218453322785,
"loss": 0.4442,
"rewards/chosen": 1.5498073248215665,
"rewards/margins": 3.332525065809996,
"rewards/rejected": -1.7827177409884296,
"step": 810
},
{
"count/fg_chosen": 29.53333282470703,
"count/fg_rejected": 8.714285850524902,
"epoch": 0.841025641025641,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.194475173950195,
"fg_logps/policy_chosen": -7.472283840179443,
"fg_logps/policy_rejected": -8.897085189819336,
"fg_logps/reference_KL": -12.283650398254395,
"fg_logps/reference_chosen": -6.056351184844971,
"fg_logps/reference_rejected": -7.35612154006958,
"fg_loss": 0.8785532712936401,
"fg_rewards/chosen_sum": -2.9767565727233887,
"fg_rewards/rejected_sum": -1.322200059890747,
"grad_norm": 18.084277972168177,
"kl": 0.11083474010229111,
"learning_rate": 8.893956670467502e-08,
"logps/chosen": -357.466950491573,
"logps/rejected": -436.3477937940141,
"loss": 0.4174,
"rewards/chosen": 1.0562572693556882,
"rewards/margins": 4.643260735464843,
"rewards/rejected": -3.587003466109155,
"step": 820
},
{
"count/fg_chosen": 38.94117736816406,
"count/fg_rejected": 9.133333206176758,
"epoch": 0.8512820512820513,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.23822021484375,
"fg_logps/policy_chosen": -6.3059844970703125,
"fg_logps/policy_rejected": -7.230159282684326,
"fg_logps/reference_KL": -12.178247451782227,
"fg_logps/reference_chosen": -5.708430290222168,
"fg_logps/reference_rejected": -5.989579200744629,
"fg_loss": 0.8925216794013977,
"fg_rewards/chosen_sum": -1.6965184211730957,
"fg_rewards/rejected_sum": -1.320400357246399,
"grad_norm": 36.47201338586735,
"kl": 0.0,
"learning_rate": 8.323831242873432e-08,
"logps/chosen": -357.6101471656977,
"logps/rejected": -458.4434121621622,
"loss": 0.4677,
"rewards/chosen": 1.8875178847202034,
"rewards/margins": 3.2487285789043776,
"rewards/rejected": -1.3612106941841744,
"step": 830
},
{
"count/fg_chosen": 32.33333206176758,
"count/fg_rejected": 6.941176414489746,
"epoch": 0.8615384615384616,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.08393669128418,
"fg_logps/policy_chosen": -6.780251502990723,
"fg_logps/policy_rejected": -7.011376857757568,
"fg_logps/reference_KL": -10.381501197814941,
"fg_logps/reference_chosen": -6.354065418243408,
"fg_logps/reference_rejected": -6.308195114135742,
"fg_loss": 0.7612662315368652,
"fg_rewards/chosen_sum": -1.2925324440002441,
"fg_rewards/rejected_sum": -0.4022027552127838,
"grad_norm": 32.781155840821306,
"kl": 0.0,
"learning_rate": 7.753705815279361e-08,
"logps/chosen": -331.03251953125,
"logps/rejected": -414.158447265625,
"loss": 0.4681,
"rewards/chosen": 1.1188287734985352,
"rewards/margins": 3.2083324432373046,
"rewards/rejected": -2.0895036697387694,
"step": 840
},
{
"count/fg_chosen": 31.047618865966797,
"count/fg_rejected": 9.380952835083008,
"epoch": 0.8717948717948718,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.173805236816406,
"fg_logps/policy_chosen": -6.497308731079102,
"fg_logps/policy_rejected": -8.013084411621094,
"fg_logps/reference_KL": -10.995800018310547,
"fg_logps/reference_chosen": -5.791131973266602,
"fg_logps/reference_rejected": -7.026320457458496,
"fg_loss": 0.8731069564819336,
"fg_rewards/chosen_sum": -1.6361898183822632,
"fg_rewards/rejected_sum": -1.0837599039077759,
"grad_norm": 29.75869838237667,
"kl": 0.0,
"learning_rate": 7.183580387685291e-08,
"logps/chosen": -385.02049512987014,
"logps/rejected": -396.0473926957831,
"loss": 0.4773,
"rewards/chosen": 1.234356471470424,
"rewards/margins": 3.495361367519464,
"rewards/rejected": -2.26100489604904,
"step": 850
},
{
"count/fg_chosen": 30.14285659790039,
"count/fg_rejected": 5.857142925262451,
"epoch": 0.882051282051282,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.854009628295898,
"fg_logps/policy_chosen": -6.418879508972168,
"fg_logps/policy_rejected": -8.779900550842285,
"fg_logps/reference_KL": -12.032855033874512,
"fg_logps/reference_chosen": -5.604477405548096,
"fg_logps/reference_rejected": -7.313387870788574,
"fg_loss": 0.8508789539337158,
"fg_rewards/chosen_sum": -1.4381144046783447,
"fg_rewards/rejected_sum": -0.7955017685890198,
"grad_norm": 25.28218871544972,
"kl": 0.0,
"learning_rate": 6.613454960091219e-08,
"logps/chosen": -300.05953414351853,
"logps/rejected": -383.09023931962025,
"loss": 0.3998,
"rewards/chosen": 1.808587156696084,
"rewards/margins": 3.9609218286823378,
"rewards/rejected": -2.152334671986254,
"step": 860
},
{
"count/fg_chosen": 28.25,
"count/fg_rejected": 5.800000190734863,
"epoch": 0.8923076923076924,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.163013458251953,
"fg_logps/policy_chosen": -6.579830646514893,
"fg_logps/policy_rejected": -7.748932361602783,
"fg_logps/reference_KL": -10.930763244628906,
"fg_logps/reference_chosen": -5.884364128112793,
"fg_logps/reference_rejected": -6.846892356872559,
"fg_loss": 0.7400026321411133,
"fg_rewards/chosen_sum": -1.4189780950546265,
"fg_rewards/rejected_sum": -0.8359920978546143,
"grad_norm": 35.432818767338894,
"kl": 0.0,
"learning_rate": 6.043329532497149e-08,
"logps/chosen": -343.70828125,
"logps/rejected": -468.51098345588235,
"loss": 0.3876,
"rewards/chosen": 1.4050553385416666,
"rewards/margins": 4.349431846469056,
"rewards/rejected": -2.9443765079273896,
"step": 870
},
{
"count/fg_chosen": 23.538461685180664,
"count/fg_rejected": 6.166666507720947,
"epoch": 0.9025641025641026,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.849803924560547,
"fg_logps/policy_chosen": -6.889503002166748,
"fg_logps/policy_rejected": -9.813651084899902,
"fg_logps/reference_KL": -11.724679946899414,
"fg_logps/reference_chosen": -6.115569114685059,
"fg_logps/reference_rejected": -8.466334342956543,
"fg_loss": 0.923600435256958,
"fg_rewards/chosen_sum": -1.7626667022705078,
"fg_rewards/rejected_sum": -0.8649892210960388,
"grad_norm": 28.479901425326158,
"kl": 0.0,
"learning_rate": 5.4732041049030787e-08,
"logps/chosen": -310.8540810032895,
"logps/rejected": -370.5546642485119,
"loss": 0.3796,
"rewards/chosen": 2.1148808127955387,
"rewards/margins": 4.169318722603016,
"rewards/rejected": -2.054437909807478,
"step": 880
},
{
"count/fg_chosen": 28.647058486938477,
"count/fg_rejected": 7.214285850524902,
"epoch": 0.9128205128205128,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.122632026672363,
"fg_logps/policy_chosen": -6.575372219085693,
"fg_logps/policy_rejected": -8.797600746154785,
"fg_logps/reference_KL": -11.014878273010254,
"fg_logps/reference_chosen": -6.147511005401611,
"fg_logps/reference_rejected": -7.686254024505615,
"fg_loss": 0.7267153263092041,
"fg_rewards/chosen_sum": -1.0292545557022095,
"fg_rewards/rejected_sum": -1.021606206893921,
"grad_norm": 14.860543570560992,
"kl": 0.0,
"learning_rate": 4.9030786773090077e-08,
"logps/chosen": -310.51392463235294,
"logps/rejected": -438.26770833333336,
"loss": 0.4036,
"rewards/chosen": 1.7528151568244486,
"rewards/margins": 4.661474010991116,
"rewards/rejected": -2.9086588541666667,
"step": 890
},
{
"count/fg_chosen": 25.649999618530273,
"count/fg_rejected": 5.099999904632568,
"epoch": 0.9230769230769231,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.193872451782227,
"fg_logps/policy_chosen": -6.921626091003418,
"fg_logps/policy_rejected": -9.522343635559082,
"fg_logps/reference_KL": -11.5450439453125,
"fg_logps/reference_chosen": -6.295389175415039,
"fg_logps/reference_rejected": -8.422341346740723,
"fg_loss": 0.8100715279579163,
"fg_rewards/chosen_sum": -1.2453607320785522,
"fg_rewards/rejected_sum": -0.6394971609115601,
"grad_norm": 26.313505637704687,
"kl": 0.0,
"learning_rate": 4.332953249714937e-08,
"logps/chosen": -379.3786095727848,
"logps/rejected": -469.6590470679012,
"loss": 0.4082,
"rewards/chosen": 1.5717159222952928,
"rewards/margins": 4.730814041206847,
"rewards/rejected": -3.1590981189115546,
"step": 900
},
{
"count/fg_chosen": 31.809524536132812,
"count/fg_rejected": 7.949999809265137,
"epoch": 0.9333333333333333,
"fg_kl": NaN,
"fg_logps/policy_KL": -13.88550090789795,
"fg_logps/policy_chosen": -6.575550556182861,
"fg_logps/policy_rejected": -8.97218132019043,
"fg_logps/reference_KL": -10.913046836853027,
"fg_logps/reference_chosen": -5.786440372467041,
"fg_logps/reference_rejected": -8.161312103271484,
"fg_loss": 0.6918947696685791,
"fg_rewards/chosen_sum": -1.8372831344604492,
"fg_rewards/rejected_sum": -0.7708438038825989,
"grad_norm": 30.704094667430244,
"kl": 0.0,
"learning_rate": 3.762827822120866e-08,
"logps/chosen": -342.4310891544118,
"logps/rejected": -465.26286458333334,
"loss": 0.394,
"rewards/chosen": 1.6974679385914522,
"rewards/margins": 5.162705975700828,
"rewards/rejected": -3.465238037109375,
"step": 910
},
{
"count/fg_chosen": 33.44444274902344,
"count/fg_rejected": 7.05555534362793,
"epoch": 0.9435897435897436,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.852989196777344,
"fg_logps/policy_chosen": -8.116382598876953,
"fg_logps/policy_rejected": -9.44908332824707,
"fg_logps/reference_KL": -12.696809768676758,
"fg_logps/reference_chosen": -6.711515426635742,
"fg_logps/reference_rejected": -8.2290678024292,
"fg_loss": 0.8823240995407104,
"fg_rewards/chosen_sum": -2.763169527053833,
"fg_rewards/rejected_sum": -0.838661789894104,
"grad_norm": 28.750070922344392,
"kl": 0.0,
"learning_rate": 3.192702394526796e-08,
"logps/chosen": -369.98503449675326,
"logps/rejected": -378.85448042168673,
"loss": 0.455,
"rewards/chosen": 1.3270432113052963,
"rewards/margins": 4.1537548429882785,
"rewards/rejected": -2.826711631682982,
"step": 920
},
{
"count/fg_chosen": 31.0,
"count/fg_rejected": 7.222222328186035,
"epoch": 0.9538461538461539,
"fg_kl": NaN,
"fg_logps/policy_KL": -16.319583892822266,
"fg_logps/policy_chosen": -7.1412739753723145,
"fg_logps/policy_rejected": -8.26430892944336,
"fg_logps/reference_KL": -12.383834838867188,
"fg_logps/reference_chosen": -6.26031494140625,
"fg_logps/reference_rejected": -7.345766544342041,
"fg_loss": 0.7640350461006165,
"fg_rewards/chosen_sum": -1.9626283645629883,
"fg_rewards/rejected_sum": -0.9573346376419067,
"grad_norm": 30.27804248529551,
"kl": 0.0,
"learning_rate": 2.6225769669327253e-08,
"logps/chosen": -328.4565281723485,
"logps/rejected": -376.20595079787233,
"loss": 0.3767,
"rewards/chosen": 2.0332070552941524,
"rewards/margins": 5.176523121305622,
"rewards/rejected": -3.1433160660114696,
"step": 930
},
{
"count/fg_chosen": 30.625,
"count/fg_rejected": 7.0,
"epoch": 0.9641025641025641,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.706071853637695,
"fg_logps/policy_chosen": -7.53517484664917,
"fg_logps/policy_rejected": -10.4346284866333,
"fg_logps/reference_KL": -11.768562316894531,
"fg_logps/reference_chosen": -6.489566326141357,
"fg_logps/reference_rejected": -9.170198440551758,
"fg_loss": 0.7636561989784241,
"fg_rewards/chosen_sum": -2.5118489265441895,
"fg_rewards/rejected_sum": -0.7650282979011536,
"grad_norm": 39.27702261397548,
"kl": 0.0,
"learning_rate": 2.0524515393386543e-08,
"logps/chosen": -346.63032670454544,
"logps/rejected": -424.40479103915663,
"loss": 0.4212,
"rewards/chosen": 1.459620389071378,
"rewards/margins": 4.2732272947213215,
"rewards/rejected": -2.8136069056499435,
"step": 940
},
{
"count/fg_chosen": 25.272727966308594,
"count/fg_rejected": 4.363636493682861,
"epoch": 0.9743589743589743,
"fg_kl": NaN,
"fg_logps/policy_KL": -14.18252182006836,
"fg_logps/policy_chosen": -6.842197895050049,
"fg_logps/policy_rejected": -7.672128677368164,
"fg_logps/reference_KL": -10.544868469238281,
"fg_logps/reference_chosen": -5.763217449188232,
"fg_logps/reference_rejected": -6.418917655944824,
"fg_loss": 0.6513127088546753,
"fg_rewards/chosen_sum": -1.7274693250656128,
"fg_rewards/rejected_sum": -0.795689582824707,
"grad_norm": 23.77562108897806,
"kl": 0.0,
"learning_rate": 1.4823261117445838e-08,
"logps/chosen": -349.6753555689103,
"logps/rejected": -403.2014100609756,
"loss": 0.3828,
"rewards/chosen": 1.3632464286608574,
"rewards/margins": 3.730917743923815,
"rewards/rejected": -2.367671315262957,
"step": 950
},
{
"count/fg_chosen": 26.649999618530273,
"count/fg_rejected": 6.5,
"epoch": 0.9846153846153847,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.6975736618042,
"fg_logps/policy_chosen": -7.5320539474487305,
"fg_logps/policy_rejected": -9.357258796691895,
"fg_logps/reference_KL": -12.039111137390137,
"fg_logps/reference_chosen": -6.5699005126953125,
"fg_logps/reference_rejected": -8.148908615112305,
"fg_loss": 0.6992577910423279,
"fg_rewards/chosen_sum": -1.6473188400268555,
"fg_rewards/rejected_sum": -0.9504286646842957,
"grad_norm": 29.616218116706296,
"kl": 0.0,
"learning_rate": 9.122006841505132e-09,
"logps/chosen": -366.2652652138158,
"logps/rejected": -412.12672061011904,
"loss": 0.4654,
"rewards/chosen": 0.8081491369950143,
"rewards/margins": 3.221180578819791,
"rewards/rejected": -2.413031441824777,
"step": 960
},
{
"count/fg_chosen": 27.294116973876953,
"count/fg_rejected": 6.294117450714111,
"epoch": 0.9948717948717949,
"fg_kl": NaN,
"fg_logps/policy_KL": -15.349953651428223,
"fg_logps/policy_chosen": -7.822242259979248,
"fg_logps/policy_rejected": -8.426324844360352,
"fg_logps/reference_KL": -11.580068588256836,
"fg_logps/reference_chosen": -6.358785629272461,
"fg_logps/reference_rejected": -7.00697135925293,
"fg_loss": 0.8312911987304688,
"fg_rewards/chosen_sum": -2.5030882358551025,
"fg_rewards/rejected_sum": -0.9714083671569824,
"grad_norm": 26.639106291790316,
"kl": 0.0,
"learning_rate": 3.420752565564424e-09,
"logps/chosen": -419.9399809966216,
"logps/rejected": -412.7228379360465,
"loss": 0.4666,
"rewards/chosen": 0.5730146459631018,
"rewards/margins": 2.736655932413865,
"rewards/rejected": -2.1636412864507633,
"step": 970
},
{
"epoch": 1.0,
"step": 975,
"total_flos": 0.0,
"train_loss": 0.45996271347388246,
"train_runtime": 8430.3956,
"train_samples_per_second": 1.85,
"train_steps_per_second": 0.116
}
],
"logging_steps": 10,
"max_steps": 975,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}