|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 400, |
|
"global_step": 975, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"count/fg_chosen": 30.85714340209961, |
|
"count/fg_rejected": 7.4285712242126465, |
|
"epoch": 0.010256410256410256, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.948674201965332, |
|
"fg_logps/policy_chosen": -6.262445449829102, |
|
"fg_logps/policy_rejected": -8.74467945098877, |
|
"fg_logps/reference_KL": -11.94157600402832, |
|
"fg_logps/reference_chosen": -6.2594828605651855, |
|
"fg_logps/reference_rejected": -8.742448806762695, |
|
"fg_loss": 0.8008173704147339, |
|
"fg_rewards/chosen_sum": -0.008917576633393764, |
|
"fg_rewards/rejected_sum": -0.0010543327080085874, |
|
"grad_norm": 70.97090228694296, |
|
"kl": 0.15787295997142792, |
|
"learning_rate": 2.5110157309792834e-07, |
|
"logps/chosen": -366.76351768092104, |
|
"logps/rejected": -369.69268508184524, |
|
"loss": 0.6347, |
|
"rewards/chosen": 0.014076207813463713, |
|
"rewards/margins": -0.000743936476552097, |
|
"rewards/rejected": 0.01482014429001581, |
|
"step": 10 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.352941513061523, |
|
"count/fg_rejected": 6.058823585510254, |
|
"epoch": 0.020512820512820513, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -10.825294494628906, |
|
"fg_logps/policy_chosen": -5.95189905166626, |
|
"fg_logps/policy_rejected": -5.48292350769043, |
|
"fg_logps/reference_KL": -10.80107307434082, |
|
"fg_logps/reference_chosen": -5.9293718338012695, |
|
"fg_logps/reference_rejected": -5.445353984832764, |
|
"fg_loss": 0.7557108402252197, |
|
"fg_rewards/chosen_sum": -0.05455803498625755, |
|
"fg_rewards/rejected_sum": -0.025491168722510338, |
|
"grad_norm": 76.27540979070403, |
|
"kl": 0.05270981788635254, |
|
"learning_rate": 3.2669067855881653e-07, |
|
"logps/chosen": -385.705078125, |
|
"logps/rejected": -347.460890436747, |
|
"loss": 0.5881, |
|
"rewards/chosen": 0.02315927016270625, |
|
"rewards/margins": 0.05135368041786262, |
|
"rewards/rejected": -0.028194410255156368, |
|
"step": 20 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.0, |
|
"count/fg_rejected": 7.176470756530762, |
|
"epoch": 0.03076923076923077, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.925114631652832, |
|
"fg_logps/policy_chosen": -6.261083602905273, |
|
"fg_logps/policy_rejected": -7.534660816192627, |
|
"fg_logps/reference_KL": -12.83480167388916, |
|
"fg_logps/reference_chosen": -6.232151985168457, |
|
"fg_logps/reference_rejected": -7.494457244873047, |
|
"fg_loss": 0.8402522206306458, |
|
"fg_rewards/chosen_sum": -0.0777626782655716, |
|
"fg_rewards/rejected_sum": -0.03122476302087307, |
|
"grad_norm": 112.80411347358924, |
|
"kl": 0.02636871300637722, |
|
"learning_rate": 3.709074707164929e-07, |
|
"logps/chosen": -380.789990234375, |
|
"logps/rejected": -344.6407958984375, |
|
"loss": 0.5949, |
|
"rewards/chosen": 0.017523756623268126, |
|
"rewards/margins": 0.0999738484621048, |
|
"rewards/rejected": -0.08245009183883667, |
|
"step": 30 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.60869598388672, |
|
"count/fg_rejected": 8.136363983154297, |
|
"epoch": 0.041025641025641026, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.220458030700684, |
|
"fg_logps/policy_chosen": -5.84191370010376, |
|
"fg_logps/policy_rejected": -6.710254192352295, |
|
"fg_logps/reference_KL": -11.054718971252441, |
|
"fg_logps/reference_chosen": -5.7772111892700195, |
|
"fg_logps/reference_rejected": -6.645753860473633, |
|
"fg_loss": 0.808053195476532, |
|
"fg_rewards/chosen_sum": -0.1892387568950653, |
|
"fg_rewards/rejected_sum": -0.05010434612631798, |
|
"grad_norm": 50.84808526748507, |
|
"kl": 0.0, |
|
"learning_rate": 4.022797840197047e-07, |
|
"logps/chosen": -383.87660435267856, |
|
"logps/rejected": -374.25223581414474, |
|
"loss": 0.6121, |
|
"rewards/chosen": 0.04867589473724365, |
|
"rewards/margins": 0.2174466848373413, |
|
"rewards/rejected": -0.16877079010009766, |
|
"step": 40 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.79999923706055, |
|
"count/fg_rejected": 9.533333778381348, |
|
"epoch": 0.05128205128205128, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -10.373883247375488, |
|
"fg_logps/policy_chosen": -5.664600849151611, |
|
"fg_logps/policy_rejected": -5.962148189544678, |
|
"fg_logps/reference_KL": -10.145182609558105, |
|
"fg_logps/reference_chosen": -5.588218688964844, |
|
"fg_logps/reference_rejected": -5.897520065307617, |
|
"fg_loss": 0.8677656054496765, |
|
"fg_rewards/chosen_sum": -0.25619086623191833, |
|
"fg_rewards/rejected_sum": -0.07288213074207306, |
|
"grad_norm": 64.57457570643511, |
|
"kl": 0.009938049130141735, |
|
"learning_rate": 4.2661404073496845e-07, |
|
"logps/chosen": -346.134577371988, |
|
"logps/rejected": -368.41335227272725, |
|
"loss": 0.5682, |
|
"rewards/chosen": 0.2387204227677311, |
|
"rewards/margins": 0.3148248284061905, |
|
"rewards/rejected": -0.07610440563845944, |
|
"step": 50 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.294116973876953, |
|
"count/fg_rejected": 5.470588207244873, |
|
"epoch": 0.06153846153846154, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.107590675354004, |
|
"fg_logps/policy_chosen": -6.114685535430908, |
|
"fg_logps/policy_rejected": -7.69861364364624, |
|
"fg_logps/reference_KL": -11.640838623046875, |
|
"fg_logps/reference_chosen": -6.00569486618042, |
|
"fg_logps/reference_rejected": -7.617056369781494, |
|
"fg_loss": 0.7082093954086304, |
|
"fg_rewards/chosen_sum": -0.27744388580322266, |
|
"fg_rewards/rejected_sum": -0.06493094563484192, |
|
"grad_norm": 49.41694776530553, |
|
"kl": 0.0, |
|
"learning_rate": 4.4649657617738114e-07, |
|
"logps/chosen": -353.69510690789474, |
|
"logps/rejected": -366.0445033482143, |
|
"loss": 0.5548, |
|
"rewards/chosen": 0.17124160967375102, |
|
"rewards/margins": 0.3800723295761529, |
|
"rewards/rejected": -0.20883071990240187, |
|
"step": 60 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.19047546386719, |
|
"count/fg_rejected": 6.238095283508301, |
|
"epoch": 0.07179487179487179, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.570515632629395, |
|
"fg_logps/policy_chosen": -6.555771827697754, |
|
"fg_logps/policy_rejected": -8.589411735534668, |
|
"fg_logps/reference_KL": -12.056818008422852, |
|
"fg_logps/reference_chosen": -6.406361103057861, |
|
"fg_logps/reference_rejected": -8.353743553161621, |
|
"fg_loss": 0.716395378112793, |
|
"fg_rewards/chosen_sum": -0.46484148502349854, |
|
"fg_rewards/rejected_sum": -0.1304880827665329, |
|
"grad_norm": 68.38293723456421, |
|
"kl": 0.0, |
|
"learning_rate": 4.633070203674842e-07, |
|
"logps/chosen": -319.37548828125, |
|
"logps/rejected": -425.4502418154762, |
|
"loss": 0.6056, |
|
"rewards/chosen": -0.016339432252080816, |
|
"rewards/margins": 0.2762198054551481, |
|
"rewards/rejected": -0.2925592377072289, |
|
"step": 70 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.77777862548828, |
|
"count/fg_rejected": 9.470588684082031, |
|
"epoch": 0.08205128205128205, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -10.578932762145996, |
|
"fg_logps/policy_chosen": -5.850771427154541, |
|
"fg_logps/policy_rejected": -7.156460762023926, |
|
"fg_logps/reference_KL": -10.092942237854004, |
|
"fg_logps/reference_chosen": -5.712654113769531, |
|
"fg_logps/reference_rejected": -6.944825172424316, |
|
"fg_loss": 0.8019319176673889, |
|
"fg_rewards/chosen_sum": -0.4671042263507843, |
|
"fg_rewards/rejected_sum": -0.17894208431243896, |
|
"grad_norm": 39.81642138358546, |
|
"kl": 0.0, |
|
"learning_rate": 4.77868889480593e-07, |
|
"logps/chosen": -317.1737351190476, |
|
"logps/rejected": -377.7265625, |
|
"loss": 0.5778, |
|
"rewards/chosen": 0.11412754512968518, |
|
"rewards/margins": 0.5135003988605394, |
|
"rewards/rejected": -0.39937285373085424, |
|
"step": 80 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.0625, |
|
"count/fg_rejected": 6.25, |
|
"epoch": 0.09230769230769231, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.664382934570312, |
|
"fg_logps/policy_chosen": -6.444965839385986, |
|
"fg_logps/policy_rejected": -8.933675765991211, |
|
"fg_logps/reference_KL": -10.985946655273438, |
|
"fg_logps/reference_chosen": -6.227105140686035, |
|
"fg_logps/reference_rejected": -8.714274406433105, |
|
"fg_loss": 0.6460863351821899, |
|
"fg_rewards/chosen_sum": -0.5869948267936707, |
|
"fg_rewards/rejected_sum": -0.14423823356628418, |
|
"grad_norm": 55.62438439854401, |
|
"kl": 0.0, |
|
"learning_rate": 4.907133683350575e-07, |
|
"logps/chosen": -404.47572544642856, |
|
"logps/rejected": -416.0439453125, |
|
"loss": 0.5292, |
|
"rewards/chosen": 0.30132850011189777, |
|
"rewards/margins": 0.755986305705288, |
|
"rewards/rejected": -0.4546578055933902, |
|
"step": 90 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.772727966308594, |
|
"count/fg_rejected": 6.333333492279053, |
|
"epoch": 0.10256410256410256, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.864214897155762, |
|
"fg_logps/policy_chosen": -6.919012546539307, |
|
"fg_logps/policy_rejected": -9.152473449707031, |
|
"fg_logps/reference_KL": -12.052042007446289, |
|
"fg_logps/reference_chosen": -6.679240703582764, |
|
"fg_logps/reference_rejected": -8.906697273254395, |
|
"fg_loss": 0.7610839605331421, |
|
"fg_rewards/chosen_sum": -0.674491822719574, |
|
"fg_rewards/rejected_sum": -0.1936338096857071, |
|
"grad_norm": 63.5056444865937, |
|
"kl": 0.0, |
|
"learning_rate": 4.994298745724059e-07, |
|
"logps/chosen": -340.2066359747024, |
|
"logps/rejected": -390.9894377055921, |
|
"loss": 0.6206, |
|
"rewards/chosen": 0.13452401615324475, |
|
"rewards/margins": 0.2098735791997503, |
|
"rewards/rejected": -0.07534956304650557, |
|
"step": 100 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.42856979370117, |
|
"count/fg_rejected": 5.736842155456543, |
|
"epoch": 0.11282051282051282, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.667691230773926, |
|
"fg_logps/policy_chosen": -6.325592041015625, |
|
"fg_logps/policy_rejected": -7.5075507164001465, |
|
"fg_logps/reference_KL": -11.706281661987305, |
|
"fg_logps/reference_chosen": -6.0957818031311035, |
|
"fg_logps/reference_rejected": -7.153979778289795, |
|
"fg_loss": 0.6360421180725098, |
|
"fg_rewards/chosen_sum": -0.7651198506355286, |
|
"fg_rewards/rejected_sum": -0.1911022961139679, |
|
"grad_norm": 45.1771736885249, |
|
"kl": 0.0, |
|
"learning_rate": 4.937286202964652e-07, |
|
"logps/chosen": -358.7214664152299, |
|
"logps/rejected": -373.4951305650685, |
|
"loss": 0.5458, |
|
"rewards/chosen": 0.35928555192618533, |
|
"rewards/margins": 0.8201791045234402, |
|
"rewards/rejected": -0.4608935525972549, |
|
"step": 110 |
|
}, |
|
{ |
|
"count/fg_chosen": 36.45000076293945, |
|
"count/fg_rejected": 10.210526466369629, |
|
"epoch": 0.12307692307692308, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.05976676940918, |
|
"fg_logps/policy_chosen": -6.244026184082031, |
|
"fg_logps/policy_rejected": -7.9034271240234375, |
|
"fg_logps/reference_KL": -11.134529113769531, |
|
"fg_logps/reference_chosen": -6.0136542320251465, |
|
"fg_logps/reference_rejected": -7.628241062164307, |
|
"fg_loss": 0.881417453289032, |
|
"fg_rewards/chosen_sum": -0.7559553384780884, |
|
"fg_rewards/rejected_sum": -0.3012525737285614, |
|
"grad_norm": 50.09822696941802, |
|
"kl": 0.015017986297607422, |
|
"learning_rate": 4.880273660205244e-07, |
|
"logps/chosen": -320.32579210069446, |
|
"logps/rejected": -356.7809392755682, |
|
"loss": 0.601, |
|
"rewards/chosen": 0.6363146040174696, |
|
"rewards/margins": 0.7909924068836252, |
|
"rewards/rejected": -0.15467780286615546, |
|
"step": 120 |
|
}, |
|
{ |
|
"count/fg_chosen": 36.400001525878906, |
|
"count/fg_rejected": 7.933333396911621, |
|
"epoch": 0.13333333333333333, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.468809127807617, |
|
"fg_logps/policy_chosen": -6.23488712310791, |
|
"fg_logps/policy_rejected": -6.54256010055542, |
|
"fg_logps/reference_KL": -10.525192260742188, |
|
"fg_logps/reference_chosen": -6.026561260223389, |
|
"fg_logps/reference_rejected": -6.267870903015137, |
|
"fg_loss": 0.8011055588722229, |
|
"fg_rewards/chosen_sum": -0.647752583026886, |
|
"fg_rewards/rejected_sum": -0.20049738883972168, |
|
"grad_norm": 50.115512333731104, |
|
"kl": 0.03811788558959961, |
|
"learning_rate": 4.823261117445838e-07, |
|
"logps/chosen": -450.6170099431818, |
|
"logps/rejected": -392.42621527777777, |
|
"loss": 0.5078, |
|
"rewards/chosen": 1.0679140090942383, |
|
"rewards/margins": 1.1960734128952026, |
|
"rewards/rejected": -0.12815940380096436, |
|
"step": 130 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.4761905670166, |
|
"count/fg_rejected": 8.699999809265137, |
|
"epoch": 0.14358974358974358, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.193472862243652, |
|
"fg_logps/policy_chosen": -6.388846397399902, |
|
"fg_logps/policy_rejected": -7.679136753082275, |
|
"fg_logps/reference_KL": -11.09471607208252, |
|
"fg_logps/reference_chosen": -6.169702529907227, |
|
"fg_logps/reference_rejected": -7.332272529602051, |
|
"fg_loss": 0.7396747469902039, |
|
"fg_rewards/chosen_sum": -0.5886417627334595, |
|
"fg_rewards/rejected_sum": -0.24698862433433533, |
|
"grad_norm": 48.94452126464587, |
|
"kl": 0.03351273387670517, |
|
"learning_rate": 4.766248574686431e-07, |
|
"logps/chosen": -334.752628279321, |
|
"logps/rejected": -386.6670292721519, |
|
"loss": 0.5475, |
|
"rewards/chosen": 0.8302505869924286, |
|
"rewards/margins": 1.2393360176688526, |
|
"rewards/rejected": -0.40908543067642406, |
|
"step": 140 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.105262756347656, |
|
"count/fg_rejected": 4.526315689086914, |
|
"epoch": 0.15384615384615385, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.575007438659668, |
|
"fg_logps/policy_chosen": -7.0048089027404785, |
|
"fg_logps/policy_rejected": -8.861261367797852, |
|
"fg_logps/reference_KL": -12.408534049987793, |
|
"fg_logps/reference_chosen": -6.816770076751709, |
|
"fg_logps/reference_rejected": -8.450145721435547, |
|
"fg_loss": 0.7104328870773315, |
|
"fg_rewards/chosen_sum": -0.4196644723415375, |
|
"fg_rewards/rejected_sum": -0.17191696166992188, |
|
"grad_norm": 49.69486730354108, |
|
"kl": 0.05726609379053116, |
|
"learning_rate": 4.7092360319270236e-07, |
|
"logps/chosen": -418.9573688271605, |
|
"logps/rejected": -390.3833069620253, |
|
"loss": 0.5064, |
|
"rewards/chosen": 0.9700225076557677, |
|
"rewards/margins": 1.2821282176491542, |
|
"rewards/rejected": -0.31210570999338655, |
|
"step": 150 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.764705657958984, |
|
"count/fg_rejected": 4.125, |
|
"epoch": 0.1641025641025641, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.1893892288208, |
|
"fg_logps/policy_chosen": -6.999147415161133, |
|
"fg_logps/policy_rejected": -7.968658447265625, |
|
"fg_logps/reference_KL": -12.764519691467285, |
|
"fg_logps/reference_chosen": -6.616185188293457, |
|
"fg_logps/reference_rejected": -7.533709526062012, |
|
"fg_loss": 0.7398098707199097, |
|
"fg_rewards/chosen_sum": -0.8501734733581543, |
|
"fg_rewards/rejected_sum": -0.15099674463272095, |
|
"grad_norm": 41.227539280479604, |
|
"kl": 0.0, |
|
"learning_rate": 4.652223489167617e-07, |
|
"logps/chosen": -377.22486787683823, |
|
"logps/rejected": -458.93584408967394, |
|
"loss": 0.4883, |
|
"rewards/chosen": 0.8492268955006319, |
|
"rewards/margins": 1.2064810067491458, |
|
"rewards/rejected": -0.3572541112485139, |
|
"step": 160 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.06666564941406, |
|
"count/fg_rejected": 7.615384578704834, |
|
"epoch": 0.17435897435897435, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.796510696411133, |
|
"fg_logps/policy_chosen": -6.570502758026123, |
|
"fg_logps/policy_rejected": -7.280531883239746, |
|
"fg_logps/reference_KL": -11.486601829528809, |
|
"fg_logps/reference_chosen": -6.288327693939209, |
|
"fg_logps/reference_rejected": -6.919613361358643, |
|
"fg_loss": 0.7529634237289429, |
|
"fg_rewards/chosen_sum": -0.7710135579109192, |
|
"fg_rewards/rejected_sum": -0.2511799931526184, |
|
"grad_norm": 50.55336505699433, |
|
"kl": 0.0, |
|
"learning_rate": 4.5952109464082095e-07, |
|
"logps/chosen": -339.1629430259146, |
|
"logps/rejected": -407.0223607772436, |
|
"loss": 0.4934, |
|
"rewards/chosen": 0.810060268495141, |
|
"rewards/margins": 1.2479861452103855, |
|
"rewards/rejected": -0.4379258767152444, |
|
"step": 170 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.875, |
|
"count/fg_rejected": 8.25, |
|
"epoch": 0.18461538461538463, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.054594993591309, |
|
"fg_logps/policy_chosen": -6.169870853424072, |
|
"fg_logps/policy_rejected": -7.339913368225098, |
|
"fg_logps/reference_KL": -10.784111022949219, |
|
"fg_logps/reference_chosen": -5.852520942687988, |
|
"fg_logps/reference_rejected": -6.8034443855285645, |
|
"fg_loss": 0.7260686755180359, |
|
"fg_rewards/chosen_sum": -0.8828132152557373, |
|
"fg_rewards/rejected_sum": -0.5087793469429016, |
|
"grad_norm": 79.2618606536439, |
|
"kl": 0.0, |
|
"learning_rate": 4.5381984036488027e-07, |
|
"logps/chosen": -298.9371427210366, |
|
"logps/rejected": -415.6966145833333, |
|
"loss": 0.5143, |
|
"rewards/chosen": 0.3949350496617759, |
|
"rewards/margins": 1.4249423386679356, |
|
"rewards/rejected": -1.0300072890061598, |
|
"step": 180 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.363636016845703, |
|
"count/fg_rejected": 6.409090995788574, |
|
"epoch": 0.19487179487179487, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.836668968200684, |
|
"fg_logps/policy_chosen": -6.473412990570068, |
|
"fg_logps/policy_rejected": -7.306280612945557, |
|
"fg_logps/reference_KL": -10.574777603149414, |
|
"fg_logps/reference_chosen": -6.141035079956055, |
|
"fg_logps/reference_rejected": -7.045315265655518, |
|
"fg_loss": 0.7152173519134521, |
|
"fg_rewards/chosen_sum": -0.9164342880249023, |
|
"fg_rewards/rejected_sum": -0.21355971693992615, |
|
"grad_norm": 62.51457716342411, |
|
"kl": 0.0, |
|
"learning_rate": 4.4811858608893954e-07, |
|
"logps/chosen": -378.7162252286585, |
|
"logps/rejected": -380.27271133814105, |
|
"loss": 0.482, |
|
"rewards/chosen": 0.9614362949278297, |
|
"rewards/margins": 1.7487119516035108, |
|
"rewards/rejected": -0.7872756566756811, |
|
"step": 190 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.789474487304688, |
|
"count/fg_rejected": 6.263157844543457, |
|
"epoch": 0.20512820512820512, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.003227233886719, |
|
"fg_logps/policy_chosen": -6.706553936004639, |
|
"fg_logps/policy_rejected": -7.728447437286377, |
|
"fg_logps/reference_KL": -12.440956115722656, |
|
"fg_logps/reference_chosen": -6.388577938079834, |
|
"fg_logps/reference_rejected": -7.369418621063232, |
|
"fg_loss": 0.7191720008850098, |
|
"fg_rewards/chosen_sum": -0.8144214749336243, |
|
"fg_rewards/rejected_sum": -0.220667764544487, |
|
"grad_norm": 53.07709071243361, |
|
"kl": 0.016681909561157227, |
|
"learning_rate": 4.4241733181299887e-07, |
|
"logps/chosen": -376.6458753360215, |
|
"logps/rejected": -420.5650652985075, |
|
"loss": 0.5059, |
|
"rewards/chosen": 0.6015197179650748, |
|
"rewards/margins": 1.3459980290767528, |
|
"rewards/rejected": -0.744478311111678, |
|
"step": 200 |
|
}, |
|
{ |
|
"count/fg_chosen": 35.38461685180664, |
|
"count/fg_rejected": 9.230769157409668, |
|
"epoch": 0.2153846153846154, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.624795913696289, |
|
"fg_logps/policy_chosen": -6.518624305725098, |
|
"fg_logps/policy_rejected": -6.696259021759033, |
|
"fg_logps/reference_KL": -12.152335166931152, |
|
"fg_logps/reference_chosen": -6.259435653686523, |
|
"fg_logps/reference_rejected": -6.192153453826904, |
|
"fg_loss": 0.7262544631958008, |
|
"fg_rewards/chosen_sum": -0.6904063820838928, |
|
"fg_rewards/rejected_sum": -0.5146002173423767, |
|
"grad_norm": 53.74268239721948, |
|
"kl": 0.0, |
|
"learning_rate": 4.3671607753705814e-07, |
|
"logps/chosen": -386.4969911317568, |
|
"logps/rejected": -467.1247274709302, |
|
"loss": 0.4207, |
|
"rewards/chosen": 1.3281025242161106, |
|
"rewards/margins": 2.8707167735120773, |
|
"rewards/rejected": -1.5426142492959665, |
|
"step": 210 |
|
}, |
|
{ |
|
"count/fg_chosen": 24.214284896850586, |
|
"count/fg_rejected": 7.5, |
|
"epoch": 0.22564102564102564, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.594059944152832, |
|
"fg_logps/policy_chosen": -6.371434688568115, |
|
"fg_logps/policy_rejected": -7.165874004364014, |
|
"fg_logps/reference_KL": -10.995756149291992, |
|
"fg_logps/reference_chosen": -6.087092399597168, |
|
"fg_logps/reference_rejected": -6.730601787567139, |
|
"fg_loss": 0.7775211334228516, |
|
"fg_rewards/chosen_sum": -0.6547192931175232, |
|
"fg_rewards/rejected_sum": -0.29393261671066284, |
|
"grad_norm": 46.801027582446935, |
|
"kl": 0.0, |
|
"learning_rate": 4.3101482326111746e-07, |
|
"logps/chosen": -344.7091128700658, |
|
"logps/rejected": -398.7763671875, |
|
"loss": 0.4334, |
|
"rewards/chosen": 0.5747735876786081, |
|
"rewards/margins": 2.151842461492782, |
|
"rewards/rejected": -1.577068873814174, |
|
"step": 220 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.272727966308594, |
|
"count/fg_rejected": 6.55555534362793, |
|
"epoch": 0.2358974358974359, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -11.965290069580078, |
|
"fg_logps/policy_chosen": -6.299896717071533, |
|
"fg_logps/policy_rejected": -7.80208158493042, |
|
"fg_logps/reference_KL": -10.39778995513916, |
|
"fg_logps/reference_chosen": -5.838742256164551, |
|
"fg_logps/reference_rejected": -7.279135704040527, |
|
"fg_loss": 0.6748415231704712, |
|
"fg_rewards/chosen_sum": -1.2193199396133423, |
|
"fg_rewards/rejected_sum": -0.36254453659057617, |
|
"grad_norm": 45.120354528449354, |
|
"kl": 0.0, |
|
"learning_rate": 4.2531356898517673e-07, |
|
"logps/chosen": -358.53559470663265, |
|
"logps/rejected": -378.2181829637097, |
|
"loss": 0.4775, |
|
"rewards/chosen": 0.5253227389588648, |
|
"rewards/margins": 2.214199475507504, |
|
"rewards/rejected": -1.688876736548639, |
|
"step": 230 |
|
}, |
|
{ |
|
"count/fg_chosen": 36.1875, |
|
"count/fg_rejected": 7.199999809265137, |
|
"epoch": 0.24615384615384617, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.929790496826172, |
|
"fg_logps/policy_chosen": -6.825524806976318, |
|
"fg_logps/policy_rejected": -7.702876091003418, |
|
"fg_logps/reference_KL": -11.064438819885254, |
|
"fg_logps/reference_chosen": -6.324741363525391, |
|
"fg_logps/reference_rejected": -7.179370403289795, |
|
"fg_loss": 0.7855690717697144, |
|
"fg_rewards/chosen_sum": -1.7501062154769897, |
|
"fg_rewards/rejected_sum": -0.501429557800293, |
|
"grad_norm": 41.89950021868993, |
|
"kl": 0.26182326674461365, |
|
"learning_rate": 4.1961231470923605e-07, |
|
"logps/chosen": -347.2475725446429, |
|
"logps/rejected": -382.114453125, |
|
"loss": 0.4878, |
|
"rewards/chosen": 1.19210935320173, |
|
"rewards/margins": 1.8935284205845424, |
|
"rewards/rejected": -0.7014190673828125, |
|
"step": 240 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.399999618530273, |
|
"count/fg_rejected": 7.266666889190674, |
|
"epoch": 0.2564102564102564, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.258040428161621, |
|
"fg_logps/policy_chosen": -6.943975925445557, |
|
"fg_logps/policy_rejected": -7.556437969207764, |
|
"fg_logps/reference_KL": -11.820046424865723, |
|
"fg_logps/reference_chosen": -6.191815376281738, |
|
"fg_logps/reference_rejected": -6.7565178871154785, |
|
"fg_loss": 0.8718132972717285, |
|
"fg_rewards/chosen_sum": -1.6466922760009766, |
|
"fg_rewards/rejected_sum": -0.6005190014839172, |
|
"grad_norm": 41.93986307357718, |
|
"kl": 0.0, |
|
"learning_rate": 4.139110604332953e-07, |
|
"logps/chosen": -327.5843017578125, |
|
"logps/rejected": -385.35126953125, |
|
"loss": 0.4012, |
|
"rewards/chosen": 1.1960113525390625, |
|
"rewards/margins": 2.935526466369629, |
|
"rewards/rejected": -1.7395151138305665, |
|
"step": 250 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.38461685180664, |
|
"count/fg_rejected": 7.692307472229004, |
|
"epoch": 0.26666666666666666, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.9034423828125, |
|
"fg_logps/policy_chosen": -6.196410655975342, |
|
"fg_logps/policy_rejected": -7.9534759521484375, |
|
"fg_logps/reference_KL": -10.658266067504883, |
|
"fg_logps/reference_chosen": -5.453469276428223, |
|
"fg_logps/reference_rejected": -7.014803409576416, |
|
"fg_loss": 0.9994122982025146, |
|
"fg_rewards/chosen_sum": -1.9320785999298096, |
|
"fg_rewards/rejected_sum": -0.8277677893638611, |
|
"grad_norm": 32.044513351466335, |
|
"kl": 0.0, |
|
"learning_rate": 4.0820980615735465e-07, |
|
"logps/chosen": -334.1531723484849, |
|
"logps/rejected": -391.7591838430851, |
|
"loss": 0.4248, |
|
"rewards/chosen": 0.9683192859996449, |
|
"rewards/margins": 2.343336492719226, |
|
"rewards/rejected": -1.3750172067195812, |
|
"step": 260 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.071428298950195, |
|
"count/fg_rejected": 5.142857074737549, |
|
"epoch": 0.27692307692307694, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.50527286529541, |
|
"fg_logps/policy_chosen": -7.0650315284729, |
|
"fg_logps/policy_rejected": -8.667463302612305, |
|
"fg_logps/reference_KL": -12.12157154083252, |
|
"fg_logps/reference_chosen": -6.389377593994141, |
|
"fg_logps/reference_rejected": -7.870638370513916, |
|
"fg_loss": 0.7973106503486633, |
|
"fg_rewards/chosen_sum": -1.2882376909255981, |
|
"fg_rewards/rejected_sum": -0.4027543365955353, |
|
"grad_norm": 68.19722281582398, |
|
"kl": 0.020351696759462357, |
|
"learning_rate": 4.025085518814139e-07, |
|
"logps/chosen": -405.6799411525974, |
|
"logps/rejected": -402.68011106927713, |
|
"loss": 0.4322, |
|
"rewards/chosen": 0.5332601472928926, |
|
"rewards/margins": 2.588090307778585, |
|
"rewards/rejected": -2.0548301604856927, |
|
"step": 270 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.959999084472656, |
|
"count/fg_rejected": 9.0, |
|
"epoch": 0.28717948717948716, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.018957138061523, |
|
"fg_logps/policy_chosen": -6.5620503425598145, |
|
"fg_logps/policy_rejected": -8.468265533447266, |
|
"fg_logps/reference_KL": -9.829309463500977, |
|
"fg_logps/reference_chosen": -5.755721569061279, |
|
"fg_logps/reference_rejected": -7.4278693199157715, |
|
"fg_loss": 0.8410596251487732, |
|
"fg_rewards/chosen_sum": -2.1979994773864746, |
|
"fg_rewards/rejected_sum": -1.1406161785125732, |
|
"grad_norm": 42.49380746961441, |
|
"kl": 0.0, |
|
"learning_rate": 3.9680729760547324e-07, |
|
"logps/chosen": -333.30290316358025, |
|
"logps/rejected": -438.11288568037975, |
|
"loss": 0.5378, |
|
"rewards/chosen": 0.4965087513864776, |
|
"rewards/margins": 2.022118021313297, |
|
"rewards/rejected": -1.5256092699268196, |
|
"step": 280 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.38888931274414, |
|
"count/fg_rejected": 9.166666984558105, |
|
"epoch": 0.29743589743589743, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.498015403747559, |
|
"fg_logps/policy_chosen": -6.605287551879883, |
|
"fg_logps/policy_rejected": -7.190271377563477, |
|
"fg_logps/reference_KL": -11.353104591369629, |
|
"fg_logps/reference_chosen": -6.0449957847595215, |
|
"fg_logps/reference_rejected": -6.696670055389404, |
|
"fg_loss": 0.9254876971244812, |
|
"fg_rewards/chosen_sum": -1.749000072479248, |
|
"fg_rewards/rejected_sum": -0.5272374153137207, |
|
"grad_norm": 34.10871711923179, |
|
"kl": 0.0, |
|
"learning_rate": 3.9110604332953246e-07, |
|
"logps/chosen": -374.3355087652439, |
|
"logps/rejected": -443.0232371794872, |
|
"loss": 0.4904, |
|
"rewards/chosen": 0.7626230658554449, |
|
"rewards/margins": 2.348516941368766, |
|
"rewards/rejected": -1.5858938755133214, |
|
"step": 290 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.75, |
|
"count/fg_rejected": 7.6315789222717285, |
|
"epoch": 0.3076923076923077, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.35108757019043, |
|
"fg_logps/policy_chosen": -6.358473300933838, |
|
"fg_logps/policy_rejected": -7.603394508361816, |
|
"fg_logps/reference_KL": -10.117968559265137, |
|
"fg_logps/reference_chosen": -5.656960964202881, |
|
"fg_logps/reference_rejected": -6.988058090209961, |
|
"fg_loss": 0.687099277973175, |
|
"fg_rewards/chosen_sum": -1.7474342584609985, |
|
"fg_rewards/rejected_sum": -0.6550286412239075, |
|
"grad_norm": 45.87667859445539, |
|
"kl": 0.0, |
|
"learning_rate": 3.854047890535917e-07, |
|
"logps/chosen": -291.94080528846155, |
|
"logps/rejected": -454.1446265243902, |
|
"loss": 0.5046, |
|
"rewards/chosen": -0.05552493608914889, |
|
"rewards/margins": 2.302112236702867, |
|
"rewards/rejected": -2.357637172792016, |
|
"step": 300 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.052631378173828, |
|
"count/fg_rejected": 6.157894611358643, |
|
"epoch": 0.31794871794871793, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.878127098083496, |
|
"fg_logps/policy_chosen": -6.794922828674316, |
|
"fg_logps/policy_rejected": -8.655734062194824, |
|
"fg_logps/reference_KL": -11.557811737060547, |
|
"fg_logps/reference_chosen": -6.259448528289795, |
|
"fg_logps/reference_rejected": -7.914809226989746, |
|
"fg_loss": 0.7258095145225525, |
|
"fg_rewards/chosen_sum": -1.6086143255233765, |
|
"fg_rewards/rejected_sum": -0.5489023327827454, |
|
"grad_norm": 43.21903612458155, |
|
"kl": 0.0, |
|
"learning_rate": 3.7970353477765105e-07, |
|
"logps/chosen": -361.2526117369186, |
|
"logps/rejected": -382.5555320945946, |
|
"loss": 0.4464, |
|
"rewards/chosen": 0.8190518756245457, |
|
"rewards/margins": 3.1284712498327383, |
|
"rewards/rejected": -2.3094193742081925, |
|
"step": 310 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.66666603088379, |
|
"count/fg_rejected": 7.05555534362793, |
|
"epoch": 0.3282051282051282, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.007112503051758, |
|
"fg_logps/policy_chosen": -7.029316425323486, |
|
"fg_logps/policy_rejected": -8.816922187805176, |
|
"fg_logps/reference_KL": -11.414546012878418, |
|
"fg_logps/reference_chosen": -6.1291069984436035, |
|
"fg_logps/reference_rejected": -8.072396278381348, |
|
"fg_loss": 0.6982179880142212, |
|
"fg_rewards/chosen_sum": -1.864067554473877, |
|
"fg_rewards/rejected_sum": -0.625391960144043, |
|
"grad_norm": 49.56835553157457, |
|
"kl": 0.0, |
|
"learning_rate": 3.740022805017103e-07, |
|
"logps/chosen": -308.43726245777026, |
|
"logps/rejected": -462.0808502906977, |
|
"loss": 0.4611, |
|
"rewards/chosen": 0.7936567358068518, |
|
"rewards/margins": 2.709756816430035, |
|
"rewards/rejected": -1.9161000806231832, |
|
"step": 320 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.941177368164062, |
|
"count/fg_rejected": 6.125, |
|
"epoch": 0.3384615384615385, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.976774215698242, |
|
"fg_logps/policy_chosen": -6.938152313232422, |
|
"fg_logps/policy_rejected": -7.871105670928955, |
|
"fg_logps/reference_KL": -11.392000198364258, |
|
"fg_logps/reference_chosen": -6.208968162536621, |
|
"fg_logps/reference_rejected": -6.786693096160889, |
|
"fg_loss": 0.8347401022911072, |
|
"fg_rewards/chosen_sum": -2.1311469078063965, |
|
"fg_rewards/rejected_sum": -0.6044603586196899, |
|
"grad_norm": 23.82687521831931, |
|
"kl": 0.0, |
|
"learning_rate": 3.6830102622576964e-07, |
|
"logps/chosen": -316.02463269589555, |
|
"logps/rejected": -366.19430443548384, |
|
"loss": 0.4596, |
|
"rewards/chosen": 0.36891575713655844, |
|
"rewards/margins": 2.952398068754502, |
|
"rewards/rejected": -2.5834823116179435, |
|
"step": 330 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.647058486938477, |
|
"count/fg_rejected": 5.0, |
|
"epoch": 0.3487179487179487, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.484383583068848, |
|
"fg_logps/policy_chosen": -6.541139602661133, |
|
"fg_logps/policy_rejected": -8.020905494689941, |
|
"fg_logps/reference_KL": -11.032403945922852, |
|
"fg_logps/reference_chosen": -5.8485236167907715, |
|
"fg_logps/reference_rejected": -7.10052490234375, |
|
"fg_loss": 0.7300294041633606, |
|
"fg_rewards/chosen_sum": -1.9582923650741577, |
|
"fg_rewards/rejected_sum": -0.4550693929195404, |
|
"grad_norm": 44.41360203093714, |
|
"kl": 0.0, |
|
"learning_rate": 3.625997719498289e-07, |
|
"logps/chosen": -337.7124953497024, |
|
"logps/rejected": -364.32930715460526, |
|
"loss": 0.4522, |
|
"rewards/chosen": 0.633344604855492, |
|
"rewards/margins": 2.6144102545907923, |
|
"rewards/rejected": -1.9810656497353, |
|
"step": 340 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.35293960571289, |
|
"count/fg_rejected": 8.764705657958984, |
|
"epoch": 0.358974358974359, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.230541229248047, |
|
"fg_logps/policy_chosen": -7.154247760772705, |
|
"fg_logps/policy_rejected": -7.5230560302734375, |
|
"fg_logps/reference_KL": -11.367884635925293, |
|
"fg_logps/reference_chosen": -6.23461389541626, |
|
"fg_logps/reference_rejected": -6.495339870452881, |
|
"fg_loss": 0.853543221950531, |
|
"fg_rewards/chosen_sum": -2.904083490371704, |
|
"fg_rewards/rejected_sum": -0.9132155179977417, |
|
"grad_norm": 30.3057952063642, |
|
"kl": 0.0, |
|
"learning_rate": 3.5689851767388824e-07, |
|
"logps/chosen": -394.6584884129214, |
|
"logps/rejected": -393.08568992077466, |
|
"loss": 0.4936, |
|
"rewards/chosen": 0.6523181615250834, |
|
"rewards/margins": 2.7101455334716062, |
|
"rewards/rejected": -2.057827371946523, |
|
"step": 350 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.30769157409668, |
|
"count/fg_rejected": 5.0, |
|
"epoch": 0.36923076923076925, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.898691177368164, |
|
"fg_logps/policy_chosen": -7.163309574127197, |
|
"fg_logps/policy_rejected": -6.87624979019165, |
|
"fg_logps/reference_KL": -10.903498649597168, |
|
"fg_logps/reference_chosen": -6.516329288482666, |
|
"fg_logps/reference_rejected": -6.057809829711914, |
|
"fg_loss": 0.7118747234344482, |
|
"fg_rewards/chosen_sum": -1.5648789405822754, |
|
"fg_rewards/rejected_sum": -0.6483681201934814, |
|
"grad_norm": 37.36414008433845, |
|
"kl": 0.0, |
|
"learning_rate": 3.511972633979475e-07, |
|
"logps/chosen": -316.7720240542763, |
|
"logps/rejected": -391.03125, |
|
"loss": 0.4128, |
|
"rewards/chosen": 1.343739258615594, |
|
"rewards/margins": 2.428480033587692, |
|
"rewards/rejected": -1.0847407749720983, |
|
"step": 360 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.190475463867188, |
|
"count/fg_rejected": 7.050000190734863, |
|
"epoch": 0.37948717948717947, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.466145515441895, |
|
"fg_logps/policy_chosen": -6.301754951477051, |
|
"fg_logps/policy_rejected": -8.657445907592773, |
|
"fg_logps/reference_KL": -10.610649108886719, |
|
"fg_logps/reference_chosen": -5.920670509338379, |
|
"fg_logps/reference_rejected": -8.133522987365723, |
|
"fg_loss": 0.7434370517730713, |
|
"fg_rewards/chosen_sum": -0.8639131188392639, |
|
"fg_rewards/rejected_sum": -0.45794281363487244, |
|
"grad_norm": 28.069125042652725, |
|
"kl": 0.0, |
|
"learning_rate": 3.4549600912200683e-07, |
|
"logps/chosen": -327.7210542485955, |
|
"logps/rejected": -410.5584286971831, |
|
"loss": 0.4659, |
|
"rewards/chosen": 1.6249963996115695, |
|
"rewards/margins": 2.5865509134622515, |
|
"rewards/rejected": -0.9615545138506822, |
|
"step": 370 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.900001525878906, |
|
"count/fg_rejected": 8.899999618530273, |
|
"epoch": 0.38974358974358975, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.721631050109863, |
|
"fg_logps/policy_chosen": -6.37019681930542, |
|
"fg_logps/policy_rejected": -6.883843898773193, |
|
"fg_logps/reference_KL": -11.473298072814941, |
|
"fg_logps/reference_chosen": -6.092167854309082, |
|
"fg_logps/reference_rejected": -6.5457611083984375, |
|
"fg_loss": 0.8677409887313843, |
|
"fg_rewards/chosen_sum": -0.9150064587593079, |
|
"fg_rewards/rejected_sum": -0.41792982816696167, |
|
"grad_norm": 42.9333549511222, |
|
"kl": 0.0, |
|
"learning_rate": 3.397947548460661e-07, |
|
"logps/chosen": -340.5213176448171, |
|
"logps/rejected": -451.7316706730769, |
|
"loss": 0.4514, |
|
"rewards/chosen": 1.505601836413872, |
|
"rewards/margins": 2.4657741472674877, |
|
"rewards/rejected": -0.9601723108536158, |
|
"step": 380 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.55555534362793, |
|
"count/fg_rejected": 7.0, |
|
"epoch": 0.4, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.328727722167969, |
|
"fg_logps/policy_chosen": -6.023360729217529, |
|
"fg_logps/policy_rejected": -7.300014972686768, |
|
"fg_logps/reference_KL": -10.421500205993652, |
|
"fg_logps/reference_chosen": -5.618011474609375, |
|
"fg_logps/reference_rejected": -6.620323181152344, |
|
"fg_loss": 0.7781895399093628, |
|
"fg_rewards/chosen_sum": -0.9389697313308716, |
|
"fg_rewards/rejected_sum": -0.5253291130065918, |
|
"grad_norm": 42.84097066535792, |
|
"kl": 0.0, |
|
"learning_rate": 3.340935005701254e-07, |
|
"logps/chosen": -364.2041149400685, |
|
"logps/rejected": -441.3032956178161, |
|
"loss": 0.4992, |
|
"rewards/chosen": 1.0798116187526756, |
|
"rewards/margins": 2.0560204480843476, |
|
"rewards/rejected": -0.9762088293316721, |
|
"step": 390 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.72222137451172, |
|
"count/fg_rejected": 6.647058963775635, |
|
"epoch": 0.41025641025641024, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.881956100463867, |
|
"fg_logps/policy_chosen": -6.5600266456604, |
|
"fg_logps/policy_rejected": -9.088929176330566, |
|
"fg_logps/reference_KL": -11.40665054321289, |
|
"fg_logps/reference_chosen": -5.7827043533325195, |
|
"fg_logps/reference_rejected": -8.174890518188477, |
|
"fg_loss": 0.8777969479560852, |
|
"fg_rewards/chosen_sum": -1.556259036064148, |
|
"fg_rewards/rejected_sum": -0.7908374667167664, |
|
"grad_norm": 37.72027450146743, |
|
"kl": 0.0, |
|
"learning_rate": 3.283922462941847e-07, |
|
"logps/chosen": -365.9978794642857, |
|
"logps/rejected": -436.88159722222224, |
|
"loss": 0.4478, |
|
"rewards/chosen": 1.2335292271205358, |
|
"rewards/margins": 2.4691440885029143, |
|
"rewards/rejected": -1.2356148613823785, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"eval_count/fg_chosen": 30.183246612548828, |
|
"eval_count/fg_rejected": 6.92391300201416, |
|
"eval_fg_kl": NaN, |
|
"eval_fg_logps/policy_KL": -13.678318977355957, |
|
"eval_fg_logps/policy_chosen": -6.628693580627441, |
|
"eval_fg_logps/policy_rejected": -8.363188743591309, |
|
"eval_fg_logps/reference_KL": -11.47359848022461, |
|
"eval_fg_logps/reference_chosen": -6.041894912719727, |
|
"eval_fg_logps/reference_rejected": -7.58065938949585, |
|
"eval_fg_loss": 0.7654322385787964, |
|
"eval_fg_rewards/chosen_sum": -1.3938791751861572, |
|
"eval_fg_rewards/rejected_sum": -0.6767725944519043, |
|
"eval_kl": 0.02797871269285679, |
|
"eval_logps/chosen": -340.2313144329897, |
|
"eval_logps/rejected": -400.85385283893396, |
|
"eval_loss": 0.4325231909751892, |
|
"eval_rewards/chosen": 1.316945568665879, |
|
"eval_rewards/margins": 3.0533541780318263, |
|
"eval_rewards/rejected": -1.7364086093659472, |
|
"eval_runtime": 492.9712, |
|
"eval_samples_per_second": 3.515, |
|
"eval_steps_per_second": 0.88, |
|
"step": 400 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.549999237060547, |
|
"count/fg_rejected": 6.25, |
|
"epoch": 0.4205128205128205, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.237287521362305, |
|
"fg_logps/policy_chosen": -6.820374488830566, |
|
"fg_logps/policy_rejected": -8.927366256713867, |
|
"fg_logps/reference_KL": -12.311280250549316, |
|
"fg_logps/reference_chosen": -5.970030784606934, |
|
"fg_logps/reference_rejected": -7.938845634460449, |
|
"fg_loss": 0.8091492056846619, |
|
"fg_rewards/chosen_sum": -1.5172061920166016, |
|
"fg_rewards/rejected_sum": -0.7380185723304749, |
|
"grad_norm": 53.93549024472509, |
|
"kl": 0.0, |
|
"learning_rate": 3.22690992018244e-07, |
|
"logps/chosen": -324.6869419642857, |
|
"logps/rejected": -405.54951054216866, |
|
"loss": 0.4023, |
|
"rewards/chosen": 1.265897478376116, |
|
"rewards/margins": 3.5306600810328366, |
|
"rewards/rejected": -2.2647626026567207, |
|
"step": 410 |
|
}, |
|
{ |
|
"count/fg_chosen": 23.399999618530273, |
|
"count/fg_rejected": 6.133333206176758, |
|
"epoch": 0.4307692307692308, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.516840934753418, |
|
"fg_logps/policy_chosen": -7.477798938751221, |
|
"fg_logps/policy_rejected": -8.535691261291504, |
|
"fg_logps/reference_KL": -11.834728240966797, |
|
"fg_logps/reference_chosen": -6.342043876647949, |
|
"fg_logps/reference_rejected": -7.10928201675415, |
|
"fg_loss": 0.8881044387817383, |
|
"fg_rewards/chosen_sum": -2.0730390548706055, |
|
"fg_rewards/rejected_sum": -0.9379479289054871, |
|
"grad_norm": 37.934930263204464, |
|
"kl": 0.04062976688146591, |
|
"learning_rate": 3.169897377423033e-07, |
|
"logps/chosen": -352.2984280873494, |
|
"logps/rejected": -437.3393871753247, |
|
"loss": 0.4353, |
|
"rewards/chosen": 0.722329794642437, |
|
"rewards/margins": 3.3173880871799777, |
|
"rewards/rejected": -2.5950582925375407, |
|
"step": 420 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.41176414489746, |
|
"count/fg_rejected": 5.882352828979492, |
|
"epoch": 0.441025641025641, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.61406421661377, |
|
"fg_logps/policy_chosen": -6.908777713775635, |
|
"fg_logps/policy_rejected": -9.259625434875488, |
|
"fg_logps/reference_KL": -10.859848976135254, |
|
"fg_logps/reference_chosen": -5.828268527984619, |
|
"fg_logps/reference_rejected": -7.893514156341553, |
|
"fg_loss": 0.7920488119125366, |
|
"fg_rewards/chosen_sum": -2.7851388454437256, |
|
"fg_rewards/rejected_sum": -0.8430763483047485, |
|
"grad_norm": 31.263236198590103, |
|
"kl": 0.20134501159191132, |
|
"learning_rate": 3.112884834663626e-07, |
|
"logps/chosen": -338.0028831845238, |
|
"logps/rejected": -437.03207236842104, |
|
"loss": 0.4237, |
|
"rewards/chosen": 1.1830097380138578, |
|
"rewards/margins": 3.346872267567723, |
|
"rewards/rejected": -2.163862529553865, |
|
"step": 430 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.16666603088379, |
|
"count/fg_rejected": 5.583333492279053, |
|
"epoch": 0.4512820512820513, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.71406078338623, |
|
"fg_logps/policy_chosen": -6.013169765472412, |
|
"fg_logps/policy_rejected": -7.012132167816162, |
|
"fg_logps/reference_KL": -10.454259872436523, |
|
"fg_logps/reference_chosen": -5.288631439208984, |
|
"fg_logps/reference_rejected": -6.411142826080322, |
|
"fg_loss": 0.8265379071235657, |
|
"fg_rewards/chosen_sum": -1.9277740716934204, |
|
"fg_rewards/rejected_sum": -0.4081937372684479, |
|
"grad_norm": 33.498855345311206, |
|
"kl": 0.0, |
|
"learning_rate": 3.055872291904219e-07, |
|
"logps/chosen": -433.9810126582278, |
|
"logps/rejected": -409.45997299382714, |
|
"loss": 0.4124, |
|
"rewards/chosen": 0.23994885215276404, |
|
"rewards/margins": 2.5844254342442956, |
|
"rewards/rejected": -2.3444765820915316, |
|
"step": 440 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.214284896850586, |
|
"count/fg_rejected": 7.0714287757873535, |
|
"epoch": 0.46153846153846156, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.437283515930176, |
|
"fg_logps/policy_chosen": -7.329289436340332, |
|
"fg_logps/policy_rejected": -7.971861839294434, |
|
"fg_logps/reference_KL": -11.504508018493652, |
|
"fg_logps/reference_chosen": -6.307824611663818, |
|
"fg_logps/reference_rejected": -7.138981342315674, |
|
"fg_loss": 0.8858200907707214, |
|
"fg_rewards/chosen_sum": -2.4875144958496094, |
|
"fg_rewards/rejected_sum": -0.8280299305915833, |
|
"grad_norm": 31.189671182424355, |
|
"kl": 0.0, |
|
"learning_rate": 2.998859749144812e-07, |
|
"logps/chosen": -298.6210195806962, |
|
"logps/rejected": -419.6058545524691, |
|
"loss": 0.4201, |
|
"rewards/chosen": 0.8161652963372725, |
|
"rewards/margins": 3.091305375788468, |
|
"rewards/rejected": -2.275140079451196, |
|
"step": 450 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.3636360168457, |
|
"count/fg_rejected": 4.7272725105285645, |
|
"epoch": 0.4717948717948718, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.52069091796875, |
|
"fg_logps/policy_chosen": -6.610226154327393, |
|
"fg_logps/policy_rejected": -10.518632888793945, |
|
"fg_logps/reference_KL": -10.879704475402832, |
|
"fg_logps/reference_chosen": -5.88496732711792, |
|
"fg_logps/reference_rejected": -9.407367706298828, |
|
"fg_loss": 0.6551663279533386, |
|
"fg_rewards/chosen_sum": -2.258774757385254, |
|
"fg_rewards/rejected_sum": -0.5948446989059448, |
|
"grad_norm": 33.51878345246348, |
|
"kl": 0.029797697439789772, |
|
"learning_rate": 2.941847206385404e-07, |
|
"logps/chosen": -332.60402610085225, |
|
"logps/rejected": -395.40771484375, |
|
"loss": 0.4001, |
|
"rewards/chosen": 1.4196222478693181, |
|
"rewards/margins": 3.0095812864977907, |
|
"rewards/rejected": -1.5899590386284723, |
|
"step": 460 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.5, |
|
"count/fg_rejected": 5.800000190734863, |
|
"epoch": 0.48205128205128206, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.285706520080566, |
|
"fg_logps/policy_chosen": -6.048055648803711, |
|
"fg_logps/policy_rejected": -8.306843757629395, |
|
"fg_logps/reference_KL": -10.14644718170166, |
|
"fg_logps/reference_chosen": -5.535238742828369, |
|
"fg_logps/reference_rejected": -7.302800178527832, |
|
"fg_loss": 0.6428090333938599, |
|
"fg_rewards/chosen_sum": -1.36484956741333, |
|
"fg_rewards/rejected_sum": -0.6093672513961792, |
|
"grad_norm": 26.805133801472735, |
|
"kl": 0.17673882842063904, |
|
"learning_rate": 2.8848346636259974e-07, |
|
"logps/chosen": -317.91790291432585, |
|
"logps/rejected": -368.47114326584506, |
|
"loss": 0.4744, |
|
"rewards/chosen": 1.5331906093640273, |
|
"rewards/margins": 2.4303819928332997, |
|
"rewards/rejected": -0.8971913834692726, |
|
"step": 470 |
|
}, |
|
{ |
|
"count/fg_chosen": 39.266666412353516, |
|
"count/fg_rejected": 7.4666666984558105, |
|
"epoch": 0.49230769230769234, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.738459587097168, |
|
"fg_logps/policy_chosen": -6.287077903747559, |
|
"fg_logps/policy_rejected": -6.787537097930908, |
|
"fg_logps/reference_KL": -11.216691017150879, |
|
"fg_logps/reference_chosen": -5.826966762542725, |
|
"fg_logps/reference_rejected": -6.379599571228027, |
|
"fg_loss": 0.7447641491889954, |
|
"fg_rewards/chosen_sum": -0.980557382106781, |
|
"fg_rewards/rejected_sum": -0.506367564201355, |
|
"grad_norm": 39.97199247238054, |
|
"kl": 0.0, |
|
"learning_rate": 2.82782212086659e-07, |
|
"logps/chosen": -401.7761665239726, |
|
"logps/rejected": -363.2634698275862, |
|
"loss": 0.5103, |
|
"rewards/chosen": 1.34750000418049, |
|
"rewards/margins": 1.8919780588848274, |
|
"rewards/rejected": -0.5444780547043373, |
|
"step": 480 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.875, |
|
"count/fg_rejected": 5.400000095367432, |
|
"epoch": 0.5025641025641026, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.641780853271484, |
|
"fg_logps/policy_chosen": -8.056283950805664, |
|
"fg_logps/policy_rejected": -8.251357078552246, |
|
"fg_logps/reference_KL": -13.59717845916748, |
|
"fg_logps/reference_chosen": -7.162622928619385, |
|
"fg_logps/reference_rejected": -7.323818683624268, |
|
"fg_loss": 0.8411279916763306, |
|
"fg_rewards/chosen_sum": -1.9043647050857544, |
|
"fg_rewards/rejected_sum": -0.7839928269386292, |
|
"grad_norm": 29.60091559742249, |
|
"kl": 0.22372007369995117, |
|
"learning_rate": 2.7708095781071834e-07, |
|
"logps/chosen": -324.56468441611844, |
|
"logps/rejected": -461.18638392857144, |
|
"loss": 0.4437, |
|
"rewards/chosen": 1.3998164126747532, |
|
"rewards/margins": 2.890947968141178, |
|
"rewards/rejected": -1.4911315554664248, |
|
"step": 490 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.3157901763916, |
|
"count/fg_rejected": 5.294117450714111, |
|
"epoch": 0.5128205128205128, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.019055366516113, |
|
"fg_logps/policy_chosen": -6.862349033355713, |
|
"fg_logps/policy_rejected": -8.392266273498535, |
|
"fg_logps/reference_KL": -11.065834999084473, |
|
"fg_logps/reference_chosen": -6.182069301605225, |
|
"fg_logps/reference_rejected": -7.583798408508301, |
|
"fg_loss": 0.8486608266830444, |
|
"fg_rewards/chosen_sum": -1.7188913822174072, |
|
"fg_rewards/rejected_sum": -0.4710962176322937, |
|
"grad_norm": 37.93308715806046, |
|
"kl": 0.0, |
|
"learning_rate": 2.713797035347776e-07, |
|
"logps/chosen": -336.33485504518075, |
|
"logps/rejected": -412.9320211038961, |
|
"loss": 0.4063, |
|
"rewards/chosen": 1.6464567988751881, |
|
"rewards/margins": 3.429747315121637, |
|
"rewards/rejected": -1.783290516246449, |
|
"step": 500 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.52941131591797, |
|
"count/fg_rejected": 6.1875, |
|
"epoch": 0.5230769230769231, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.041118621826172, |
|
"fg_logps/policy_chosen": -6.475778579711914, |
|
"fg_logps/policy_rejected": -8.933878898620605, |
|
"fg_logps/reference_KL": -11.386185646057129, |
|
"fg_logps/reference_chosen": -6.088446617126465, |
|
"fg_logps/reference_rejected": -8.2723970413208, |
|
"fg_loss": 0.6639065742492676, |
|
"fg_rewards/chosen_sum": -1.039247989654541, |
|
"fg_rewards/rejected_sum": -0.31398898363113403, |
|
"grad_norm": 51.003351409032156, |
|
"kl": 0.0, |
|
"learning_rate": 2.6567844925883693e-07, |
|
"logps/chosen": -315.5387290396341, |
|
"logps/rejected": -434.9411057692308, |
|
"loss": 0.4328, |
|
"rewards/chosen": 1.3493434626881669, |
|
"rewards/margins": 2.8553199195503964, |
|
"rewards/rejected": -1.5059764568622296, |
|
"step": 510 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.549999237060547, |
|
"count/fg_rejected": 7.300000190734863, |
|
"epoch": 0.5333333333333333, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.461477279663086, |
|
"fg_logps/policy_chosen": -7.2638678550720215, |
|
"fg_logps/policy_rejected": -8.44337272644043, |
|
"fg_logps/reference_KL": -12.010942459106445, |
|
"fg_logps/reference_chosen": -6.417178153991699, |
|
"fg_logps/reference_rejected": -7.042668342590332, |
|
"fg_loss": 0.8494647145271301, |
|
"fg_rewards/chosen_sum": -2.0671496391296387, |
|
"fg_rewards/rejected_sum": -0.9835360646247864, |
|
"grad_norm": 41.33647069198984, |
|
"kl": 0.0, |
|
"learning_rate": 2.599771949828962e-07, |
|
"logps/chosen": -332.418183117378, |
|
"logps/rejected": -371.52498998397436, |
|
"loss": 0.455, |
|
"rewards/chosen": 1.320475787651248, |
|
"rewards/margins": 3.076387394659962, |
|
"rewards/rejected": -1.755911607008714, |
|
"step": 520 |
|
}, |
|
{ |
|
"count/fg_chosen": 30.526315689086914, |
|
"count/fg_rejected": 9.11111068725586, |
|
"epoch": 0.5435897435897435, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.753694534301758, |
|
"fg_logps/policy_chosen": -6.313483238220215, |
|
"fg_logps/policy_rejected": -8.319221496582031, |
|
"fg_logps/reference_KL": -10.8753080368042, |
|
"fg_logps/reference_chosen": -5.6761603355407715, |
|
"fg_logps/reference_rejected": -7.430839538574219, |
|
"fg_loss": 0.7786957621574402, |
|
"fg_rewards/chosen_sum": -1.3800252676010132, |
|
"fg_rewards/rejected_sum": -1.1956380605697632, |
|
"grad_norm": 34.93340197656234, |
|
"kl": 0.0, |
|
"learning_rate": 2.542759407069555e-07, |
|
"logps/chosen": -311.92927758487656, |
|
"logps/rejected": -372.22604331487344, |
|
"loss": 0.448, |
|
"rewards/chosen": 1.0696545824592496, |
|
"rewards/margins": 3.1661021045864253, |
|
"rewards/rejected": -2.0964475221271757, |
|
"step": 530 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.764705657958984, |
|
"count/fg_rejected": 6.352941036224365, |
|
"epoch": 0.5538461538461539, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.03756046295166, |
|
"fg_logps/policy_chosen": -6.464415073394775, |
|
"fg_logps/policy_rejected": -7.652871608734131, |
|
"fg_logps/reference_KL": -10.35094165802002, |
|
"fg_logps/reference_chosen": -5.7773261070251465, |
|
"fg_logps/reference_rejected": -6.825320720672607, |
|
"fg_loss": 0.8047051429748535, |
|
"fg_rewards/chosen_sum": -1.6725194454193115, |
|
"fg_rewards/rejected_sum": -0.7678513526916504, |
|
"grad_norm": 41.240539642737886, |
|
"kl": 0.0, |
|
"learning_rate": 2.485746864310148e-07, |
|
"logps/chosen": -345.1589215158046, |
|
"logps/rejected": -397.16462435787673, |
|
"loss": 0.4683, |
|
"rewards/chosen": 1.1091806696749282, |
|
"rewards/margins": 2.9815141440301294, |
|
"rewards/rejected": -1.8723334743552011, |
|
"step": 540 |
|
}, |
|
{ |
|
"count/fg_chosen": 36.0, |
|
"count/fg_rejected": 7.176470756530762, |
|
"epoch": 0.5641025641025641, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.34708023071289, |
|
"fg_logps/policy_chosen": -6.144561767578125, |
|
"fg_logps/policy_rejected": -7.657267093658447, |
|
"fg_logps/reference_KL": -10.591930389404297, |
|
"fg_logps/reference_chosen": -5.485869884490967, |
|
"fg_logps/reference_rejected": -6.727408409118652, |
|
"fg_loss": 0.7655860185623169, |
|
"fg_rewards/chosen_sum": -1.8279484510421753, |
|
"fg_rewards/rejected_sum": -0.5319306254386902, |
|
"grad_norm": 34.418931407344175, |
|
"kl": 0.0, |
|
"learning_rate": 2.428734321550741e-07, |
|
"logps/chosen": -332.24548669763516, |
|
"logps/rejected": -384.9080214389535, |
|
"loss": 0.4533, |
|
"rewards/chosen": 1.4185297166978992, |
|
"rewards/margins": 3.59055198175513, |
|
"rewards/rejected": -2.172022265057231, |
|
"step": 550 |
|
}, |
|
{ |
|
"count/fg_chosen": 30.6842098236084, |
|
"count/fg_rejected": 5.842105388641357, |
|
"epoch": 0.5743589743589743, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.044573783874512, |
|
"fg_logps/policy_chosen": -6.35300874710083, |
|
"fg_logps/policy_rejected": -8.149062156677246, |
|
"fg_logps/reference_KL": -9.564573287963867, |
|
"fg_logps/reference_chosen": -5.577574253082275, |
|
"fg_logps/reference_rejected": -7.246463298797607, |
|
"fg_loss": 0.7160053849220276, |
|
"fg_rewards/chosen_sum": -1.8285123109817505, |
|
"fg_rewards/rejected_sum": -0.428337961435318, |
|
"grad_norm": 32.969063143022574, |
|
"kl": 0.0, |
|
"learning_rate": 2.371721778791334e-07, |
|
"logps/chosen": -404.944683908046, |
|
"logps/rejected": -394.23758561643837, |
|
"loss": 0.4521, |
|
"rewards/chosen": 1.1878378857141254, |
|
"rewards/margins": 2.883744642389532, |
|
"rewards/rejected": -1.6959067566754067, |
|
"step": 560 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.5, |
|
"count/fg_rejected": 6.733333110809326, |
|
"epoch": 0.5846153846153846, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.527681350708008, |
|
"fg_logps/policy_chosen": -6.548896789550781, |
|
"fg_logps/policy_rejected": -6.040011882781982, |
|
"fg_logps/reference_KL": -9.865092277526855, |
|
"fg_logps/reference_chosen": -5.893582344055176, |
|
"fg_logps/reference_rejected": -5.497416019439697, |
|
"fg_loss": 0.5755335092544556, |
|
"fg_rewards/chosen_sum": -1.5131157636642456, |
|
"fg_rewards/rejected_sum": -0.35849064588546753, |
|
"grad_norm": 40.43474566516004, |
|
"kl": 0.0, |
|
"learning_rate": 2.314709236031927e-07, |
|
"logps/chosen": -354.1348353794643, |
|
"logps/rejected": -417.60911800986844, |
|
"loss": 0.4105, |
|
"rewards/chosen": 0.7818209330240885, |
|
"rewards/margins": 3.425596471418414, |
|
"rewards/rejected": -2.6437755383943258, |
|
"step": 570 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.428571701049805, |
|
"count/fg_rejected": 7.599999904632568, |
|
"epoch": 0.5948717948717949, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.412928581237793, |
|
"fg_logps/policy_chosen": -6.371100902557373, |
|
"fg_logps/policy_rejected": -7.6736016273498535, |
|
"fg_logps/reference_KL": -10.896791458129883, |
|
"fg_logps/reference_chosen": -5.65976095199585, |
|
"fg_logps/reference_rejected": -6.982507228851318, |
|
"fg_loss": 0.7268858551979065, |
|
"fg_rewards/chosen_sum": -1.6253752708435059, |
|
"fg_rewards/rejected_sum": -0.8460947871208191, |
|
"grad_norm": 57.39186053322085, |
|
"kl": 0.0, |
|
"learning_rate": 2.2576966932725198e-07, |
|
"logps/chosen": -291.66895736882714, |
|
"logps/rejected": -384.9341623813291, |
|
"loss": 0.4683, |
|
"rewards/chosen": 1.6188120900848766, |
|
"rewards/margins": 2.9102534159847977, |
|
"rewards/rejected": -1.2914413258999209, |
|
"step": 580 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.904762268066406, |
|
"count/fg_rejected": 6.526315689086914, |
|
"epoch": 0.6051282051282051, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.970428466796875, |
|
"fg_logps/policy_chosen": -6.568854808807373, |
|
"fg_logps/policy_rejected": -8.221002578735352, |
|
"fg_logps/reference_KL": -11.08731746673584, |
|
"fg_logps/reference_chosen": -5.649308204650879, |
|
"fg_logps/reference_rejected": -7.402557373046875, |
|
"fg_loss": 0.705423891544342, |
|
"fg_rewards/chosen_sum": -2.060758590698242, |
|
"fg_rewards/rejected_sum": -0.8252547979354858, |
|
"grad_norm": 35.44361919546434, |
|
"kl": 0.0, |
|
"learning_rate": 2.2006841505131128e-07, |
|
"logps/chosen": -442.12862723214283, |
|
"logps/rejected": -406.4741981907895, |
|
"loss": 0.4365, |
|
"rewards/chosen": 1.5486488342285156, |
|
"rewards/margins": 3.623551418906764, |
|
"rewards/rejected": -2.0749025846782483, |
|
"step": 590 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.83333396911621, |
|
"count/fg_rejected": 6.2727274894714355, |
|
"epoch": 0.6153846153846154, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.578009605407715, |
|
"fg_logps/policy_chosen": -7.306002140045166, |
|
"fg_logps/policy_rejected": -8.329404830932617, |
|
"fg_logps/reference_KL": -11.412123680114746, |
|
"fg_logps/reference_chosen": -6.204747676849365, |
|
"fg_logps/reference_rejected": -6.661261081695557, |
|
"fg_loss": 0.649915337562561, |
|
"fg_rewards/chosen_sum": -2.443979263305664, |
|
"fg_rewards/rejected_sum": -0.9118065237998962, |
|
"grad_norm": 48.33661978525442, |
|
"kl": 0.0, |
|
"learning_rate": 2.1436716077537057e-07, |
|
"logps/chosen": -353.3546720805921, |
|
"logps/rejected": -475.7469773065476, |
|
"loss": 0.4239, |
|
"rewards/chosen": 1.3294219970703125, |
|
"rewards/margins": 3.233419145856585, |
|
"rewards/rejected": -1.9039971487862724, |
|
"step": 600 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.647058486938477, |
|
"count/fg_rejected": 6.470588207244873, |
|
"epoch": 0.6256410256410256, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.512959480285645, |
|
"fg_logps/policy_chosen": -6.967007160186768, |
|
"fg_logps/policy_rejected": -8.427515029907227, |
|
"fg_logps/reference_KL": -10.780054092407227, |
|
"fg_logps/reference_chosen": -6.145755290985107, |
|
"fg_logps/reference_rejected": -7.377350807189941, |
|
"fg_loss": 0.8846892714500427, |
|
"fg_rewards/chosen_sum": -1.872863531112671, |
|
"fg_rewards/rejected_sum": -0.8110222816467285, |
|
"grad_norm": 27.94628622877484, |
|
"kl": 0.0, |
|
"learning_rate": 2.0866590649942987e-07, |
|
"logps/chosen": -325.96284054487177, |
|
"logps/rejected": -358.4112280868902, |
|
"loss": 0.443, |
|
"rewards/chosen": 1.2946516183706431, |
|
"rewards/margins": 2.8088877894417656, |
|
"rewards/rejected": -1.5142361710711223, |
|
"step": 610 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.238094329833984, |
|
"count/fg_rejected": 6.949999809265137, |
|
"epoch": 0.6358974358974359, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.278539657592773, |
|
"fg_logps/policy_chosen": -6.847230434417725, |
|
"fg_logps/policy_rejected": -9.253725051879883, |
|
"fg_logps/reference_KL": -11.118898391723633, |
|
"fg_logps/reference_chosen": -5.77498722076416, |
|
"fg_logps/reference_rejected": -7.873915195465088, |
|
"fg_loss": 0.7475059628486633, |
|
"fg_rewards/chosen_sum": -2.7328929901123047, |
|
"fg_rewards/rejected_sum": -1.1806801557540894, |
|
"grad_norm": 37.21517260596127, |
|
"kl": 0.0, |
|
"learning_rate": 2.0296465222348917e-07, |
|
"logps/chosen": -345.37012924382714, |
|
"logps/rejected": -452.38132911392404, |
|
"loss": 0.4182, |
|
"rewards/chosen": 2.110634132667824, |
|
"rewards/margins": 4.059031805296879, |
|
"rewards/rejected": -1.9483976726290546, |
|
"step": 620 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.3125, |
|
"count/fg_rejected": 7.4375, |
|
"epoch": 0.6461538461538462, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.633122444152832, |
|
"fg_logps/policy_chosen": -6.758295059204102, |
|
"fg_logps/policy_rejected": -8.180941581726074, |
|
"fg_logps/reference_KL": -12.073482513427734, |
|
"fg_logps/reference_chosen": -5.663504600524902, |
|
"fg_logps/reference_rejected": -6.7309250831604, |
|
"fg_loss": 0.8814060091972351, |
|
"fg_rewards/chosen_sum": -2.544795513153076, |
|
"fg_rewards/rejected_sum": -1.284183144569397, |
|
"grad_norm": 42.32817075648944, |
|
"kl": 0.0, |
|
"learning_rate": 1.9726339794754846e-07, |
|
"logps/chosen": -351.775993441358, |
|
"logps/rejected": -522.9557950949367, |
|
"loss": 0.4343, |
|
"rewards/chosen": 1.261369964222849, |
|
"rewards/margins": 3.565065836083015, |
|
"rewards/rejected": -2.303695871860166, |
|
"step": 630 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.64706039428711, |
|
"count/fg_rejected": 8.058823585510254, |
|
"epoch": 0.6564102564102564, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.795089721679688, |
|
"fg_logps/policy_chosen": -6.418089389801025, |
|
"fg_logps/policy_rejected": -8.086543083190918, |
|
"fg_logps/reference_KL": -10.756563186645508, |
|
"fg_logps/reference_chosen": -5.482754230499268, |
|
"fg_logps/reference_rejected": -6.624981880187988, |
|
"fg_loss": 0.8172480463981628, |
|
"fg_rewards/chosen_sum": -2.2949178218841553, |
|
"fg_rewards/rejected_sum": -1.4898707866668701, |
|
"grad_norm": 49.06648139657291, |
|
"kl": 0.0, |
|
"learning_rate": 1.9156214367160776e-07, |
|
"logps/chosen": -323.18095703125, |
|
"logps/rejected": -457.679443359375, |
|
"loss": 0.3975, |
|
"rewards/chosen": 1.7368663787841796, |
|
"rewards/margins": 4.7420207977294915, |
|
"rewards/rejected": -3.0051544189453123, |
|
"step": 640 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.785715103149414, |
|
"count/fg_rejected": 6.0, |
|
"epoch": 0.6666666666666666, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.923561096191406, |
|
"fg_logps/policy_chosen": -7.209476470947266, |
|
"fg_logps/policy_rejected": -8.816498756408691, |
|
"fg_logps/reference_KL": -12.868348121643066, |
|
"fg_logps/reference_chosen": -5.93485164642334, |
|
"fg_logps/reference_rejected": -7.260035991668701, |
|
"fg_loss": 0.7890381217002869, |
|
"fg_rewards/chosen_sum": -2.429896831512451, |
|
"fg_rewards/rejected_sum": -0.9942983388900757, |
|
"grad_norm": 42.684857833622196, |
|
"kl": 0.0, |
|
"learning_rate": 1.8586088939566706e-07, |
|
"logps/chosen": -307.12525576636904, |
|
"logps/rejected": -396.83095189144734, |
|
"loss": 0.4325, |
|
"rewards/chosen": 1.7140017918178014, |
|
"rewards/margins": 3.020266568750367, |
|
"rewards/rejected": -1.3062647769325657, |
|
"step": 650 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.578948974609375, |
|
"count/fg_rejected": 5.5789475440979, |
|
"epoch": 0.676923076923077, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.829089164733887, |
|
"fg_logps/policy_chosen": -6.605353832244873, |
|
"fg_logps/policy_rejected": -8.679577827453613, |
|
"fg_logps/reference_KL": -11.020931243896484, |
|
"fg_logps/reference_chosen": -5.843183517456055, |
|
"fg_logps/reference_rejected": -7.641061305999756, |
|
"fg_loss": 0.7101105451583862, |
|
"fg_rewards/chosen_sum": -1.6802619695663452, |
|
"fg_rewards/rejected_sum": -0.5640377998352051, |
|
"grad_norm": 51.20082166149615, |
|
"kl": 0.0, |
|
"learning_rate": 1.8015963511972635e-07, |
|
"logps/chosen": -326.6531840479651, |
|
"logps/rejected": -467.38508234797297, |
|
"loss": 0.3955, |
|
"rewards/chosen": 1.7110205362009447, |
|
"rewards/margins": 3.708857912151114, |
|
"rewards/rejected": -1.9978373759501689, |
|
"step": 660 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.52941131591797, |
|
"count/fg_rejected": 8.882352828979492, |
|
"epoch": 0.6871794871794872, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.263018608093262, |
|
"fg_logps/policy_chosen": -7.245337009429932, |
|
"fg_logps/policy_rejected": -10.788996696472168, |
|
"fg_logps/reference_KL": -11.636683464050293, |
|
"fg_logps/reference_chosen": -6.2825164794921875, |
|
"fg_logps/reference_rejected": -8.302498817443848, |
|
"fg_loss": 0.9282689094543457, |
|
"fg_rewards/chosen_sum": -2.5015087127685547, |
|
"fg_rewards/rejected_sum": -1.4177420139312744, |
|
"grad_norm": 43.77061774737123, |
|
"kl": 0.16583053767681122, |
|
"learning_rate": 1.7445838084378562e-07, |
|
"logps/chosen": -347.34482020547944, |
|
"logps/rejected": -440.41316451149424, |
|
"loss": 0.4428, |
|
"rewards/chosen": 1.4226540343402183, |
|
"rewards/margins": 3.992055567094349, |
|
"rewards/rejected": -2.5694015327541306, |
|
"step": 670 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.0625, |
|
"count/fg_rejected": 6.1875, |
|
"epoch": 0.6974358974358974, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.69076156616211, |
|
"fg_logps/policy_chosen": -6.070934295654297, |
|
"fg_logps/policy_rejected": -9.86292552947998, |
|
"fg_logps/reference_KL": -10.596210479736328, |
|
"fg_logps/reference_chosen": -5.411348342895508, |
|
"fg_logps/reference_rejected": -8.194981575012207, |
|
"fg_loss": 0.7803856730461121, |
|
"fg_rewards/chosen_sum": -1.5216065645217896, |
|
"fg_rewards/rejected_sum": -0.7260585427284241, |
|
"grad_norm": 41.07172408412241, |
|
"kl": 0.0, |
|
"learning_rate": 1.6875712656784492e-07, |
|
"logps/chosen": -321.7051943824405, |
|
"logps/rejected": -402.75840357730266, |
|
"loss": 0.431, |
|
"rewards/chosen": 1.6818878537132627, |
|
"rewards/margins": 2.9571292035860526, |
|
"rewards/rejected": -1.27524134987279, |
|
"step": 680 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.04166793823242, |
|
"count/fg_rejected": 7.956521511077881, |
|
"epoch": 0.7076923076923077, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.681419372558594, |
|
"fg_logps/policy_chosen": -6.311405181884766, |
|
"fg_logps/policy_rejected": -8.458182334899902, |
|
"fg_logps/reference_KL": -10.43921947479248, |
|
"fg_logps/reference_chosen": -5.538773059844971, |
|
"fg_logps/reference_rejected": -7.221550941467285, |
|
"fg_loss": 0.774190366268158, |
|
"fg_rewards/chosen_sum": -2.0744409561157227, |
|
"fg_rewards/rejected_sum": -1.0601459741592407, |
|
"grad_norm": 26.76398723713879, |
|
"kl": 0.0, |
|
"learning_rate": 1.6305587229190422e-07, |
|
"logps/chosen": -337.47130408653845, |
|
"logps/rejected": -426.1006573932927, |
|
"loss": 0.4689, |
|
"rewards/chosen": 1.7125216753054888, |
|
"rewards/margins": 3.957819735280718, |
|
"rewards/rejected": -2.245298059975229, |
|
"step": 690 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.933332443237305, |
|
"count/fg_rejected": 7.5, |
|
"epoch": 0.717948717948718, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -18.62053871154785, |
|
"fg_logps/policy_chosen": -6.723959445953369, |
|
"fg_logps/policy_rejected": -8.809980392456055, |
|
"fg_logps/reference_KL": -14.621339797973633, |
|
"fg_logps/reference_chosen": -5.710059642791748, |
|
"fg_logps/reference_rejected": -7.013789176940918, |
|
"fg_loss": 0.7358676791191101, |
|
"fg_rewards/chosen_sum": -2.391604423522949, |
|
"fg_rewards/rejected_sum": -1.1176395416259766, |
|
"grad_norm": 32.71207561201088, |
|
"kl": 0.0, |
|
"learning_rate": 1.573546180159635e-07, |
|
"logps/chosen": -377.94707661290323, |
|
"logps/rejected": -423.6705923507463, |
|
"loss": 0.3983, |
|
"rewards/chosen": 1.9157637729439685, |
|
"rewards/margins": 3.7593996736406803, |
|
"rewards/rejected": -1.8436359006967118, |
|
"step": 700 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.0, |
|
"count/fg_rejected": 7.0, |
|
"epoch": 0.7282051282051282, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.678239822387695, |
|
"fg_logps/policy_chosen": -5.730792999267578, |
|
"fg_logps/policy_rejected": -6.297366142272949, |
|
"fg_logps/reference_KL": -10.122818946838379, |
|
"fg_logps/reference_chosen": -5.305339813232422, |
|
"fg_logps/reference_rejected": -5.903895378112793, |
|
"fg_loss": 0.8335784673690796, |
|
"fg_rewards/chosen_sum": -1.278421401977539, |
|
"fg_rewards/rejected_sum": -0.4984094202518463, |
|
"grad_norm": 30.093125040941494, |
|
"kl": 0.0, |
|
"learning_rate": 1.516533637400228e-07, |
|
"logps/chosen": -311.1296672077922, |
|
"logps/rejected": -378.9050263554217, |
|
"loss": 0.4411, |
|
"rewards/chosen": 1.7025673606178977, |
|
"rewards/margins": 4.042804529998545, |
|
"rewards/rejected": -2.3402371693806474, |
|
"step": 710 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.214284896850586, |
|
"count/fg_rejected": 4.538461685180664, |
|
"epoch": 0.7384615384615385, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.262101173400879, |
|
"fg_logps/policy_chosen": -6.9626898765563965, |
|
"fg_logps/policy_rejected": -9.714632034301758, |
|
"fg_logps/reference_KL": -11.78027057647705, |
|
"fg_logps/reference_chosen": -6.246325969696045, |
|
"fg_logps/reference_rejected": -8.87566089630127, |
|
"fg_loss": 0.7088484764099121, |
|
"fg_rewards/chosen_sum": -1.5614358186721802, |
|
"fg_rewards/rejected_sum": -0.42503052949905396, |
|
"grad_norm": 41.939062686169684, |
|
"kl": 0.0, |
|
"learning_rate": 1.459521094640821e-07, |
|
"logps/chosen": -357.324462890625, |
|
"logps/rejected": -446.333984375, |
|
"loss": 0.3344, |
|
"rewards/chosen": 1.5038203239440917, |
|
"rewards/margins": 4.055677318572998, |
|
"rewards/rejected": -2.5518569946289062, |
|
"step": 720 |
|
}, |
|
{ |
|
"count/fg_chosen": 34.75, |
|
"count/fg_rejected": 8.470588684082031, |
|
"epoch": 0.7487179487179487, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.44446086883545, |
|
"fg_logps/policy_chosen": -6.577872276306152, |
|
"fg_logps/policy_rejected": -8.81990909576416, |
|
"fg_logps/reference_KL": -10.716168403625488, |
|
"fg_logps/reference_chosen": -6.01681661605835, |
|
"fg_logps/reference_rejected": -8.056208610534668, |
|
"fg_loss": 0.7597875595092773, |
|
"fg_rewards/chosen_sum": -1.6217119693756104, |
|
"fg_rewards/rejected_sum": -1.0568969249725342, |
|
"grad_norm": 29.217422708191275, |
|
"kl": 0.0, |
|
"learning_rate": 1.402508551881414e-07, |
|
"logps/chosen": -325.07026041666666, |
|
"logps/rejected": -434.04232536764704, |
|
"loss": 0.3915, |
|
"rewards/chosen": 2.13581298828125, |
|
"rewards/margins": 5.36644473805147, |
|
"rewards/rejected": -3.2306317497702204, |
|
"step": 730 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.095237731933594, |
|
"count/fg_rejected": 8.7619047164917, |
|
"epoch": 0.7589743589743589, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.643653869628906, |
|
"fg_logps/policy_chosen": -7.633936405181885, |
|
"fg_logps/policy_rejected": -8.764749526977539, |
|
"fg_logps/reference_KL": -12.597454071044922, |
|
"fg_logps/reference_chosen": -6.6152801513671875, |
|
"fg_logps/reference_rejected": -7.493948936462402, |
|
"fg_loss": 0.8724325895309448, |
|
"fg_rewards/chosen_sum": -2.190063714981079, |
|
"fg_rewards/rejected_sum": -1.0442728996276855, |
|
"grad_norm": 35.508557979266605, |
|
"kl": 0.0, |
|
"learning_rate": 1.345496009122007e-07, |
|
"logps/chosen": -323.4399604301948, |
|
"logps/rejected": -447.21136106927713, |
|
"loss": 0.4821, |
|
"rewards/chosen": 1.2747364787312296, |
|
"rewards/margins": 3.8379994181843546, |
|
"rewards/rejected": -2.563262939453125, |
|
"step": 740 |
|
}, |
|
{ |
|
"count/fg_chosen": 35.55555725097656, |
|
"count/fg_rejected": 7.764705657958984, |
|
"epoch": 0.7692307692307693, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -12.14694881439209, |
|
"fg_logps/policy_chosen": -5.938383102416992, |
|
"fg_logps/policy_rejected": -6.999124526977539, |
|
"fg_logps/reference_KL": -9.31165599822998, |
|
"fg_logps/reference_chosen": -5.441190242767334, |
|
"fg_logps/reference_rejected": -6.349566459655762, |
|
"fg_loss": 0.7159730792045593, |
|
"fg_rewards/chosen_sum": -1.1845917701721191, |
|
"fg_rewards/rejected_sum": -0.6632856726646423, |
|
"grad_norm": 29.30713270318622, |
|
"kl": 0.0, |
|
"learning_rate": 1.2884834663625997e-07, |
|
"logps/chosen": -351.3742959665698, |
|
"logps/rejected": -477.6983741554054, |
|
"loss": 0.4247, |
|
"rewards/chosen": 1.214830709058185, |
|
"rewards/margins": 3.774690938550372, |
|
"rewards/rejected": -2.5598602294921875, |
|
"step": 750 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.450000762939453, |
|
"count/fg_rejected": 8.941176414489746, |
|
"epoch": 0.7794871794871795, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.91418743133545, |
|
"fg_logps/policy_chosen": -6.500802516937256, |
|
"fg_logps/policy_rejected": -7.696014404296875, |
|
"fg_logps/reference_KL": -10.872803688049316, |
|
"fg_logps/reference_chosen": -6.060533046722412, |
|
"fg_logps/reference_rejected": -6.9095940589904785, |
|
"fg_loss": 0.8821809887886047, |
|
"fg_rewards/chosen_sum": -1.222095012664795, |
|
"fg_rewards/rejected_sum": -0.8427726030349731, |
|
"grad_norm": 23.409794702847652, |
|
"kl": 0.0, |
|
"learning_rate": 1.2314709236031927e-07, |
|
"logps/chosen": -350.1331449468085, |
|
"logps/rejected": -399.28329190340907, |
|
"loss": 0.4556, |
|
"rewards/chosen": 1.079000432440575, |
|
"rewards/margins": 3.400327179525684, |
|
"rewards/rejected": -2.321326747085109, |
|
"step": 760 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.789474487304688, |
|
"count/fg_rejected": 7.0, |
|
"epoch": 0.7897435897435897, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.873922348022461, |
|
"fg_logps/policy_chosen": -7.308094024658203, |
|
"fg_logps/policy_rejected": -7.175009727478027, |
|
"fg_logps/reference_KL": -11.33347225189209, |
|
"fg_logps/reference_chosen": -6.172826766967773, |
|
"fg_logps/reference_rejected": -6.013812065124512, |
|
"fg_loss": 0.7144444584846497, |
|
"fg_rewards/chosen_sum": -2.5322296619415283, |
|
"fg_rewards/rejected_sum": -0.9527682065963745, |
|
"grad_norm": 39.24437423103846, |
|
"kl": 0.0, |
|
"learning_rate": 1.1744583808437855e-07, |
|
"logps/chosen": -331.80161458333333, |
|
"logps/rejected": -438.73373161764704, |
|
"loss": 0.4217, |
|
"rewards/chosen": 1.7163297526041668, |
|
"rewards/margins": 3.8777695360370714, |
|
"rewards/rejected": -2.1614397834329044, |
|
"step": 770 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.1875, |
|
"count/fg_rejected": 5.8125, |
|
"epoch": 0.8, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.304938316345215, |
|
"fg_logps/policy_chosen": -6.930516242980957, |
|
"fg_logps/policy_rejected": -7.694925785064697, |
|
"fg_logps/reference_KL": -10.620773315429688, |
|
"fg_logps/reference_chosen": -6.272948265075684, |
|
"fg_logps/reference_rejected": -6.997465133666992, |
|
"fg_loss": 0.7136435508728027, |
|
"fg_rewards/chosen_sum": -1.7926644086837769, |
|
"fg_rewards/rejected_sum": -0.3232946991920471, |
|
"grad_norm": 24.909110123805196, |
|
"kl": 0.0, |
|
"learning_rate": 1.1174458380843785e-07, |
|
"logps/chosen": -303.2761627906977, |
|
"logps/rejected": -392.0064400337838, |
|
"loss": 0.4265, |
|
"rewards/chosen": 1.246623904206032, |
|
"rewards/margins": 3.1565479731724597, |
|
"rewards/rejected": -1.9099240689664274, |
|
"step": 780 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.0, |
|
"count/fg_rejected": 6.0, |
|
"epoch": 0.8102564102564103, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.473762512207031, |
|
"fg_logps/policy_chosen": -6.058125972747803, |
|
"fg_logps/policy_rejected": -7.64716100692749, |
|
"fg_logps/reference_KL": -10.569419860839844, |
|
"fg_logps/reference_chosen": -5.636739730834961, |
|
"fg_logps/reference_rejected": -6.955509662628174, |
|
"fg_loss": 0.6918079257011414, |
|
"fg_rewards/chosen_sum": -0.8551385998725891, |
|
"fg_rewards/rejected_sum": -0.39899593591690063, |
|
"grad_norm": 22.682895979877173, |
|
"kl": 0.0, |
|
"learning_rate": 1.0604332953249714e-07, |
|
"logps/chosen": -329.2703077936747, |
|
"logps/rejected": -374.5271154626623, |
|
"loss": 0.4628, |
|
"rewards/chosen": 1.6247121052569653, |
|
"rewards/margins": 2.985569189456945, |
|
"rewards/rejected": -1.3608570841999796, |
|
"step": 790 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.46666717529297, |
|
"count/fg_rejected": 5.666666507720947, |
|
"epoch": 0.8205128205128205, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.543877601623535, |
|
"fg_logps/policy_chosen": -7.2420654296875, |
|
"fg_logps/policy_rejected": -8.499801635742188, |
|
"fg_logps/reference_KL": -11.280120849609375, |
|
"fg_logps/reference_chosen": -6.594120502471924, |
|
"fg_logps/reference_rejected": -7.241811275482178, |
|
"fg_loss": 0.6977981925010681, |
|
"fg_rewards/chosen_sum": -1.8107116222381592, |
|
"fg_rewards/rejected_sum": -0.6922832727432251, |
|
"grad_norm": 39.98049317260069, |
|
"kl": 0.0, |
|
"learning_rate": 1.0034207525655644e-07, |
|
"logps/chosen": -359.1409722222222, |
|
"logps/rejected": -515.9350446428572, |
|
"loss": 0.4043, |
|
"rewards/chosen": 1.5236521402994792, |
|
"rewards/margins": 3.4145729428245906, |
|
"rewards/rejected": -1.8909208025251116, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"eval_count/fg_chosen": 30.183246612548828, |
|
"eval_count/fg_rejected": 6.92391300201416, |
|
"eval_fg_kl": NaN, |
|
"eval_fg_logps/policy_KL": -14.794645309448242, |
|
"eval_fg_logps/policy_chosen": -6.733245849609375, |
|
"eval_fg_logps/policy_rejected": -8.626864433288574, |
|
"eval_fg_logps/reference_KL": -11.47359848022461, |
|
"eval_fg_logps/reference_chosen": -6.041894912719727, |
|
"eval_fg_logps/reference_rejected": -7.58065938949585, |
|
"eval_fg_loss": 0.762517511844635, |
|
"eval_fg_rewards/chosen_sum": -1.556026816368103, |
|
"eval_fg_rewards/rejected_sum": -0.9032577276229858, |
|
"eval_kl": 0.014131884090602398, |
|
"eval_logps/chosen": -336.04120131729667, |
|
"eval_logps/rejected": -406.1173232908459, |
|
"eval_loss": 0.41103363037109375, |
|
"eval_rewards/chosen": 1.7359535243503006, |
|
"eval_rewards/margins": 3.998709942730949, |
|
"eval_rewards/rejected": -2.262756418380649, |
|
"eval_runtime": 462.7715, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.938, |
|
"step": 800 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.733333587646484, |
|
"count/fg_rejected": 8.800000190734863, |
|
"epoch": 0.8307692307692308, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.521686553955078, |
|
"fg_logps/policy_chosen": -6.098317623138428, |
|
"fg_logps/policy_rejected": -7.37031888961792, |
|
"fg_logps/reference_KL": -11.138436317443848, |
|
"fg_logps/reference_chosen": -5.529090881347656, |
|
"fg_logps/reference_rejected": -6.567668437957764, |
|
"fg_loss": 0.7037224173545837, |
|
"fg_rewards/chosen_sum": -1.2442917823791504, |
|
"fg_rewards/rejected_sum": -0.9032351970672607, |
|
"grad_norm": 39.139843341578626, |
|
"kl": 0.0, |
|
"learning_rate": 9.464082098061574e-08, |
|
"logps/chosen": -351.8864535108025, |
|
"logps/rejected": -414.8218453322785, |
|
"loss": 0.4442, |
|
"rewards/chosen": 1.5498073248215665, |
|
"rewards/margins": 3.332525065809996, |
|
"rewards/rejected": -1.7827177409884296, |
|
"step": 810 |
|
}, |
|
{ |
|
"count/fg_chosen": 29.53333282470703, |
|
"count/fg_rejected": 8.714285850524902, |
|
"epoch": 0.841025641025641, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.194475173950195, |
|
"fg_logps/policy_chosen": -7.472283840179443, |
|
"fg_logps/policy_rejected": -8.897085189819336, |
|
"fg_logps/reference_KL": -12.283650398254395, |
|
"fg_logps/reference_chosen": -6.056351184844971, |
|
"fg_logps/reference_rejected": -7.35612154006958, |
|
"fg_loss": 0.8785532712936401, |
|
"fg_rewards/chosen_sum": -2.9767565727233887, |
|
"fg_rewards/rejected_sum": -1.322200059890747, |
|
"grad_norm": 18.084277972168177, |
|
"kl": 0.11083474010229111, |
|
"learning_rate": 8.893956670467502e-08, |
|
"logps/chosen": -357.466950491573, |
|
"logps/rejected": -436.3477937940141, |
|
"loss": 0.4174, |
|
"rewards/chosen": 1.0562572693556882, |
|
"rewards/margins": 4.643260735464843, |
|
"rewards/rejected": -3.587003466109155, |
|
"step": 820 |
|
}, |
|
{ |
|
"count/fg_chosen": 38.94117736816406, |
|
"count/fg_rejected": 9.133333206176758, |
|
"epoch": 0.8512820512820513, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.23822021484375, |
|
"fg_logps/policy_chosen": -6.3059844970703125, |
|
"fg_logps/policy_rejected": -7.230159282684326, |
|
"fg_logps/reference_KL": -12.178247451782227, |
|
"fg_logps/reference_chosen": -5.708430290222168, |
|
"fg_logps/reference_rejected": -5.989579200744629, |
|
"fg_loss": 0.8925216794013977, |
|
"fg_rewards/chosen_sum": -1.6965184211730957, |
|
"fg_rewards/rejected_sum": -1.320400357246399, |
|
"grad_norm": 36.47201338586735, |
|
"kl": 0.0, |
|
"learning_rate": 8.323831242873432e-08, |
|
"logps/chosen": -357.6101471656977, |
|
"logps/rejected": -458.4434121621622, |
|
"loss": 0.4677, |
|
"rewards/chosen": 1.8875178847202034, |
|
"rewards/margins": 3.2487285789043776, |
|
"rewards/rejected": -1.3612106941841744, |
|
"step": 830 |
|
}, |
|
{ |
|
"count/fg_chosen": 32.33333206176758, |
|
"count/fg_rejected": 6.941176414489746, |
|
"epoch": 0.8615384615384616, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.08393669128418, |
|
"fg_logps/policy_chosen": -6.780251502990723, |
|
"fg_logps/policy_rejected": -7.011376857757568, |
|
"fg_logps/reference_KL": -10.381501197814941, |
|
"fg_logps/reference_chosen": -6.354065418243408, |
|
"fg_logps/reference_rejected": -6.308195114135742, |
|
"fg_loss": 0.7612662315368652, |
|
"fg_rewards/chosen_sum": -1.2925324440002441, |
|
"fg_rewards/rejected_sum": -0.4022027552127838, |
|
"grad_norm": 32.781155840821306, |
|
"kl": 0.0, |
|
"learning_rate": 7.753705815279361e-08, |
|
"logps/chosen": -331.03251953125, |
|
"logps/rejected": -414.158447265625, |
|
"loss": 0.4681, |
|
"rewards/chosen": 1.1188287734985352, |
|
"rewards/margins": 3.2083324432373046, |
|
"rewards/rejected": -2.0895036697387694, |
|
"step": 840 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.047618865966797, |
|
"count/fg_rejected": 9.380952835083008, |
|
"epoch": 0.8717948717948718, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.173805236816406, |
|
"fg_logps/policy_chosen": -6.497308731079102, |
|
"fg_logps/policy_rejected": -8.013084411621094, |
|
"fg_logps/reference_KL": -10.995800018310547, |
|
"fg_logps/reference_chosen": -5.791131973266602, |
|
"fg_logps/reference_rejected": -7.026320457458496, |
|
"fg_loss": 0.8731069564819336, |
|
"fg_rewards/chosen_sum": -1.6361898183822632, |
|
"fg_rewards/rejected_sum": -1.0837599039077759, |
|
"grad_norm": 29.75869838237667, |
|
"kl": 0.0, |
|
"learning_rate": 7.183580387685291e-08, |
|
"logps/chosen": -385.02049512987014, |
|
"logps/rejected": -396.0473926957831, |
|
"loss": 0.4773, |
|
"rewards/chosen": 1.234356471470424, |
|
"rewards/margins": 3.495361367519464, |
|
"rewards/rejected": -2.26100489604904, |
|
"step": 850 |
|
}, |
|
{ |
|
"count/fg_chosen": 30.14285659790039, |
|
"count/fg_rejected": 5.857142925262451, |
|
"epoch": 0.882051282051282, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.854009628295898, |
|
"fg_logps/policy_chosen": -6.418879508972168, |
|
"fg_logps/policy_rejected": -8.779900550842285, |
|
"fg_logps/reference_KL": -12.032855033874512, |
|
"fg_logps/reference_chosen": -5.604477405548096, |
|
"fg_logps/reference_rejected": -7.313387870788574, |
|
"fg_loss": 0.8508789539337158, |
|
"fg_rewards/chosen_sum": -1.4381144046783447, |
|
"fg_rewards/rejected_sum": -0.7955017685890198, |
|
"grad_norm": 25.28218871544972, |
|
"kl": 0.0, |
|
"learning_rate": 6.613454960091219e-08, |
|
"logps/chosen": -300.05953414351853, |
|
"logps/rejected": -383.09023931962025, |
|
"loss": 0.3998, |
|
"rewards/chosen": 1.808587156696084, |
|
"rewards/margins": 3.9609218286823378, |
|
"rewards/rejected": -2.152334671986254, |
|
"step": 860 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.25, |
|
"count/fg_rejected": 5.800000190734863, |
|
"epoch": 0.8923076923076924, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.163013458251953, |
|
"fg_logps/policy_chosen": -6.579830646514893, |
|
"fg_logps/policy_rejected": -7.748932361602783, |
|
"fg_logps/reference_KL": -10.930763244628906, |
|
"fg_logps/reference_chosen": -5.884364128112793, |
|
"fg_logps/reference_rejected": -6.846892356872559, |
|
"fg_loss": 0.7400026321411133, |
|
"fg_rewards/chosen_sum": -1.4189780950546265, |
|
"fg_rewards/rejected_sum": -0.8359920978546143, |
|
"grad_norm": 35.432818767338894, |
|
"kl": 0.0, |
|
"learning_rate": 6.043329532497149e-08, |
|
"logps/chosen": -343.70828125, |
|
"logps/rejected": -468.51098345588235, |
|
"loss": 0.3876, |
|
"rewards/chosen": 1.4050553385416666, |
|
"rewards/margins": 4.349431846469056, |
|
"rewards/rejected": -2.9443765079273896, |
|
"step": 870 |
|
}, |
|
{ |
|
"count/fg_chosen": 23.538461685180664, |
|
"count/fg_rejected": 6.166666507720947, |
|
"epoch": 0.9025641025641026, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.849803924560547, |
|
"fg_logps/policy_chosen": -6.889503002166748, |
|
"fg_logps/policy_rejected": -9.813651084899902, |
|
"fg_logps/reference_KL": -11.724679946899414, |
|
"fg_logps/reference_chosen": -6.115569114685059, |
|
"fg_logps/reference_rejected": -8.466334342956543, |
|
"fg_loss": 0.923600435256958, |
|
"fg_rewards/chosen_sum": -1.7626667022705078, |
|
"fg_rewards/rejected_sum": -0.8649892210960388, |
|
"grad_norm": 28.479901425326158, |
|
"kl": 0.0, |
|
"learning_rate": 5.4732041049030787e-08, |
|
"logps/chosen": -310.8540810032895, |
|
"logps/rejected": -370.5546642485119, |
|
"loss": 0.3796, |
|
"rewards/chosen": 2.1148808127955387, |
|
"rewards/margins": 4.169318722603016, |
|
"rewards/rejected": -2.054437909807478, |
|
"step": 880 |
|
}, |
|
{ |
|
"count/fg_chosen": 28.647058486938477, |
|
"count/fg_rejected": 7.214285850524902, |
|
"epoch": 0.9128205128205128, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.122632026672363, |
|
"fg_logps/policy_chosen": -6.575372219085693, |
|
"fg_logps/policy_rejected": -8.797600746154785, |
|
"fg_logps/reference_KL": -11.014878273010254, |
|
"fg_logps/reference_chosen": -6.147511005401611, |
|
"fg_logps/reference_rejected": -7.686254024505615, |
|
"fg_loss": 0.7267153263092041, |
|
"fg_rewards/chosen_sum": -1.0292545557022095, |
|
"fg_rewards/rejected_sum": -1.021606206893921, |
|
"grad_norm": 14.860543570560992, |
|
"kl": 0.0, |
|
"learning_rate": 4.9030786773090077e-08, |
|
"logps/chosen": -310.51392463235294, |
|
"logps/rejected": -438.26770833333336, |
|
"loss": 0.4036, |
|
"rewards/chosen": 1.7528151568244486, |
|
"rewards/margins": 4.661474010991116, |
|
"rewards/rejected": -2.9086588541666667, |
|
"step": 890 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.649999618530273, |
|
"count/fg_rejected": 5.099999904632568, |
|
"epoch": 0.9230769230769231, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.193872451782227, |
|
"fg_logps/policy_chosen": -6.921626091003418, |
|
"fg_logps/policy_rejected": -9.522343635559082, |
|
"fg_logps/reference_KL": -11.5450439453125, |
|
"fg_logps/reference_chosen": -6.295389175415039, |
|
"fg_logps/reference_rejected": -8.422341346740723, |
|
"fg_loss": 0.8100715279579163, |
|
"fg_rewards/chosen_sum": -1.2453607320785522, |
|
"fg_rewards/rejected_sum": -0.6394971609115601, |
|
"grad_norm": 26.313505637704687, |
|
"kl": 0.0, |
|
"learning_rate": 4.332953249714937e-08, |
|
"logps/chosen": -379.3786095727848, |
|
"logps/rejected": -469.6590470679012, |
|
"loss": 0.4082, |
|
"rewards/chosen": 1.5717159222952928, |
|
"rewards/margins": 4.730814041206847, |
|
"rewards/rejected": -3.1590981189115546, |
|
"step": 900 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.809524536132812, |
|
"count/fg_rejected": 7.949999809265137, |
|
"epoch": 0.9333333333333333, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -13.88550090789795, |
|
"fg_logps/policy_chosen": -6.575550556182861, |
|
"fg_logps/policy_rejected": -8.97218132019043, |
|
"fg_logps/reference_KL": -10.913046836853027, |
|
"fg_logps/reference_chosen": -5.786440372467041, |
|
"fg_logps/reference_rejected": -8.161312103271484, |
|
"fg_loss": 0.6918947696685791, |
|
"fg_rewards/chosen_sum": -1.8372831344604492, |
|
"fg_rewards/rejected_sum": -0.7708438038825989, |
|
"grad_norm": 30.704094667430244, |
|
"kl": 0.0, |
|
"learning_rate": 3.762827822120866e-08, |
|
"logps/chosen": -342.4310891544118, |
|
"logps/rejected": -465.26286458333334, |
|
"loss": 0.394, |
|
"rewards/chosen": 1.6974679385914522, |
|
"rewards/margins": 5.162705975700828, |
|
"rewards/rejected": -3.465238037109375, |
|
"step": 910 |
|
}, |
|
{ |
|
"count/fg_chosen": 33.44444274902344, |
|
"count/fg_rejected": 7.05555534362793, |
|
"epoch": 0.9435897435897436, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.852989196777344, |
|
"fg_logps/policy_chosen": -8.116382598876953, |
|
"fg_logps/policy_rejected": -9.44908332824707, |
|
"fg_logps/reference_KL": -12.696809768676758, |
|
"fg_logps/reference_chosen": -6.711515426635742, |
|
"fg_logps/reference_rejected": -8.2290678024292, |
|
"fg_loss": 0.8823240995407104, |
|
"fg_rewards/chosen_sum": -2.763169527053833, |
|
"fg_rewards/rejected_sum": -0.838661789894104, |
|
"grad_norm": 28.750070922344392, |
|
"kl": 0.0, |
|
"learning_rate": 3.192702394526796e-08, |
|
"logps/chosen": -369.98503449675326, |
|
"logps/rejected": -378.85448042168673, |
|
"loss": 0.455, |
|
"rewards/chosen": 1.3270432113052963, |
|
"rewards/margins": 4.1537548429882785, |
|
"rewards/rejected": -2.826711631682982, |
|
"step": 920 |
|
}, |
|
{ |
|
"count/fg_chosen": 31.0, |
|
"count/fg_rejected": 7.222222328186035, |
|
"epoch": 0.9538461538461539, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -16.319583892822266, |
|
"fg_logps/policy_chosen": -7.1412739753723145, |
|
"fg_logps/policy_rejected": -8.26430892944336, |
|
"fg_logps/reference_KL": -12.383834838867188, |
|
"fg_logps/reference_chosen": -6.26031494140625, |
|
"fg_logps/reference_rejected": -7.345766544342041, |
|
"fg_loss": 0.7640350461006165, |
|
"fg_rewards/chosen_sum": -1.9626283645629883, |
|
"fg_rewards/rejected_sum": -0.9573346376419067, |
|
"grad_norm": 30.27804248529551, |
|
"kl": 0.0, |
|
"learning_rate": 2.6225769669327253e-08, |
|
"logps/chosen": -328.4565281723485, |
|
"logps/rejected": -376.20595079787233, |
|
"loss": 0.3767, |
|
"rewards/chosen": 2.0332070552941524, |
|
"rewards/margins": 5.176523121305622, |
|
"rewards/rejected": -3.1433160660114696, |
|
"step": 930 |
|
}, |
|
{ |
|
"count/fg_chosen": 30.625, |
|
"count/fg_rejected": 7.0, |
|
"epoch": 0.9641025641025641, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.706071853637695, |
|
"fg_logps/policy_chosen": -7.53517484664917, |
|
"fg_logps/policy_rejected": -10.4346284866333, |
|
"fg_logps/reference_KL": -11.768562316894531, |
|
"fg_logps/reference_chosen": -6.489566326141357, |
|
"fg_logps/reference_rejected": -9.170198440551758, |
|
"fg_loss": 0.7636561989784241, |
|
"fg_rewards/chosen_sum": -2.5118489265441895, |
|
"fg_rewards/rejected_sum": -0.7650282979011536, |
|
"grad_norm": 39.27702261397548, |
|
"kl": 0.0, |
|
"learning_rate": 2.0524515393386543e-08, |
|
"logps/chosen": -346.63032670454544, |
|
"logps/rejected": -424.40479103915663, |
|
"loss": 0.4212, |
|
"rewards/chosen": 1.459620389071378, |
|
"rewards/margins": 4.2732272947213215, |
|
"rewards/rejected": -2.8136069056499435, |
|
"step": 940 |
|
}, |
|
{ |
|
"count/fg_chosen": 25.272727966308594, |
|
"count/fg_rejected": 4.363636493682861, |
|
"epoch": 0.9743589743589743, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -14.18252182006836, |
|
"fg_logps/policy_chosen": -6.842197895050049, |
|
"fg_logps/policy_rejected": -7.672128677368164, |
|
"fg_logps/reference_KL": -10.544868469238281, |
|
"fg_logps/reference_chosen": -5.763217449188232, |
|
"fg_logps/reference_rejected": -6.418917655944824, |
|
"fg_loss": 0.6513127088546753, |
|
"fg_rewards/chosen_sum": -1.7274693250656128, |
|
"fg_rewards/rejected_sum": -0.795689582824707, |
|
"grad_norm": 23.77562108897806, |
|
"kl": 0.0, |
|
"learning_rate": 1.4823261117445838e-08, |
|
"logps/chosen": -349.6753555689103, |
|
"logps/rejected": -403.2014100609756, |
|
"loss": 0.3828, |
|
"rewards/chosen": 1.3632464286608574, |
|
"rewards/margins": 3.730917743923815, |
|
"rewards/rejected": -2.367671315262957, |
|
"step": 950 |
|
}, |
|
{ |
|
"count/fg_chosen": 26.649999618530273, |
|
"count/fg_rejected": 6.5, |
|
"epoch": 0.9846153846153847, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.6975736618042, |
|
"fg_logps/policy_chosen": -7.5320539474487305, |
|
"fg_logps/policy_rejected": -9.357258796691895, |
|
"fg_logps/reference_KL": -12.039111137390137, |
|
"fg_logps/reference_chosen": -6.5699005126953125, |
|
"fg_logps/reference_rejected": -8.148908615112305, |
|
"fg_loss": 0.6992577910423279, |
|
"fg_rewards/chosen_sum": -1.6473188400268555, |
|
"fg_rewards/rejected_sum": -0.9504286646842957, |
|
"grad_norm": 29.616218116706296, |
|
"kl": 0.0, |
|
"learning_rate": 9.122006841505132e-09, |
|
"logps/chosen": -366.2652652138158, |
|
"logps/rejected": -412.12672061011904, |
|
"loss": 0.4654, |
|
"rewards/chosen": 0.8081491369950143, |
|
"rewards/margins": 3.221180578819791, |
|
"rewards/rejected": -2.413031441824777, |
|
"step": 960 |
|
}, |
|
{ |
|
"count/fg_chosen": 27.294116973876953, |
|
"count/fg_rejected": 6.294117450714111, |
|
"epoch": 0.9948717948717949, |
|
"fg_kl": NaN, |
|
"fg_logps/policy_KL": -15.349953651428223, |
|
"fg_logps/policy_chosen": -7.822242259979248, |
|
"fg_logps/policy_rejected": -8.426324844360352, |
|
"fg_logps/reference_KL": -11.580068588256836, |
|
"fg_logps/reference_chosen": -6.358785629272461, |
|
"fg_logps/reference_rejected": -7.00697135925293, |
|
"fg_loss": 0.8312911987304688, |
|
"fg_rewards/chosen_sum": -2.5030882358551025, |
|
"fg_rewards/rejected_sum": -0.9714083671569824, |
|
"grad_norm": 26.639106291790316, |
|
"kl": 0.0, |
|
"learning_rate": 3.420752565564424e-09, |
|
"logps/chosen": -419.9399809966216, |
|
"logps/rejected": -412.7228379360465, |
|
"loss": 0.4666, |
|
"rewards/chosen": 0.5730146459631018, |
|
"rewards/margins": 2.736655932413865, |
|
"rewards/rejected": -2.1636412864507633, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 975, |
|
"total_flos": 0.0, |
|
"train_loss": 0.45996271347388246, |
|
"train_runtime": 8430.3956, |
|
"train_samples_per_second": 1.85, |
|
"train_steps_per_second": 0.116 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 975, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|