diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25399 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.247011952191235, + "eval_steps": 500, + "global_step": 1692, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "logps_train/chosen": -84.51311492919922, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -93.47111511230469, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.0958428904414177, + "rewards_train/margins": -0.016700252890586853, + "rewards_train/rejected": -0.07914263755083084, + "step": 0 + }, + { + "epoch": 0.0, + "logps_train/chosen": -93.37198638916016, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -107.01815032958984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.13563546538352966, + "rewards_train/margins": 0.08024188876152039, + "rewards_train/rejected": -0.21587735414505005, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.622516556291391e-09, + "loss": 0.6883, + "step": 2 + }, + { + "epoch": 0.0, + "logps_train/chosen": -91.27803039550781, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -93.9560775756836, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.15905329585075378, + "rewards_train/margins": -0.3353206366300583, + "rewards_train/rejected": 0.1762673407793045, + "step": 2 + }, + { + "epoch": 0.0, + "logps_train/chosen": -68.57564544677734, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -68.01426696777344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.021627191454172134, + "rewards_train/margins": 0.12198681756854057, + "rewards_train/rejected": -0.1436140090227127, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.3245033112582781e-08, + "loss": 0.767, + "step": 4 + }, + { + "epoch": 0.01, + "logps_train/chosen": -62.96977996826172, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -55.99552917480469, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.16729095578193665, + "rewards_train/margins": -0.17867529951035976, + "rewards_train/rejected": 0.011384343728423119, + "step": 4 + }, + { + "epoch": 0.01, + "logps_train/chosen": -66.53120422363281, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -66.25678253173828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03125469759106636, + "rewards_train/margins": 0.041307687759399414, + "rewards_train/rejected": -0.010052990168333054, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.9867549668874173e-08, + "loss": 0.7398, + "step": 6 + }, + { + "epoch": 0.01, + "logps_train/chosen": -61.914283752441406, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -71.35198974609375, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.005490925163030624, + "rewards_train/margins": -0.181229617446661, + "rewards_train/rejected": 0.17573869228363037, + "step": 6 + }, + { + "epoch": 0.01, + "logps_train/chosen": -97.30050659179688, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -89.40110778808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.28869879245758057, + "rewards_train/margins": 0.16318489611148834, + "rewards_train/rejected": 0.12551389634609222, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 2.6490066225165563e-08, + "loss": 0.7093, + "step": 8 + }, + { + "epoch": 0.01, + "logps_train/chosen": -71.537353515625, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -75.89208221435547, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.031029988080263138, + "rewards_train/margins": 0.1296129710972309, + "rewards_train/rejected": -0.09858298301696777, + "step": 8 + }, + { + "epoch": 0.01, + "logps_train/chosen": -56.81492233276367, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -57.13829040527344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.10024227946996689, + "rewards_train/margins": -0.02547553926706314, + "rewards_train/rejected": -0.07476674020290375, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 3.311258278145695e-08, + "loss": 0.6746, + "step": 10 + }, + { + "epoch": 0.01, + "logps_train/chosen": -48.844825744628906, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -57.598480224609375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1053614467382431, + "rewards_train/margins": 0.04724089428782463, + "rewards_train/rejected": 0.05812055245041847, + "step": 10 + }, + { + "epoch": 0.01, + "logps_train/chosen": -52.23993682861328, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -61.658687591552734, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.2528998851776123, + "rewards_train/margins": -0.14484356343746185, + "rewards_train/rejected": -0.10805632174015045, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 3.9735099337748346e-08, + "loss": 0.7293, + "step": 12 + }, + { + "epoch": 0.02, + "logps_train/chosen": -59.14757537841797, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.62351989746094, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.09913238137960434, + "rewards_train/margins": -0.028967678546905518, + "rewards_train/rejected": -0.07016470283269882, + "step": 12 + }, + { + "epoch": 0.02, + "logps_train/chosen": -51.99066162109375, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -58.12510299682617, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.16156604886054993, + "rewards_train/margins": -0.12093072757124901, + "rewards_train/rejected": -0.04063532128930092, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 4.635761589403973e-08, + "loss": 0.7412, + "step": 14 + }, + { + "epoch": 0.02, + "logps_train/chosen": -77.4368896484375, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -84.78419494628906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.04693605750799179, + "rewards_train/margins": -0.02464430034160614, + "rewards_train/rejected": 0.07158035784959793, + "step": 14 + }, + { + "epoch": 0.02, + "logps_train/chosen": -44.22900390625, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -52.039039611816406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.013036925345659256, + "rewards_train/margins": 0.18959732726216316, + "rewards_train/rejected": -0.1765604019165039, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 5.2980132450331126e-08, + "loss": 0.6629, + "step": 16 + }, + { + "epoch": 0.02, + "logps_train/chosen": -44.198028564453125, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -47.66931915283203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01144709624350071, + "rewards_train/margins": 0.0041603464633226395, + "rewards_train/rejected": 0.00728674978017807, + "step": 16 + }, + { + "epoch": 0.02, + "logps_train/chosen": -39.628387451171875, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -47.1257438659668, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0684109702706337, + "rewards_train/margins": 0.1266883872449398, + "rewards_train/rejected": -0.05827741697430611, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 5.960264900662251e-08, + "loss": 0.6637, + "step": 18 + }, + { + "epoch": 0.02, + "logps_train/chosen": -43.85874938964844, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -41.347694396972656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.014125023037195206, + "rewards_train/margins": -0.0018867962062358856, + "rewards_train/rejected": 0.01601181924343109, + "step": 18 + }, + { + "epoch": 0.03, + "logps_train/chosen": -53.821372985839844, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -61.67257308959961, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.10713744163513184, + "rewards_train/margins": -0.15550511330366135, + "rewards_train/rejected": 0.04836767166852951, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 6.62251655629139e-08, + "loss": 0.7409, + "step": 20 + }, + { + "epoch": 0.03, + "logps_train/chosen": -71.7474136352539, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -58.922584533691406, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.14974145591259003, + "rewards_train/margins": -0.07467035949230194, + "rewards_train/rejected": -0.07507109642028809, + "step": 20 + }, + { + "epoch": 0.03, + "logps_train/chosen": -86.9837646484375, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -90.32958984375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1313113570213318, + "rewards_train/margins": 0.2759898006916046, + "rewards_train/rejected": -0.14467844367027283, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 7.284768211920529e-08, + "loss": 0.6575, + "step": 22 + }, + { + "epoch": 0.03, + "logps_train/chosen": -67.74806213378906, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -86.9749755859375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.028709255158901215, + "rewards_train/margins": 0.004331827163696289, + "rewards_train/rejected": 0.024377427995204926, + "step": 22 + }, + { + "epoch": 0.03, + "logps_train/chosen": -69.238525390625, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -88.7351303100586, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": 0.03630336374044418, + "rewards_train/margins": -0.1839330866932869, + "rewards_train/rejected": 0.22023645043373108, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 7.947019867549669e-08, + "loss": 0.7637, + "step": 24 + }, + { + "epoch": 0.03, + "logps_train/chosen": -36.02545166015625, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -42.00984191894531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10682964324951172, + "rewards_train/margins": 0.09687619283795357, + "rewards_train/rejected": 0.009953450411558151, + "step": 24 + }, + { + "epoch": 0.03, + "logps_train/chosen": -38.06768035888672, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -46.420928955078125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.10286159813404083, + "rewards_train/margins": -0.05803433805704117, + "rewards_train/rejected": -0.044827260076999664, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 8.609271523178807e-08, + "loss": 0.6898, + "step": 26 + }, + { + "epoch": 0.03, + "logps_train/chosen": -66.19087982177734, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -70.36297607421875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.11450610309839249, + "rewards_train/margins": 0.13517922163009644, + "rewards_train/rejected": -0.02067311853170395, + "step": 26 + }, + { + "epoch": 0.04, + "logps_train/chosen": -64.31070709228516, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -71.96617126464844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.00017967447638511658, + "rewards_train/margins": -0.04382825270295143, + "rewards_train/rejected": 0.04400792717933655, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 9.271523178807946e-08, + "loss": 0.6823, + "step": 28 + }, + { + "epoch": 0.04, + "logps_train/chosen": -45.852569580078125, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.71060180664062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.08291341364383698, + "rewards_train/margins": -0.010290443897247314, + "rewards_train/rejected": -0.07262296974658966, + "step": 28 + }, + { + "epoch": 0.04, + "logps_train/chosen": -58.55830764770508, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -48.15802764892578, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.0761432871222496, + "rewards_train/margins": -0.02830956131219864, + "rewards_train/rejected": -0.047833725810050964, + "step": 29 + }, + { + "epoch": 0.04, + "learning_rate": 9.933774834437085e-08, + "loss": 0.7121, + "step": 30 + }, + { + "epoch": 0.04, + "logps_train/chosen": -78.21870422363281, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -84.91961669921875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.18594178557395935, + "rewards_train/margins": -0.034597113728523254, + "rewards_train/rejected": 0.2205388993024826, + "step": 30 + }, + { + "epoch": 0.04, + "logps_train/chosen": -45.589080810546875, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -52.50957107543945, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.038107212632894516, + "rewards_train/margins": -0.12816570326685905, + "rewards_train/rejected": 0.09005849063396454, + "step": 31 + }, + { + "epoch": 0.04, + "learning_rate": 1.0596026490066225e-07, + "loss": 0.7561, + "step": 32 + }, + { + "epoch": 0.04, + "logps_train/chosen": -66.94876098632812, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -78.79197692871094, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.030032292008399963, + "rewards_train/margins": -0.1750534325838089, + "rewards_train/rejected": 0.14502114057540894, + "step": 32 + }, + { + "epoch": 0.04, + "logps_train/chosen": -71.43030548095703, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -67.35134887695312, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.10709311813116074, + "rewards_train/margins": 0.007729530334472656, + "rewards_train/rejected": -0.11482264846563339, + "step": 33 + }, + { + "epoch": 0.05, + "learning_rate": 1.1258278145695364e-07, + "loss": 0.7448, + "step": 34 + }, + { + "epoch": 0.05, + "logps_train/chosen": -42.91541290283203, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -47.038291931152344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.025721075013279915, + "rewards_train/margins": -0.025016890838742256, + "rewards_train/rejected": -0.0007041841745376587, + "step": 34 + }, + { + "epoch": 0.05, + "logps_train/chosen": -56.560001373291016, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -53.944644927978516, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.09428122639656067, + "rewards_train/margins": -0.020129263401031494, + "rewards_train/rejected": -0.07415196299552917, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 1.1920529801324502e-07, + "loss": 0.714, + "step": 36 + }, + { + "epoch": 0.05, + "logps_train/chosen": -34.96800231933594, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -53.19981002807617, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.09406614303588867, + "rewards_train/margins": -0.22877249121665955, + "rewards_train/rejected": 0.13470634818077087, + "step": 36 + }, + { + "epoch": 0.05, + "logps_train/chosen": -83.79179382324219, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -73.38970184326172, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.041914697736501694, + "rewards_train/margins": 0.19865834340453148, + "rewards_train/rejected": -0.15674364566802979, + "step": 37 + }, + { + "epoch": 0.05, + "learning_rate": 1.2582781456953642e-07, + "loss": 0.736, + "step": 38 + }, + { + "epoch": 0.05, + "logps_train/chosen": -50.90718078613281, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -53.10429763793945, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.027436543256044388, + "rewards_train/margins": 0.13768086954951286, + "rewards_train/rejected": -0.16511741280555725, + "step": 38 + }, + { + "epoch": 0.05, + "logps_train/chosen": -51.373138427734375, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -44.57908630371094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.015810921788215637, + "rewards_train/margins": 0.08465710282325745, + "rewards_train/rejected": -0.06884618103504181, + "step": 39 + }, + { + "epoch": 0.05, + "learning_rate": 1.324503311258278e-07, + "loss": 0.6463, + "step": 40 + }, + { + "epoch": 0.05, + "logps_train/chosen": -62.13481903076172, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -58.01111602783203, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06776849925518036, + "rewards_train/margins": 0.07356762979179621, + "rewards_train/rejected": -0.0057991305366158485, + "step": 40 + }, + { + "epoch": 0.05, + "logps_train/chosen": -93.16519165039062, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -83.23350524902344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.01339399442076683, + "rewards_train/margins": -0.15098082646727562, + "rewards_train/rejected": 0.1375868320465088, + "step": 41 + }, + { + "epoch": 0.06, + "learning_rate": 1.390728476821192e-07, + "loss": 0.7213, + "step": 42 + }, + { + "epoch": 0.06, + "logps_train/chosen": -68.20494079589844, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -75.3348388671875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.023731615394353867, + "rewards_train/margins": 0.015028480440378189, + "rewards_train/rejected": 0.008703134953975677, + "step": 42 + }, + { + "epoch": 0.06, + "logps_train/chosen": -38.220970153808594, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -50.798240661621094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.043137263506650925, + "rewards_train/margins": 0.12178956344723701, + "rewards_train/rejected": -0.07865229994058609, + "step": 43 + }, + { + "epoch": 0.06, + "learning_rate": 1.4569536423841058e-07, + "loss": 0.6651, + "step": 44 + }, + { + "epoch": 0.06, + "logps_train/chosen": -86.4616928100586, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -92.05072784423828, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.11179409921169281, + "rewards_train/margins": -0.18172165006399155, + "rewards_train/rejected": 0.06992755085229874, + "step": 44 + }, + { + "epoch": 0.06, + "logps_train/chosen": -46.09622573852539, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -50.20856475830078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06693994998931885, + "rewards_train/margins": 0.0014684349298477173, + "rewards_train/rejected": 0.06547151505947113, + "step": 45 + }, + { + "epoch": 0.06, + "learning_rate": 1.5231788079470197e-07, + "loss": 0.7467, + "step": 46 + }, + { + "epoch": 0.06, + "logps_train/chosen": -49.72835922241211, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -40.57377243041992, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03846099227666855, + "rewards_train/margins": -0.011552426964044571, + "rewards_train/rejected": -0.026908565312623978, + "step": 46 + }, + { + "epoch": 0.06, + "logps_train/chosen": -80.9888916015625, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -66.10010528564453, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.010486461222171783, + "rewards_train/margins": 0.09432516247034073, + "rewards_train/rejected": -0.08383870124816895, + "step": 47 + }, + { + "epoch": 0.06, + "learning_rate": 1.5894039735099338e-07, + "loss": 0.6786, + "step": 48 + }, + { + "epoch": 0.06, + "logps_train/chosen": -67.37806701660156, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -53.611351013183594, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.03563149273395538, + "rewards_train/margins": -0.11026449501514435, + "rewards_train/rejected": 0.14589598774909973, + "step": 48 + }, + { + "epoch": 0.07, + "logps_train/chosen": -72.42200469970703, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -81.93806457519531, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.1007942259311676, + "rewards_train/margins": -0.01948826014995575, + "rewards_train/rejected": -0.08130596578121185, + "step": 49 + }, + { + "epoch": 0.07, + "learning_rate": 1.6556291390728477e-07, + "loss": 0.7421, + "step": 50 + }, + { + "epoch": 0.07, + "logps_train/chosen": -74.96185302734375, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -91.39497375488281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.1663145124912262, + "rewards_train/margins": 0.037061452865600586, + "rewards_train/rejected": 0.1292530596256256, + "step": 50 + }, + { + "epoch": 0.07, + "logps_train/chosen": -25.62216567993164, + "logps_train/ref_chosen": -25.0, + "logps_train/ref_rejected": -31.5, + "logps_train/rejected": -33.801307678222656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06182580441236496, + "rewards_train/margins": 0.168891079723835, + "rewards_train/rejected": -0.23071688413619995, + "step": 51 + }, + { + "epoch": 0.07, + "learning_rate": 1.7218543046357613e-07, + "loss": 0.6492, + "step": 52 + }, + { + "epoch": 0.07, + "logps_train/chosen": -80.94408416748047, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -88.69902038574219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06887286901473999, + "rewards_train/margins": 0.1254931017756462, + "rewards_train/rejected": -0.05662023276090622, + "step": 52 + }, + { + "epoch": 0.07, + "logps_train/chosen": -70.8918685913086, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -85.67730712890625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.16321057081222534, + "rewards_train/margins": -0.11891760677099228, + "rewards_train/rejected": -0.04429296404123306, + "step": 53 + }, + { + "epoch": 0.07, + "learning_rate": 1.7880794701986754e-07, + "loss": 0.7014, + "step": 54 + }, + { + "epoch": 0.07, + "logps_train/chosen": -40.09686279296875, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -42.26985549926758, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.005779888480901718, + "rewards_train/margins": -0.07396034523844719, + "rewards_train/rejected": 0.06818045675754547, + "step": 54 + }, + { + "epoch": 0.07, + "logps_train/chosen": -55.24117660522461, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -71.84161376953125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.040524162352085114, + "rewards_train/margins": 0.029574863612651825, + "rewards_train/rejected": -0.07009902596473694, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 1.8543046357615893e-07, + "loss": 0.7135, + "step": 56 + }, + { + "epoch": 0.07, + "logps_train/chosen": -111.13737487792969, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -81.009033203125, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.030925631523132324, + "rewards_train/margins": -0.036272644996643066, + "rewards_train/rejected": 0.005347013473510742, + "step": 56 + }, + { + "epoch": 0.08, + "logps_train/chosen": -68.06702423095703, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -74.94760131835938, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.009703557938337326, + "rewards_train/margins": -0.05510689690709114, + "rewards_train/rejected": 0.06481045484542847, + "step": 57 + }, + { + "epoch": 0.08, + "learning_rate": 1.9205298013245034e-07, + "loss": 0.7229, + "step": 58 + }, + { + "epoch": 0.08, + "logps_train/chosen": -81.5542221069336, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -82.67008972167969, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.03855158016085625, + "rewards_train/margins": -0.01333947665989399, + "rewards_train/rejected": -0.025212103500962257, + "step": 58 + }, + { + "epoch": 0.08, + "logps_train/chosen": -62.39722442626953, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -59.491939544677734, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.09909740090370178, + "rewards_train/margins": -0.12177861109375954, + "rewards_train/rejected": 0.022681210190057755, + "step": 59 + }, + { + "epoch": 0.08, + "learning_rate": 1.986754966887417e-07, + "loss": 0.7326, + "step": 60 + }, + { + "epoch": 0.08, + "logps_train/chosen": -54.281272888183594, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -54.476112365722656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.032028935849666595, + "rewards_train/margins": -0.020848341286182404, + "rewards_train/rejected": 0.052877277135849, + "step": 60 + }, + { + "epoch": 0.08, + "logps_train/chosen": -58.266319274902344, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -63.26285171508789, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.1311240941286087, + "rewards_train/margins": -0.347710058093071, + "rewards_train/rejected": 0.21658596396446228, + "step": 61 + }, + { + "epoch": 0.08, + "learning_rate": 2.052980132450331e-07, + "loss": 0.808, + "step": 62 + }, + { + "epoch": 0.08, + "logps_train/chosen": -57.68476867675781, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -64.23268127441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11433573067188263, + "rewards_train/margins": 0.2719792574644089, + "rewards_train/rejected": -0.15764352679252625, + "step": 62 + }, + { + "epoch": 0.08, + "logps_train/chosen": -44.09886932373047, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -57.92987060546875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.11184025555849075, + "rewards_train/margins": -0.031353630125522614, + "rewards_train/rejected": -0.08048662543296814, + "step": 63 + }, + { + "epoch": 0.08, + "learning_rate": 2.119205298013245e-07, + "loss": 0.6448, + "step": 64 + }, + { + "epoch": 0.08, + "logps_train/chosen": -83.86918640136719, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -78.87030029296875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1119191125035286, + "rewards_train/margins": 0.15011154860258102, + "rewards_train/rejected": -0.2620306611061096, + "step": 64 + }, + { + "epoch": 0.09, + "logps_train/chosen": -44.80962371826172, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -58.183319091796875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.06494668126106262, + "rewards_train/margins": -0.02473984658718109, + "rewards_train/rejected": -0.04020683467388153, + "step": 65 + }, + { + "epoch": 0.09, + "learning_rate": 2.185430463576159e-07, + "loss": 0.6741, + "step": 66 + }, + { + "epoch": 0.09, + "logps_train/chosen": -72.55757904052734, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -69.10940551757812, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.12450781464576721, + "rewards_train/margins": -0.13700440526008606, + "rewards_train/rejected": 0.012496590614318848, + "step": 66 + }, + { + "epoch": 0.09, + "logps_train/chosen": -46.07583999633789, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -50.48297119140625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06383398920297623, + "rewards_train/margins": -0.010849431157112122, + "rewards_train/rejected": -0.052984558045864105, + "step": 67 + }, + { + "epoch": 0.09, + "learning_rate": 2.2516556291390728e-07, + "loss": 0.737, + "step": 68 + }, + { + "epoch": 0.09, + "logps_train/chosen": -52.24906539916992, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -53.30507278442383, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12490688264369965, + "rewards_train/margins": 0.043100327253341675, + "rewards_train/rejected": -0.16800720989704132, + "step": 68 + }, + { + "epoch": 0.09, + "logps_train/chosen": -79.39360046386719, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -74.55288696289062, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.08623476326465607, + "rewards_train/margins": 0.01436634361743927, + "rewards_train/rejected": -0.10060110688209534, + "step": 69 + }, + { + "epoch": 0.09, + "learning_rate": 2.3178807947019866e-07, + "loss": 0.6844, + "step": 70 + }, + { + "epoch": 0.09, + "logps_train/chosen": -91.63473510742188, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -92.17798614501953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.13925480842590332, + "rewards_train/margins": 0.10901263356208801, + "rewards_train/rejected": -0.24826744198799133, + "step": 70 + }, + { + "epoch": 0.09, + "logps_train/chosen": -63.37866973876953, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -70.65562438964844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08474203944206238, + "rewards_train/margins": 0.07769595086574554, + "rewards_train/rejected": -0.16243799030780792, + "step": 71 + }, + { + "epoch": 0.1, + "learning_rate": 2.3841059602649005e-07, + "loss": 0.6567, + "step": 72 + }, + { + "epoch": 0.1, + "logps_train/chosen": -71.5158462524414, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -117.028076171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.056227780878543854, + "rewards_train/margins": 0.11840992420911789, + "rewards_train/rejected": -0.062182143330574036, + "step": 72 + }, + { + "epoch": 0.1, + "logps_train/chosen": -45.88077926635742, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -43.769962310791016, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.08885923773050308, + "rewards_train/margins": -0.039987921714782715, + "rewards_train/rejected": -0.04887131601572037, + "step": 73 + }, + { + "epoch": 0.1, + "learning_rate": 2.4503311258278146e-07, + "loss": 0.6879, + "step": 74 + }, + { + "epoch": 0.1, + "logps_train/chosen": -52.970603942871094, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -47.509727478027344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.07206050306558609, + "rewards_train/margins": -0.07577532436698675, + "rewards_train/rejected": 0.0037148213014006615, + "step": 74 + }, + { + "epoch": 0.1, + "logps_train/chosen": -53.47748565673828, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -39.79557800292969, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.014936279505491257, + "rewards_train/margins": -0.033034609630703926, + "rewards_train/rejected": 0.01809833012521267, + "step": 75 + }, + { + "epoch": 0.1, + "learning_rate": 2.5165562913907285e-07, + "loss": 0.7242, + "step": 76 + }, + { + "epoch": 0.1, + "logps_train/chosen": -65.38147735595703, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -62.105567932128906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.010022591799497604, + "rewards_train/margins": -0.05102801322937012, + "rewards_train/rejected": 0.04100542142987251, + "step": 76 + }, + { + "epoch": 0.1, + "logps_train/chosen": -87.7823257446289, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -114.4891128540039, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.0835060402750969, + "rewards_train/margins": -0.020532339811325073, + "rewards_train/rejected": -0.06297370046377182, + "step": 77 + }, + { + "epoch": 0.1, + "learning_rate": 2.5827814569536424e-07, + "loss": 0.7218, + "step": 78 + }, + { + "epoch": 0.1, + "logps_train/chosen": -39.83503341674805, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -45.94615936279297, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.1506907194852829, + "rewards_train/margins": -0.07013702392578125, + "rewards_train/rejected": -0.08055369555950165, + "step": 78 + }, + { + "epoch": 0.1, + "logps_train/chosen": -48.3602294921875, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -48.82488250732422, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.12469467520713806, + "rewards_train/margins": -0.06408148258924484, + "rewards_train/rejected": -0.06061319261789322, + "step": 79 + }, + { + "epoch": 0.11, + "learning_rate": 2.649006622516556e-07, + "loss": 0.7301, + "step": 80 + }, + { + "epoch": 0.11, + "logps_train/chosen": -57.425209045410156, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -65.28491973876953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.04888518899679184, + "rewards_train/margins": 0.09300201013684273, + "rewards_train/rejected": -0.04411682114005089, + "step": 80 + }, + { + "epoch": 0.11, + "logps_train/chosen": -62.06060028076172, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -76.48416137695312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.09512293338775635, + "rewards_train/margins": 0.05173105001449585, + "rewards_train/rejected": -0.1468539834022522, + "step": 81 + }, + { + "epoch": 0.11, + "learning_rate": 2.71523178807947e-07, + "loss": 0.6648, + "step": 82 + }, + { + "epoch": 0.11, + "logps_train/chosen": -70.86492919921875, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -55.90704345703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.10055575519800186, + "rewards_train/margins": 0.17217954248189926, + "rewards_train/rejected": -0.2727352976799011, + "step": 82 + }, + { + "epoch": 0.11, + "logps_train/chosen": -95.26240539550781, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -121.41331481933594, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.15436625480651855, + "rewards_train/margins": -0.12553440034389496, + "rewards_train/rejected": -0.028831854462623596, + "step": 83 + }, + { + "epoch": 0.11, + "learning_rate": 2.781456953642384e-07, + "loss": 0.6969, + "step": 84 + }, + { + "epoch": 0.11, + "logps_train/chosen": -68.42686462402344, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -66.47966766357422, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.07549843937158585, + "rewards_train/margins": -0.163469098508358, + "rewards_train/rejected": 0.08797065913677216, + "step": 84 + }, + { + "epoch": 0.11, + "logps_train/chosen": -41.177154541015625, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -36.356689453125, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.038027845323085785, + "rewards_train/margins": -0.009780626744031906, + "rewards_train/rejected": -0.02824721857905388, + "step": 85 + }, + { + "epoch": 0.11, + "learning_rate": 2.847682119205298e-07, + "loss": 0.7441, + "step": 86 + }, + { + "epoch": 0.11, + "logps_train/chosen": -57.412818908691406, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -62.96641540527344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.15053001046180725, + "rewards_train/margins": 0.15654639713466167, + "rewards_train/rejected": -0.0060163866728544235, + "step": 86 + }, + { + "epoch": 0.12, + "logps_train/chosen": -56.833213806152344, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -68.21234893798828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.05027265101671219, + "rewards_train/margins": 0.034007830545306206, + "rewards_train/rejected": 0.016264820471405983, + "step": 87 + }, + { + "epoch": 0.12, + "learning_rate": 2.9139072847682117e-07, + "loss": 0.6559, + "step": 88 + }, + { + "epoch": 0.12, + "logps_train/chosen": -76.35112762451172, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -61.98542022705078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.014800550416111946, + "rewards_train/margins": 0.17749163322150707, + "rewards_train/rejected": -0.19229218363761902, + "step": 88 + }, + { + "epoch": 0.12, + "logps_train/chosen": -49.15941619873047, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -47.03160858154297, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.03679283335804939, + "rewards_train/margins": 0.07862583175301552, + "rewards_train/rejected": -0.041832998394966125, + "step": 89 + }, + { + "epoch": 0.12, + "learning_rate": 2.980132450331126e-07, + "loss": 0.6457, + "step": 90 + }, + { + "epoch": 0.12, + "logps_train/chosen": -63.048431396484375, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -58.56714630126953, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.21578097343444824, + "rewards_train/margins": 0.09093338251113892, + "rewards_train/rejected": -0.30671435594558716, + "step": 90 + }, + { + "epoch": 0.12, + "logps_train/chosen": -46.299842834472656, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -35.97846984863281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.050296660512685776, + "rewards_train/margins": -0.036824654787778854, + "rewards_train/rejected": -0.013472005724906921, + "step": 91 + }, + { + "epoch": 0.12, + "learning_rate": 3.0463576158940394e-07, + "loss": 0.6899, + "step": 92 + }, + { + "epoch": 0.12, + "logps_train/chosen": -43.684913635253906, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -49.776309967041016, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03724118322134018, + "rewards_train/margins": -0.13539118319749832, + "rewards_train/rejected": 0.09814999997615814, + "step": 92 + }, + { + "epoch": 0.12, + "logps_train/chosen": -92.63056945800781, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -108.69706726074219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.26770493388175964, + "rewards_train/margins": 0.3069436140358448, + "rewards_train/rejected": -0.03923868015408516, + "step": 93 + }, + { + "epoch": 0.12, + "learning_rate": 3.1125827814569533e-07, + "loss": 0.6728, + "step": 94 + }, + { + "epoch": 0.12, + "logps_train/chosen": -64.66177368164062, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -74.56458282470703, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2021145224571228, + "rewards_train/margins": -0.059718817472457886, + "rewards_train/rejected": -0.14239570498466492, + "step": 94 + }, + { + "epoch": 0.13, + "logps_train/chosen": -48.30315399169922, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -68.46471405029297, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03843483328819275, + "rewards_train/margins": 0.07865602150559425, + "rewards_train/rejected": -0.040221188217401505, + "step": 95 + }, + { + "epoch": 0.13, + "learning_rate": 3.1788079470198677e-07, + "loss": 0.7181, + "step": 96 + }, + { + "epoch": 0.13, + "logps_train/chosen": -91.6103286743164, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -82.10276794433594, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.12275154143571854, + "rewards_train/margins": -0.2640368416905403, + "rewards_train/rejected": 0.14128530025482178, + "step": 96 + }, + { + "epoch": 0.13, + "logps_train/chosen": -76.30892944335938, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -74.22945404052734, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.012143261730670929, + "rewards_train/margins": -0.05716678127646446, + "rewards_train/rejected": 0.04502351954579353, + "step": 97 + }, + { + "epoch": 0.13, + "learning_rate": 3.245033112582781e-07, + "loss": 0.7964, + "step": 98 + }, + { + "epoch": 0.13, + "logps_train/chosen": -51.50962448120117, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -78.12258911132812, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.08299360424280167, + "rewards_train/margins": -0.08714120648801327, + "rewards_train/rejected": 0.004147602245211601, + "step": 98 + }, + { + "epoch": 0.13, + "logps_train/chosen": -61.6601448059082, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -55.41648864746094, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.061329349875450134, + "rewards_train/margins": -0.034521862864494324, + "rewards_train/rejected": 0.09585121273994446, + "step": 99 + }, + { + "epoch": 0.13, + "learning_rate": 3.3112582781456954e-07, + "loss": 0.7289, + "step": 100 + }, + { + "epoch": 0.13, + "logps_train/chosen": -67.82522583007812, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -52.624847412109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11611630767583847, + "rewards_train/margins": 0.20339957624673843, + "rewards_train/rejected": -0.3195158839225769, + "step": 100 + }, + { + "epoch": 0.13, + "logps_train/chosen": -112.18073272705078, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -105.205078125, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.08994840830564499, + "rewards_train/margins": -0.010066166520118713, + "rewards_train/rejected": -0.07988224178552628, + "step": 101 + }, + { + "epoch": 0.14, + "learning_rate": 3.3774834437086093e-07, + "loss": 0.6666, + "step": 102 + }, + { + "epoch": 0.14, + "logps_train/chosen": -61.86896514892578, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -78.6822509765625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.15564638376235962, + "rewards_train/margins": -0.11242171376943588, + "rewards_train/rejected": -0.04322466999292374, + "step": 102 + }, + { + "epoch": 0.14, + "logps_train/chosen": -50.12421798706055, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -55.02149963378906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.1355273276567459, + "rewards_train/margins": -0.05919753015041351, + "rewards_train/rejected": 0.19472485780715942, + "step": 103 + }, + { + "epoch": 0.14, + "learning_rate": 3.4437086092715226e-07, + "loss": 0.7486, + "step": 104 + }, + { + "epoch": 0.14, + "logps_train/chosen": -75.27784729003906, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -101.50634765625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.173777773976326, + "rewards_train/margins": 0.15253830701112747, + "rewards_train/rejected": 0.021239466965198517, + "step": 104 + }, + { + "epoch": 0.14, + "logps_train/chosen": -107.19461059570312, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -72.81076049804688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.15149196982383728, + "rewards_train/margins": -0.021587401628494263, + "rewards_train/rejected": -0.12990456819534302, + "step": 105 + }, + { + "epoch": 0.14, + "learning_rate": 3.509933774834437e-07, + "loss": 0.6806, + "step": 106 + }, + { + "epoch": 0.14, + "logps_train/chosen": -81.36860656738281, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -64.16646575927734, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0951707661151886, + "rewards_train/margins": 0.016504988074302673, + "rewards_train/rejected": 0.07866577804088593, + "step": 106 + }, + { + "epoch": 0.14, + "logps_train/chosen": -37.63343048095703, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -36.02953338623047, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.003844795748591423, + "rewards_train/margins": -0.07054592855274677, + "rewards_train/rejected": 0.0743907243013382, + "step": 107 + }, + { + "epoch": 0.14, + "learning_rate": 3.576158940397351e-07, + "loss": 0.7129, + "step": 108 + }, + { + "epoch": 0.14, + "logps_train/chosen": -44.51172637939453, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -51.17156982421875, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.09570383280515671, + "rewards_train/margins": 0.08864045888185501, + "rewards_train/rejected": -0.18434429168701172, + "step": 108 + }, + { + "epoch": 0.14, + "logps_train/chosen": -51.38105392456055, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -53.045955657958984, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.08738280832767487, + "rewards_train/margins": 0.03963473439216614, + "rewards_train/rejected": 0.04774807393550873, + "step": 109 + }, + { + "epoch": 0.15, + "learning_rate": 3.642384105960264e-07, + "loss": 0.6678, + "step": 110 + }, + { + "epoch": 0.15, + "logps_train/chosen": -49.302364349365234, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -58.857460021972656, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.07320532202720642, + "rewards_train/margins": 0.07660304009914398, + "rewards_train/rejected": -0.1498083621263504, + "step": 110 + }, + { + "epoch": 0.15, + "logps_train/chosen": -72.46064758300781, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -66.34900665283203, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.15856443345546722, + "rewards_train/margins": -0.06116417050361633, + "rewards_train/rejected": -0.09740026295185089, + "step": 111 + }, + { + "epoch": 0.15, + "learning_rate": 3.7086092715231786e-07, + "loss": 0.6988, + "step": 112 + }, + { + "epoch": 0.15, + "logps_train/chosen": -71.7870101928711, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -82.71179962158203, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.03956109285354614, + "rewards_train/margins": 0.056053757667541504, + "rewards_train/rejected": -0.01649266481399536, + "step": 112 + }, + { + "epoch": 0.15, + "logps_train/chosen": -72.26091003417969, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -57.19587707519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0692216157913208, + "rewards_train/margins": 0.19974707067012787, + "rewards_train/rejected": -0.13052545487880707, + "step": 113 + }, + { + "epoch": 0.15, + "learning_rate": 3.7748344370860925e-07, + "loss": 0.6416, + "step": 114 + }, + { + "epoch": 0.15, + "logps_train/chosen": -48.94877624511719, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -42.96794891357422, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.007377296686172485, + "rewards_train/margins": -0.02152006048709154, + "rewards_train/rejected": 0.014142763800919056, + "step": 114 + }, + { + "epoch": 0.15, + "logps_train/chosen": -118.05181884765625, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -110.0438232421875, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": 0.00106887798756361, + "rewards_train/margins": -0.07892351318150759, + "rewards_train/rejected": 0.0799923911690712, + "step": 115 + }, + { + "epoch": 0.15, + "learning_rate": 3.841059602649007e-07, + "loss": 0.7267, + "step": 116 + }, + { + "epoch": 0.15, + "logps_train/chosen": -58.02381896972656, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -64.1988754272461, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.10494227707386017, + "rewards_train/margins": 0.084204763174057, + "rewards_train/rejected": 0.02073751389980316, + "step": 116 + }, + { + "epoch": 0.16, + "logps_train/chosen": -73.90216064453125, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -90.1214599609375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.17615409195423126, + "rewards_train/margins": -0.17650871723890305, + "rewards_train/rejected": 0.00035462528467178345, + "step": 117 + }, + { + "epoch": 0.16, + "learning_rate": 3.90728476821192e-07, + "loss": 0.7343, + "step": 118 + }, + { + "epoch": 0.16, + "logps_train/chosen": -51.456459045410156, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -63.093505859375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0026772618293762207, + "rewards_train/margins": 0.1941731870174408, + "rewards_train/rejected": -0.19685044884681702, + "step": 118 + }, + { + "epoch": 0.16, + "logps_train/chosen": -53.95072937011719, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -51.750728607177734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05492675304412842, + "rewards_train/margins": 0.017499543726444244, + "rewards_train/rejected": 0.037427209317684174, + "step": 119 + }, + { + "epoch": 0.16, + "learning_rate": 3.973509933774834e-07, + "loss": 0.6564, + "step": 120 + }, + { + "epoch": 0.16, + "logps_train/chosen": -95.12608337402344, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -95.79153442382812, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.16942286491394043, + "rewards_train/margins": 0.1782645247876644, + "rewards_train/rejected": -0.008841659873723984, + "step": 120 + }, + { + "epoch": 0.16, + "logps_train/chosen": -65.90806579589844, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -72.44938659667969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06033756583929062, + "rewards_train/margins": 0.05335157364606857, + "rewards_train/rejected": -0.11368913948535919, + "step": 121 + }, + { + "epoch": 0.16, + "learning_rate": 4.0397350993377485e-07, + "loss": 0.6521, + "step": 122 + }, + { + "epoch": 0.16, + "logps_train/chosen": -66.84635162353516, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -52.49150085449219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.022135093808174133, + "rewards_train/margins": 0.12310890853404999, + "rewards_train/rejected": -0.14524400234222412, + "step": 122 + }, + { + "epoch": 0.16, + "logps_train/chosen": -53.78950500488281, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -67.31559753417969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.04917469620704651, + "rewards_train/margins": 0.1572965681552887, + "rewards_train/rejected": -0.10812187194824219, + "step": 123 + }, + { + "epoch": 0.16, + "learning_rate": 4.105960264900662e-07, + "loss": 0.6372, + "step": 124 + }, + { + "epoch": 0.16, + "logps_train/chosen": -61.14987564086914, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -56.155784606933594, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.04779965430498123, + "rewards_train/margins": 0.0021537616848945618, + "rewards_train/rejected": -0.04995341598987579, + "step": 124 + }, + { + "epoch": 0.17, + "logps_train/chosen": -40.7641487121582, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -48.47035217285156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.019383713603019714, + "rewards_train/margins": 0.0010889321565628052, + "rewards_train/rejected": -0.02047264575958252, + "step": 125 + }, + { + "epoch": 0.17, + "learning_rate": 4.172185430463576e-07, + "loss": 0.699, + "step": 126 + }, + { + "epoch": 0.17, + "logps_train/chosen": -57.5948371887207, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -62.32328796386719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.042078785598278046, + "rewards_train/margins": 0.041594890877604485, + "rewards_train/rejected": 0.0004838947206735611, + "step": 126 + }, + { + "epoch": 0.17, + "logps_train/chosen": -71.44166564941406, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -67.69255828857422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.19489586353302002, + "rewards_train/margins": 0.13602691888809204, + "rewards_train/rejected": 0.05886894464492798, + "step": 127 + }, + { + "epoch": 0.17, + "learning_rate": 4.23841059602649e-07, + "loss": 0.6663, + "step": 128 + }, + { + "epoch": 0.17, + "logps_train/chosen": -78.86708068847656, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -81.16221618652344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.013291553594172001, + "rewards_train/margins": -0.024784128181636333, + "rewards_train/rejected": 0.038075681775808334, + "step": 128 + }, + { + "epoch": 0.17, + "logps_train/chosen": -57.14472961425781, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -58.5099983215332, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.14333954453468323, + "rewards_train/margins": 0.1412142775952816, + "rewards_train/rejected": 0.0021252669394016266, + "step": 129 + }, + { + "epoch": 0.17, + "learning_rate": 4.3046357615894034e-07, + "loss": 0.6866, + "step": 130 + }, + { + "epoch": 0.17, + "logps_train/chosen": -64.92575073242188, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -63.77936553955078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.07930011302232742, + "rewards_train/margins": 0.21036160737276077, + "rewards_train/rejected": -0.13106149435043335, + "step": 130 + }, + { + "epoch": 0.17, + "logps_train/chosen": -70.21444702148438, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -77.1873779296875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1667574793100357, + "rewards_train/margins": -0.016770094633102417, + "rewards_train/rejected": -0.1499873846769333, + "step": 131 + }, + { + "epoch": 0.18, + "learning_rate": 4.370860927152318e-07, + "loss": 0.6749, + "step": 132 + }, + { + "epoch": 0.18, + "logps_train/chosen": -87.6690673828125, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -94.22894287109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09090535342693329, + "rewards_train/margins": 0.2075498104095459, + "rewards_train/rejected": -0.11664445698261261, + "step": 132 + }, + { + "epoch": 0.18, + "logps_train/chosen": -82.33617401123047, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.46888732910156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.04333607479929924, + "rewards_train/margins": -0.013290923088788986, + "rewards_train/rejected": 0.056626997888088226, + "step": 133 + }, + { + "epoch": 0.18, + "learning_rate": 4.4370860927152317e-07, + "loss": 0.6607, + "step": 134 + }, + { + "epoch": 0.18, + "logps_train/chosen": -75.47453308105469, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -92.07035064697266, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.03692217171192169, + "rewards_train/margins": 0.08458223938941956, + "rewards_train/rejected": -0.047660067677497864, + "step": 134 + }, + { + "epoch": 0.18, + "logps_train/chosen": -54.34320831298828, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -54.20006561279297, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.09146048873662949, + "rewards_train/margins": 0.048185981810092926, + "rewards_train/rejected": 0.04327450692653656, + "step": 135 + }, + { + "epoch": 0.18, + "learning_rate": 4.5033112582781455e-07, + "loss": 0.6764, + "step": 136 + }, + { + "epoch": 0.18, + "logps_train/chosen": -70.4914321899414, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -66.65330505371094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1680445671081543, + "rewards_train/margins": 0.2169685736298561, + "rewards_train/rejected": -0.04892400652170181, + "step": 136 + }, + { + "epoch": 0.18, + "logps_train/chosen": -100.63397216796875, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -79.61186218261719, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.04519620165228844, + "rewards_train/margins": 0.07474162802100182, + "rewards_train/rejected": -0.02954542636871338, + "step": 137 + }, + { + "epoch": 0.18, + "learning_rate": 4.5695364238410594e-07, + "loss": 0.6578, + "step": 138 + }, + { + "epoch": 0.18, + "logps_train/chosen": -88.49551391601562, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -88.72273254394531, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.04419870302081108, + "rewards_train/margins": -0.0772779993712902, + "rewards_train/rejected": 0.12147670239210129, + "step": 138 + }, + { + "epoch": 0.18, + "logps_train/chosen": -56.23085021972656, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -57.55557632446289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.005040287971496582, + "rewards_train/margins": 0.28872281312942505, + "rewards_train/rejected": -0.28368252515792847, + "step": 139 + }, + { + "epoch": 0.19, + "learning_rate": 4.635761589403973e-07, + "loss": 0.6585, + "step": 140 + }, + { + "epoch": 0.19, + "logps_train/chosen": -95.6607437133789, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -83.00914764404297, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.14154255390167236, + "rewards_train/margins": 0.18151971325278282, + "rewards_train/rejected": -0.03997715935111046, + "step": 140 + }, + { + "epoch": 0.19, + "logps_train/chosen": -83.20655059814453, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -67.08049011230469, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.005029797554016113, + "rewards_train/margins": -0.08760550618171692, + "rewards_train/rejected": 0.0825757086277008, + "step": 141 + }, + { + "epoch": 0.19, + "learning_rate": 4.701986754966887e-07, + "loss": 0.7011, + "step": 142 + }, + { + "epoch": 0.19, + "logps_train/chosen": -58.488731384277344, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -64.4619369506836, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.13081428408622742, + "rewards_train/margins": 0.18716440349817276, + "rewards_train/rejected": -0.05635011941194534, + "step": 142 + }, + { + "epoch": 0.19, + "logps_train/chosen": -44.711212158203125, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -55.97105407714844, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.12073065340518951, + "rewards_train/margins": -0.0017506703734397888, + "rewards_train/rejected": -0.11897998303174973, + "step": 143 + }, + { + "epoch": 0.19, + "learning_rate": 4.768211920529801e-07, + "loss": 0.6712, + "step": 144 + }, + { + "epoch": 0.19, + "logps_train/chosen": -61.643856048583984, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -61.83428192138672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.16862204670906067, + "rewards_train/margins": 0.08330032229423523, + "rewards_train/rejected": 0.08532172441482544, + "step": 144 + }, + { + "epoch": 0.19, + "logps_train/chosen": -49.488956451416016, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -42.87102508544922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06829171627759933, + "rewards_train/margins": 0.03742557391524315, + "rewards_train/rejected": 0.030866142362356186, + "step": 145 + }, + { + "epoch": 0.19, + "learning_rate": 4.834437086092715e-07, + "loss": 0.6815, + "step": 146 + }, + { + "epoch": 0.19, + "logps_train/chosen": -75.34461975097656, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -77.51017761230469, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0483504980802536, + "rewards_train/margins": 0.047025082632899284, + "rewards_train/rejected": 0.0013254154473543167, + "step": 146 + }, + { + "epoch": 0.2, + "logps_train/chosen": -82.11029052734375, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -86.05786895751953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.06084573641419411, + "rewards_train/margins": 0.4760078825056553, + "rewards_train/rejected": -0.4151621460914612, + "step": 147 + }, + { + "epoch": 0.2, + "learning_rate": 4.900662251655629e-07, + "loss": 0.5963, + "step": 148 + }, + { + "epoch": 0.2, + "logps_train/chosen": -78.65860748291016, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -69.48919677734375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.07289192080497742, + "rewards_train/margins": 0.09243376553058624, + "rewards_train/rejected": -0.16532568633556366, + "step": 148 + }, + { + "epoch": 0.2, + "logps_train/chosen": -86.75151062011719, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -82.36729431152344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.1904740333557129, + "rewards_train/margins": 0.1213437020778656, + "rewards_train/rejected": 0.06913033127784729, + "step": 149 + }, + { + "epoch": 0.2, + "learning_rate": 4.966887417218543e-07, + "loss": 0.6674, + "step": 150 + }, + { + "epoch": 0.2, + "logps_train/chosen": -61.20390701293945, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -40.24456024169922, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.18273454904556274, + "rewards_train/margins": 0.16109668835997581, + "rewards_train/rejected": 0.02163786068558693, + "step": 150 + }, + { + "epoch": 0.2, + "logps_train/chosen": -45.33393096923828, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -49.1523323059082, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.023637887090444565, + "rewards_train/margins": 0.15449609979987144, + "rewards_train/rejected": -0.13085821270942688, + "step": 151 + }, + { + "epoch": 0.2, + "learning_rate": 4.999998488565838e-07, + "loss": 0.6326, + "step": 152 + }, + { + "epoch": 0.2, + "logps_train/chosen": -67.12495422363281, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -90.41917419433594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.14062972366809845, + "rewards_train/margins": 0.3200472891330719, + "rewards_train/rejected": -0.17941756546497345, + "step": 152 + }, + { + "epoch": 0.2, + "logps_train/chosen": -35.22588348388672, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -43.22602081298828, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.06067415326833725, + "rewards_train/margins": -0.0028182752430438995, + "rewards_train/rejected": -0.05785587802529335, + "step": 153 + }, + { + "epoch": 0.2, + "learning_rate": 4.999986397103514e-07, + "loss": 0.6345, + "step": 154 + }, + { + "epoch": 0.2, + "logps_train/chosen": -53.85517883300781, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -63.273860931396484, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.254268079996109, + "rewards_train/margins": -0.017507046461105347, + "rewards_train/rejected": -0.23676103353500366, + "step": 154 + }, + { + "epoch": 0.21, + "logps_train/chosen": -54.74589920043945, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -59.33473205566406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1675974428653717, + "rewards_train/margins": 0.11825825273990631, + "rewards_train/rejected": 0.04933919012546539, + "step": 155 + }, + { + "epoch": 0.21, + "learning_rate": 4.999962214237345e-07, + "loss": 0.6804, + "step": 156 + }, + { + "epoch": 0.21, + "logps_train/chosen": -44.9425048828125, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -52.465545654296875, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.2215944528579712, + "rewards_train/margins": -0.13910240679979324, + "rewards_train/rejected": -0.08249204605817795, + "step": 156 + }, + { + "epoch": 0.21, + "logps_train/chosen": -76.8110580444336, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -85.73584747314453, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.007668402045965195, + "rewards_train/margins": -0.08720861002802849, + "rewards_train/rejected": 0.0795402079820633, + "step": 157 + }, + { + "epoch": 0.21, + "learning_rate": 4.999925940084296e-07, + "loss": 0.7699, + "step": 158 + }, + { + "epoch": 0.21, + "logps_train/chosen": -45.07207489013672, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -55.9972038269043, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.08810484409332275, + "rewards_train/margins": 0.3167314976453781, + "rewards_train/rejected": -0.22862665355205536, + "step": 158 + }, + { + "epoch": 0.21, + "logps_train/chosen": -91.51863861083984, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -108.4189453125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.14701326191425323, + "rewards_train/margins": 0.09125164151191711, + "rewards_train/rejected": 0.05576162040233612, + "step": 159 + }, + { + "epoch": 0.21, + "learning_rate": 4.999877574819808e-07, + "loss": 0.6321, + "step": 160 + }, + { + "epoch": 0.21, + "logps_train/chosen": -64.62169647216797, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -64.21802520751953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.3722056746482849, + "rewards_train/margins": 0.3815084397792816, + "rewards_train/rejected": -0.009302765130996704, + "step": 160 + }, + { + "epoch": 0.21, + "logps_train/chosen": -48.48457336425781, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -55.74340057373047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0968550443649292, + "rewards_train/margins": 0.12275736406445503, + "rewards_train/rejected": -0.025902319699525833, + "step": 161 + }, + { + "epoch": 0.22, + "learning_rate": 4.999817118677806e-07, + "loss": 0.6014, + "step": 162 + }, + { + "epoch": 0.22, + "logps_train/chosen": -40.219757080078125, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -55.90486145019531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.04697583243250847, + "rewards_train/margins": 0.09233813360333443, + "rewards_train/rejected": -0.1393139660358429, + "step": 162 + }, + { + "epoch": 0.22, + "logps_train/chosen": -53.66594696044922, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -57.26887512207031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.0791085734963417, + "rewards_train/margins": -0.001816391944885254, + "rewards_train/rejected": 0.08092496544122696, + "step": 163 + }, + { + "epoch": 0.22, + "learning_rate": 4.999744571950691e-07, + "loss": 0.6981, + "step": 164 + }, + { + "epoch": 0.22, + "logps_train/chosen": -55.405059814453125, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -58.94813919067383, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1326933205127716, + "rewards_train/margins": 0.18868306279182434, + "rewards_train/rejected": -0.32137638330459595, + "step": 164 + }, + { + "epoch": 0.22, + "logps_train/chosen": -62.670623779296875, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -56.03486251831055, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06935718655586243, + "rewards_train/margins": 0.06083807349205017, + "rewards_train/rejected": -0.1301952600479126, + "step": 165 + }, + { + "epoch": 0.22, + "learning_rate": 4.99965993498934e-07, + "loss": 0.6793, + "step": 166 + }, + { + "epoch": 0.22, + "logps_train/chosen": -55.48243713378906, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -57.08064651489258, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.08691291511058807, + "rewards_train/margins": 0.2840401232242584, + "rewards_train/rejected": -0.19712720811367035, + "step": 166 + }, + { + "epoch": 0.22, + "logps_train/chosen": -63.22955322265625, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -72.77273559570312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1098572239279747, + "rewards_train/margins": 0.20783356577157974, + "rewards_train/rejected": -0.09797634184360504, + "step": 167 + }, + { + "epoch": 0.22, + "learning_rate": 4.999563208203109e-07, + "loss": 0.6387, + "step": 168 + }, + { + "epoch": 0.22, + "logps_train/chosen": -48.420230865478516, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -50.79782485961914, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0876644104719162, + "rewards_train/margins": 0.45729057490825653, + "rewards_train/rejected": -0.36962616443634033, + "step": 168 + }, + { + "epoch": 0.22, + "logps_train/chosen": -60.79928970336914, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -60.372554779052734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.16382136940956116, + "rewards_train/margins": 0.6014676988124847, + "rewards_train/rejected": -0.4376463294029236, + "step": 169 + }, + { + "epoch": 0.23, + "learning_rate": 4.999454392059824e-07, + "loss": 0.5151, + "step": 170 + }, + { + "epoch": 0.23, + "logps_train/chosen": -80.40921783447266, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -103.83680725097656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03154726326465607, + "rewards_train/margins": 0.05682013928890228, + "rewards_train/rejected": -0.08836740255355835, + "step": 170 + }, + { + "epoch": 0.23, + "logps_train/chosen": -65.59782409667969, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -65.05398559570312, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.019156955182552338, + "rewards_train/margins": 0.29854581505060196, + "rewards_train/rejected": -0.3177027702331543, + "step": 171 + }, + { + "epoch": 0.23, + "learning_rate": 4.999333487085786e-07, + "loss": 0.6326, + "step": 172 + }, + { + "epoch": 0.23, + "logps_train/chosen": -43.34238815307617, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -58.07159423828125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03938404098153114, + "rewards_train/margins": -0.035740336403250694, + "rewards_train/rejected": -0.003643704578280449, + "step": 172 + }, + { + "epoch": 0.23, + "logps_train/chosen": -51.10371398925781, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -56.03058624267578, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.132597416639328, + "rewards_train/margins": 0.44737502932548523, + "rewards_train/rejected": -0.3147776126861572, + "step": 173 + }, + { + "epoch": 0.23, + "learning_rate": 4.999200493865761e-07, + "loss": 0.6503, + "step": 174 + }, + { + "epoch": 0.23, + "logps_train/chosen": -53.67700958251953, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -60.64039611816406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.11276771128177643, + "rewards_train/margins": 0.5689948946237564, + "rewards_train/rejected": -0.45622718334198, + "step": 174 + }, + { + "epoch": 0.23, + "logps_train/chosen": -71.80726623535156, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -65.7108154296875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.05989867448806763, + "rewards_train/margins": 0.2153552770614624, + "rewards_train/rejected": -0.15545660257339478, + "step": 175 + }, + { + "epoch": 0.23, + "learning_rate": 4.99905541304298e-07, + "loss": 0.5703, + "step": 176 + }, + { + "epoch": 0.23, + "logps_train/chosen": -54.552207946777344, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -64.19302368164062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.016654152423143387, + "rewards_train/margins": 0.3797064907848835, + "rewards_train/rejected": -0.3630523383617401, + "step": 176 + }, + { + "epoch": 0.24, + "logps_train/chosen": -39.572669982910156, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -45.44940185546875, + "rewards_train/accuracies": 0.125, + "rewards_train/chosen": -0.05570477247238159, + "rewards_train/margins": -0.10842059552669525, + "rewards_train/rejected": 0.05271582305431366, + "step": 177 + }, + { + "epoch": 0.24, + "learning_rate": 4.998898245319145e-07, + "loss": 0.6695, + "step": 178 + }, + { + "epoch": 0.24, + "logps_train/chosen": -77.93913269042969, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -88.1424560546875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.4013994336128235, + "rewards_train/margins": 0.6250197887420654, + "rewards_train/rejected": -0.22362035512924194, + "step": 178 + }, + { + "epoch": 0.24, + "logps_train/chosen": -40.6803092956543, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -49.181480407714844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.06295301765203476, + "rewards_train/margins": 0.1731637939810753, + "rewards_train/rejected": -0.23611681163311005, + "step": 179 + }, + { + "epoch": 0.24, + "learning_rate": 4.998728991454407e-07, + "loss": 0.5836, + "step": 180 + }, + { + "epoch": 0.24, + "logps_train/chosen": -54.1745491027832, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -42.93806457519531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.1278574913740158, + "rewards_train/margins": 0.2341638058423996, + "rewards_train/rejected": -0.10630631446838379, + "step": 180 + }, + { + "epoch": 0.24, + "logps_train/chosen": -42.657432556152344, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -49.74334716796875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0067177414894104, + "rewards_train/margins": 0.3238258957862854, + "rewards_train/rejected": -0.317108154296875, + "step": 181 + }, + { + "epoch": 0.24, + "learning_rate": 4.998547652267378e-07, + "loss": 0.6148, + "step": 182 + }, + { + "epoch": 0.24, + "logps_train/chosen": -77.77507019042969, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -122.24191284179688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3404620885848999, + "rewards_train/margins": 0.9459036588668823, + "rewards_train/rejected": -0.6054415702819824, + "step": 182 + }, + { + "epoch": 0.24, + "logps_train/chosen": -61.900638580322266, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -44.3643798828125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.007251359522342682, + "rewards_train/margins": 0.1948118731379509, + "rewards_train/rejected": -0.20206323266029358, + "step": 183 + }, + { + "epoch": 0.24, + "learning_rate": 4.998354228635121e-07, + "loss": 0.5137, + "step": 184 + }, + { + "epoch": 0.24, + "logps_train/chosen": -71.55892944335938, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -77.88035583496094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.15745575726032257, + "rewards_train/margins": -0.14442064613103867, + "rewards_train/rejected": -0.013035111129283905, + "step": 184 + }, + { + "epoch": 0.25, + "logps_train/chosen": -65.37760925292969, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -73.73092651367188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.02994891256093979, + "rewards_train/margins": 0.15876873582601547, + "rewards_train/rejected": -0.18871764838695526, + "step": 185 + }, + { + "epoch": 0.25, + "learning_rate": 4.998148721493147e-07, + "loss": 0.7105, + "step": 186 + }, + { + "epoch": 0.25, + "logps_train/chosen": -53.002296447753906, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -60.96434783935547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02633272111415863, + "rewards_train/margins": 0.3727674037218094, + "rewards_train/rejected": -0.34643468260765076, + "step": 186 + }, + { + "epoch": 0.25, + "logps_train/chosen": -53.66192626953125, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -50.88130569458008, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1213073655962944, + "rewards_train/margins": 0.1867815926671028, + "rewards_train/rejected": -0.06547422707080841, + "step": 187 + }, + { + "epoch": 0.25, + "learning_rate": 4.997931131835409e-07, + "loss": 0.6, + "step": 188 + }, + { + "epoch": 0.25, + "logps_train/chosen": -87.65532684326172, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -89.2316665649414, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.08290465176105499, + "rewards_train/margins": 0.29513366520404816, + "rewards_train/rejected": -0.21222901344299316, + "step": 188 + }, + { + "epoch": 0.25, + "logps_train/chosen": -60.90528869628906, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -64.6880874633789, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.11240382492542267, + "rewards_train/margins": 0.08921770751476288, + "rewards_train/rejected": -0.20162153244018555, + "step": 189 + }, + { + "epoch": 0.25, + "learning_rate": 4.997701460714298e-07, + "loss": 0.6243, + "step": 190 + }, + { + "epoch": 0.25, + "logps_train/chosen": -78.42573547363281, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -74.37881469726562, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.029301732778549194, + "rewards_train/margins": 0.12187006324529648, + "rewards_train/rejected": -0.09256833046674728, + "step": 190 + }, + { + "epoch": 0.25, + "logps_train/chosen": -39.57395553588867, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -43.86948776245117, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.042326103895902634, + "rewards_train/margins": 0.1289978213608265, + "rewards_train/rejected": -0.17132392525672913, + "step": 191 + }, + { + "epoch": 0.25, + "learning_rate": 4.99745970924064e-07, + "loss": 0.6525, + "step": 192 + }, + { + "epoch": 0.25, + "logps_train/chosen": -64.17208862304688, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -100.61174011230469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.19060376286506653, + "rewards_train/margins": 0.7814652025699615, + "rewards_train/rejected": -0.590861439704895, + "step": 192 + }, + { + "epoch": 0.26, + "logps_train/chosen": -35.9684944152832, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -43.29087829589844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.036549150943756104, + "rewards_train/margins": 0.3875119686126709, + "rewards_train/rejected": -0.3509628176689148, + "step": 193 + }, + { + "epoch": 0.26, + "learning_rate": 4.997205878583687e-07, + "loss": 0.4882, + "step": 194 + }, + { + "epoch": 0.26, + "logps_train/chosen": -73.85863494873047, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -52.19065856933594, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.414917916059494, + "rewards_train/margins": 0.3871087599545717, + "rewards_train/rejected": 0.027809156104922295, + "step": 194 + }, + { + "epoch": 0.26, + "logps_train/chosen": -59.111663818359375, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -80.7671127319336, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.07145082205533981, + "rewards_train/margins": 0.3192560598254204, + "rewards_train/rejected": -0.24780523777008057, + "step": 195 + }, + { + "epoch": 0.26, + "learning_rate": 4.996939969971111e-07, + "loss": 0.5863, + "step": 196 + }, + { + "epoch": 0.26, + "logps_train/chosen": -86.3590316772461, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -96.23582458496094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 3.456324338912964e-05, + "rewards_train/margins": 0.06424279510974884, + "rewards_train/rejected": -0.06420823186635971, + "step": 196 + }, + { + "epoch": 0.26, + "logps_train/chosen": -51.63418197631836, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -77.60122680664062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.07642568647861481, + "rewards_train/margins": 0.7146729379892349, + "rewards_train/rejected": -0.6382472515106201, + "step": 197 + }, + { + "epoch": 0.26, + "learning_rate": 4.996661984689006e-07, + "loss": 0.564, + "step": 198 + }, + { + "epoch": 0.26, + "logps_train/chosen": -67.92711639404297, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -68.38245391845703, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07161794602870941, + "rewards_train/margins": 0.09943979978561401, + "rewards_train/rejected": -0.17105774581432343, + "step": 198 + }, + { + "epoch": 0.26, + "logps_train/chosen": -84.33158874511719, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -85.19648742675781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0538620799779892, + "rewards_train/margins": 0.5251610428094864, + "rewards_train/rejected": -0.5790231227874756, + "step": 199 + }, + { + "epoch": 0.27, + "learning_rate": 4.996371924081868e-07, + "loss": 0.5895, + "step": 200 + }, + { + "epoch": 0.27, + "logps_train/chosen": -36.751338958740234, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -41.08473205566406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.12564730644226074, + "rewards_train/margins": 0.32513612508773804, + "rewards_train/rejected": -0.1994888186454773, + "step": 200 + }, + { + "epoch": 0.27, + "logps_train/chosen": -72.52227783203125, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -66.40202331542969, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.008477356284856796, + "rewards_train/margins": -0.02452481910586357, + "rewards_train/rejected": 0.016047462821006775, + "step": 201 + }, + { + "epoch": 0.27, + "learning_rate": 4.996069789552604e-07, + "loss": 0.656, + "step": 202 + }, + { + "epoch": 0.27, + "logps_train/chosen": -46.98569869995117, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -46.165889739990234, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.09349186718463898, + "rewards_train/margins": 0.23715956509113312, + "rewards_train/rejected": -0.3306514322757721, + "step": 202 + }, + { + "epoch": 0.27, + "logps_train/chosen": -61.59912872314453, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -78.4246597290039, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.00366289378143847, + "rewards_train/margins": 0.22942834137938917, + "rewards_train/rejected": -0.23309123516082764, + "step": 203 + }, + { + "epoch": 0.27, + "learning_rate": 4.995755582562513e-07, + "loss": 0.621, + "step": 204 + }, + { + "epoch": 0.27, + "logps_train/chosen": -68.05762481689453, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -74.42012023925781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11611265689134598, + "rewards_train/margins": 0.24953097850084305, + "rewards_train/rejected": -0.13341832160949707, + "step": 204 + }, + { + "epoch": 0.27, + "logps_train/chosen": -50.00814437866211, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -40.646636962890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28199827671051025, + "rewards_train/margins": 0.37869328260421753, + "rewards_train/rejected": -0.09669500589370728, + "step": 205 + }, + { + "epoch": 0.27, + "learning_rate": 4.995429304631284e-07, + "loss": 0.5676, + "step": 206 + }, + { + "epoch": 0.27, + "logps_train/chosen": -45.341514587402344, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -50.673377990722656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.09397386014461517, + "rewards_train/margins": 0.4449055641889572, + "rewards_train/rejected": -0.35093170404434204, + "step": 206 + }, + { + "epoch": 0.27, + "logps_train/chosen": -72.52935791015625, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -77.76852416992188, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.01815762370824814, + "rewards_train/margins": 0.38563456386327744, + "rewards_train/rejected": -0.3674769401550293, + "step": 207 + }, + { + "epoch": 0.28, + "learning_rate": 4.99509095733699e-07, + "loss": 0.5756, + "step": 208 + }, + { + "epoch": 0.28, + "logps_train/chosen": -39.61733627319336, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -40.18269348144531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03592272102832794, + "rewards_train/margins": 0.1682545244693756, + "rewards_train/rejected": -0.13233180344104767, + "step": 208 + }, + { + "epoch": 0.28, + "logps_train/chosen": -41.508949279785156, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -50.5323486328125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.11051559448242188, + "rewards_train/margins": 0.19812536239624023, + "rewards_train/rejected": -0.08760976791381836, + "step": 209 + }, + { + "epoch": 0.28, + "learning_rate": 4.994740542316077e-07, + "loss": 0.6436, + "step": 210 + }, + { + "epoch": 0.28, + "logps_train/chosen": -67.76651763916016, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -93.08468627929688, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.036026522517204285, + "rewards_train/margins": 0.17088010907173157, + "rewards_train/rejected": -0.20690663158893585, + "step": 210 + }, + { + "epoch": 0.28, + "logps_train/chosen": -89.65243530273438, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -86.19413757324219, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.33319321274757385, + "rewards_train/margins": 0.45416951179504395, + "rewards_train/rejected": -0.12097629904747009, + "step": 211 + }, + { + "epoch": 0.28, + "learning_rate": 4.994378061263359e-07, + "loss": 0.6295, + "step": 212 + }, + { + "epoch": 0.28, + "logps_train/chosen": -70.17023468017578, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -80.16148376464844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4048513174057007, + "rewards_train/margins": 0.744436502456665, + "rewards_train/rejected": -0.33958518505096436, + "step": 212 + }, + { + "epoch": 0.28, + "logps_train/chosen": -58.20573806762695, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -63.61360549926758, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.10377678275108337, + "rewards_train/margins": 0.41852131485939026, + "rewards_train/rejected": -0.5222980976104736, + "step": 213 + }, + { + "epoch": 0.28, + "learning_rate": 4.994003515932005e-07, + "loss": 0.537, + "step": 214 + }, + { + "epoch": 0.28, + "logps_train/chosen": -73.37883758544922, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -80.34822082519531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.13867858052253723, + "rewards_train/margins": 0.3086574822664261, + "rewards_train/rejected": -0.16997890174388885, + "step": 214 + }, + { + "epoch": 0.29, + "logps_train/chosen": -82.57935333251953, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -89.07049560546875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2420646846294403, + "rewards_train/margins": 0.5491144061088562, + "rewards_train/rejected": -0.3070497214794159, + "step": 215 + }, + { + "epoch": 0.29, + "learning_rate": 4.993616908133538e-07, + "loss": 0.5552, + "step": 216 + }, + { + "epoch": 0.29, + "logps_train/chosen": -62.509071350097656, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -47.232276916503906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1608111560344696, + "rewards_train/margins": 0.2496638000011444, + "rewards_train/rejected": -0.0888526439666748, + "step": 216 + }, + { + "epoch": 0.29, + "logps_train/chosen": -46.436553955078125, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -73.767578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.37353235483169556, + "rewards_train/margins": 0.3127901256084442, + "rewards_train/rejected": 0.06074222922325134, + "step": 217 + }, + { + "epoch": 0.29, + "learning_rate": 4.993218239737822e-07, + "loss": 0.6078, + "step": 218 + }, + { + "epoch": 0.29, + "logps_train/chosen": -76.15928649902344, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -64.4079360961914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.03311648219823837, + "rewards_train/margins": 0.32017726451158524, + "rewards_train/rejected": -0.3532937467098236, + "step": 218 + }, + { + "epoch": 0.29, + "logps_train/chosen": -43.111488342285156, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -37.24299240112305, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.026741839945316315, + "rewards_train/margins": 0.2276037111878395, + "rewards_train/rejected": -0.2008618712425232, + "step": 219 + }, + { + "epoch": 0.29, + "learning_rate": 4.992807512673049e-07, + "loss": 0.5998, + "step": 220 + }, + { + "epoch": 0.29, + "logps_train/chosen": -72.62298583984375, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -58.47601318359375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.15711729228496552, + "rewards_train/margins": 0.21350783109664917, + "rewards_train/rejected": -0.056390538811683655, + "step": 220 + }, + { + "epoch": 0.29, + "logps_train/chosen": -45.64373779296875, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -38.30986785888672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07413950562477112, + "rewards_train/margins": 0.038097307085990906, + "rewards_train/rejected": -0.11223681271076202, + "step": 221 + }, + { + "epoch": 0.29, + "learning_rate": 4.992384728925738e-07, + "loss": 0.6771, + "step": 222 + }, + { + "epoch": 0.29, + "logps_train/chosen": -81.40364074707031, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -101.76524353027344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.009114399552345276, + "rewards_train/margins": 0.7064722627401352, + "rewards_train/rejected": -0.7155866622924805, + "step": 222 + }, + { + "epoch": 0.3, + "logps_train/chosen": -40.5022087097168, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -62.6319580078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.05690810829401016, + "rewards_train/margins": 0.8615098968148232, + "rewards_train/rejected": -0.804601788520813, + "step": 223 + }, + { + "epoch": 0.3, + "learning_rate": 4.99194989054072e-07, + "loss": 0.449, + "step": 224 + }, + { + "epoch": 0.3, + "logps_train/chosen": -56.36979293823242, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -71.65719604492188, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.37395814061164856, + "rewards_train/margins": 0.5599899291992188, + "rewards_train/rejected": -0.1860317885875702, + "step": 224 + }, + { + "epoch": 0.3, + "logps_train/chosen": -79.16781616210938, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -100.56978607177734, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.2839689254760742, + "rewards_train/margins": 0.3089473247528076, + "rewards_train/rejected": -0.5929162502288818, + "step": 225 + }, + { + "epoch": 0.3, + "learning_rate": 4.991502999621128e-07, + "loss": 0.6277, + "step": 226 + }, + { + "epoch": 0.3, + "logps_train/chosen": -39.173728942871094, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -48.77214050292969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.15918953716754913, + "rewards_train/margins": 0.5731227844953537, + "rewards_train/rejected": -0.41393324732780457, + "step": 226 + }, + { + "epoch": 0.3, + "logps_train/chosen": -86.06292724609375, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -70.32135009765625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.45151931047439575, + "rewards_train/margins": 0.5352165624499321, + "rewards_train/rejected": -0.08369725197553635, + "step": 227 + }, + { + "epoch": 0.3, + "learning_rate": 4.99104405832839e-07, + "loss": 0.5527, + "step": 228 + }, + { + "epoch": 0.3, + "logps_train/chosen": -48.39280700683594, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -72.61219024658203, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03923491761088371, + "rewards_train/margins": 0.37076691910624504, + "rewards_train/rejected": -0.33153200149536133, + "step": 228 + }, + { + "epoch": 0.3, + "logps_train/chosen": -81.25634765625, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -80.35000610351562, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.10219761729240417, + "rewards_train/margins": 0.15233466029167175, + "rewards_train/rejected": -0.2545322775840759, + "step": 229 + }, + { + "epoch": 0.31, + "learning_rate": 4.990573068882216e-07, + "loss": 0.6863, + "step": 230 + }, + { + "epoch": 0.31, + "logps_train/chosen": -53.20347213745117, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -60.157066345214844, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.22669494152069092, + "rewards_train/margins": -0.049269452691078186, + "rewards_train/rejected": -0.17742548882961273, + "step": 230 + }, + { + "epoch": 0.31, + "logps_train/chosen": -68.158447265625, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -70.55867004394531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2888917326927185, + "rewards_train/margins": 0.277912974357605, + "rewards_train/rejected": -0.5668047070503235, + "step": 231 + }, + { + "epoch": 0.31, + "learning_rate": 4.990090033560585e-07, + "loss": 0.6917, + "step": 232 + }, + { + "epoch": 0.31, + "logps_train/chosen": -61.90306854248047, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -68.86416625976562, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.21413478255271912, + "rewards_train/margins": 0.2761881649494171, + "rewards_train/rejected": -0.49032294750213623, + "step": 232 + }, + { + "epoch": 0.31, + "logps_train/chosen": -76.01016235351562, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -77.12147521972656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.30835872888565063, + "rewards_train/margins": 0.8986305594444275, + "rewards_train/rejected": -0.5902718305587769, + "step": 233 + }, + { + "epoch": 0.31, + "learning_rate": 4.989594954699745e-07, + "loss": 0.5612, + "step": 234 + }, + { + "epoch": 0.31, + "logps_train/chosen": -110.30442810058594, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -100.64962005615234, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.07580772787332535, + "rewards_train/margins": 0.5724109187722206, + "rewards_train/rejected": -0.49660319089889526, + "step": 234 + }, + { + "epoch": 0.31, + "logps_train/chosen": -54.16011428833008, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -35.82278060913086, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.15663626790046692, + "rewards_train/margins": -0.0583425834774971, + "rewards_train/rejected": -0.09829368442296982, + "step": 235 + }, + { + "epoch": 0.31, + "learning_rate": 4.989087834694185e-07, + "loss": 0.6368, + "step": 236 + }, + { + "epoch": 0.31, + "logps_train/chosen": -70.70149993896484, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -89.41949462890625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.01468108594417572, + "rewards_train/margins": 0.6382056027650833, + "rewards_train/rejected": -0.652886688709259, + "step": 236 + }, + { + "epoch": 0.31, + "logps_train/chosen": -53.71183776855469, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -71.12318420410156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.19852736592292786, + "rewards_train/margins": 0.6309789717197418, + "rewards_train/rejected": -0.8295063376426697, + "step": 237 + }, + { + "epoch": 0.32, + "learning_rate": 4.988568675996635e-07, + "loss": 0.5057, + "step": 238 + }, + { + "epoch": 0.32, + "logps_train/chosen": -66.60150146484375, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -58.6766242980957, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.051555898040533066, + "rewards_train/margins": 0.053606484085321426, + "rewards_train/rejected": -0.10516238212585449, + "step": 238 + }, + { + "epoch": 0.32, + "logps_train/chosen": -42.52677536010742, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -57.406883239746094, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.08267395198345184, + "rewards_train/margins": 0.4117414802312851, + "rewards_train/rejected": -0.32906752824783325, + "step": 239 + }, + { + "epoch": 0.32, + "learning_rate": 4.988037481118053e-07, + "loss": 0.6346, + "step": 240 + }, + { + "epoch": 0.32, + "logps_train/chosen": -46.27587890625, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -54.25221252441406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11057731509208679, + "rewards_train/margins": 0.35708776116371155, + "rewards_train/rejected": -0.24651044607162476, + "step": 240 + }, + { + "epoch": 0.32, + "logps_train/chosen": -52.66162109375, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -57.842891693115234, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.30415022373199463, + "rewards_train/margins": 0.932189404964447, + "rewards_train/rejected": -0.6280391812324524, + "step": 241 + }, + { + "epoch": 0.32, + "learning_rate": 4.987494252627611e-07, + "loss": 0.5422, + "step": 242 + }, + { + "epoch": 0.32, + "logps_train/chosen": -72.02131652832031, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -69.80036163330078, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.39744386076927185, + "rewards_train/margins": -0.30568890273571014, + "rewards_train/rejected": -0.0917549580335617, + "step": 242 + }, + { + "epoch": 0.32, + "logps_train/chosen": -56.40485763549805, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -61.2404670715332, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07290759682655334, + "rewards_train/margins": 0.22550436854362488, + "rewards_train/rejected": -0.2984119653701782, + "step": 243 + }, + { + "epoch": 0.32, + "learning_rate": 4.986938993152679e-07, + "loss": 0.7547, + "step": 244 + }, + { + "epoch": 0.32, + "logps_train/chosen": -57.6363525390625, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -61.31168746948242, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06597905606031418, + "rewards_train/margins": 0.12378331273794174, + "rewards_train/rejected": -0.18976236879825592, + "step": 244 + }, + { + "epoch": 0.33, + "logps_train/chosen": -56.0382080078125, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -64.04031372070312, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.31024178862571716, + "rewards_train/margins": 0.8728669583797455, + "rewards_train/rejected": -0.5626251697540283, + "step": 245 + }, + { + "epoch": 0.33, + "learning_rate": 4.986371705378818e-07, + "loss": 0.5424, + "step": 246 + }, + { + "epoch": 0.33, + "logps_train/chosen": -64.49261474609375, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -68.74340057373047, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08363626897335052, + "rewards_train/margins": 0.5039849430322647, + "rewards_train/rejected": -0.5876212120056152, + "step": 246 + }, + { + "epoch": 0.33, + "logps_train/chosen": -70.82987976074219, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -86.32960510253906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.14044982194900513, + "rewards_train/margins": 0.8562231659889221, + "rewards_train/rejected": -0.715773344039917, + "step": 247 + }, + { + "epoch": 0.33, + "learning_rate": 4.985792392049767e-07, + "loss": 0.4757, + "step": 248 + }, + { + "epoch": 0.33, + "logps_train/chosen": -62.25496292114258, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -61.74409484863281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.1362227350473404, + "rewards_train/margins": 0.9793819934129715, + "rewards_train/rejected": -0.8431592583656311, + "step": 248 + }, + { + "epoch": 0.33, + "logps_train/chosen": -50.92721176147461, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -95.39331817626953, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.15102873742580414, + "rewards_train/margins": 0.6309859603643417, + "rewards_train/rejected": -0.4799572229385376, + "step": 249 + }, + { + "epoch": 0.33, + "learning_rate": 4.985201055967425e-07, + "loss": 0.5244, + "step": 250 + }, + { + "epoch": 0.33, + "logps_train/chosen": -89.32518005371094, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -103.10430908203125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.27295079827308655, + "rewards_train/margins": 0.9575996100902557, + "rewards_train/rejected": -0.6846488118171692, + "step": 250 + }, + { + "epoch": 0.33, + "logps_train/chosen": -68.58715057373047, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -62.4371337890625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2209724485874176, + "rewards_train/margins": 0.6537483632564545, + "rewards_train/rejected": -0.43277591466903687, + "step": 251 + }, + { + "epoch": 0.33, + "learning_rate": 4.98459769999184e-07, + "loss": 0.5427, + "step": 252 + }, + { + "epoch": 0.33, + "logps_train/chosen": -86.50833129882812, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -92.8584976196289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.09926999360322952, + "rewards_train/margins": 0.26158010214567184, + "rewards_train/rejected": -0.36085009574890137, + "step": 252 + }, + { + "epoch": 0.34, + "logps_train/chosen": -81.6760482788086, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -75.13443756103516, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.08869902789592743, + "rewards_train/margins": -0.03736482560634613, + "rewards_train/rejected": -0.0513342022895813, + "step": 253 + }, + { + "epoch": 0.34, + "learning_rate": 4.983982327041198e-07, + "loss": 0.6613, + "step": 254 + }, + { + "epoch": 0.34, + "logps_train/chosen": -56.557308197021484, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -91.56390380859375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3580141067504883, + "rewards_train/margins": 1.1722168922424316, + "rewards_train/rejected": -0.8142027854919434, + "step": 254 + }, + { + "epoch": 0.34, + "logps_train/chosen": -62.262168884277344, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -65.96533203125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.10434179753065109, + "rewards_train/margins": 0.20469141751527786, + "rewards_train/rejected": -0.30903321504592896, + "step": 255 + }, + { + "epoch": 0.34, + "learning_rate": 4.983354940091803e-07, + "loss": 0.5755, + "step": 256 + }, + { + "epoch": 0.34, + "logps_train/chosen": -49.22135543823242, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -73.50614929199219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.08072912693023682, + "rewards_train/margins": 1.0987917184829712, + "rewards_train/rejected": -1.179520845413208, + "step": 256 + }, + { + "epoch": 0.34, + "logps_train/chosen": -68.78033447265625, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -84.46699523925781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.14306002855300903, + "rewards_train/margins": 1.2370248436927795, + "rewards_train/rejected": -1.0939648151397705, + "step": 257 + }, + { + "epoch": 0.34, + "learning_rate": 4.982715542178068e-07, + "loss": 0.4, + "step": 258 + }, + { + "epoch": 0.34, + "logps_train/chosen": -65.43258666992188, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -61.63449478149414, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.36982113122940063, + "rewards_train/margins": 0.0022222399711608887, + "rewards_train/rejected": -0.3720433712005615, + "step": 258 + }, + { + "epoch": 0.34, + "logps_train/chosen": -37.54210662841797, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -42.75946807861328, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2721795439720154, + "rewards_train/margins": 0.24439257383346558, + "rewards_train/rejected": -0.516572117805481, + "step": 259 + }, + { + "epoch": 0.35, + "learning_rate": 4.982064136392495e-07, + "loss": 0.6831, + "step": 260 + }, + { + "epoch": 0.35, + "logps_train/chosen": -73.95309448242188, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -64.70706176757812, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.27343499660491943, + "rewards_train/margins": -0.008197635412216187, + "rewards_train/rejected": -0.26523736119270325, + "step": 260 + }, + { + "epoch": 0.35, + "logps_train/chosen": -78.92915344238281, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -72.20160675048828, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.23900993168354034, + "rewards_train/margins": 0.2604477256536484, + "rewards_train/rejected": -0.4994576573371887, + "step": 261 + }, + { + "epoch": 0.35, + "learning_rate": 4.981400725885669e-07, + "loss": 0.728, + "step": 262 + }, + { + "epoch": 0.35, + "logps_train/chosen": -40.57674789428711, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -72.76495361328125, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.260977566242218, + "rewards_train/margins": 1.0370829701423645, + "rewards_train/rejected": -0.7761054039001465, + "step": 262 + }, + { + "epoch": 0.35, + "logps_train/chosen": -59.44086456298828, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -56.47510528564453, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.10580535978078842, + "rewards_train/margins": 0.4057677760720253, + "rewards_train/rejected": -0.5115731358528137, + "step": 263 + }, + { + "epoch": 0.35, + "learning_rate": 4.98072531386623e-07, + "loss": 0.5195, + "step": 264 + }, + { + "epoch": 0.35, + "logps_train/chosen": -68.55766296386719, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -65.55680847167969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.29735875129699707, + "rewards_train/margins": 0.7186649739742279, + "rewards_train/rejected": -0.42130622267723083, + "step": 264 + }, + { + "epoch": 0.35, + "logps_train/chosen": -59.90800476074219, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -75.57239532470703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.20314466953277588, + "rewards_train/margins": 1.250227689743042, + "rewards_train/rejected": -1.0470830202102661, + "step": 265 + }, + { + "epoch": 0.35, + "learning_rate": 4.98003790360087e-07, + "loss": 0.4394, + "step": 266 + }, + { + "epoch": 0.35, + "logps_train/chosen": -94.09783935546875, + "logps_train/ref_chosen": -92.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -76.35623168945312, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17072133719921112, + "rewards_train/margins": 0.32583968341350555, + "rewards_train/rejected": -0.4965610206127167, + "step": 266 + }, + { + "epoch": 0.35, + "logps_train/chosen": -50.88528060913086, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -44.34239196777344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17094998061656952, + "rewards_train/margins": 0.05645342171192169, + "rewards_train/rejected": -0.2274034023284912, + "step": 267 + }, + { + "epoch": 0.36, + "learning_rate": 4.979338498414306e-07, + "loss": 0.6395, + "step": 268 + }, + { + "epoch": 0.36, + "logps_train/chosen": -45.69984436035156, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -48.23707580566406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11126578599214554, + "rewards_train/margins": 0.2724729999899864, + "rewards_train/rejected": -0.16120721399784088, + "step": 268 + }, + { + "epoch": 0.36, + "logps_train/chosen": -71.47410583496094, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -74.85859680175781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.07133936882019043, + "rewards_train/margins": 0.7587611675262451, + "rewards_train/rejected": -0.6874217987060547, + "step": 269 + }, + { + "epoch": 0.36, + "learning_rate": 4.978627101689276e-07, + "loss": 0.562, + "step": 270 + }, + { + "epoch": 0.36, + "logps_train/chosen": -61.465003967285156, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -81.43618774414062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.01014012098312378, + "rewards_train/margins": 0.9752429127693176, + "rewards_train/rejected": -0.9651027917861938, + "step": 270 + }, + { + "epoch": 0.36, + "logps_train/chosen": -93.34466552734375, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -104.21041870117188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.20634135603904724, + "rewards_train/margins": 0.169388085603714, + "rewards_train/rejected": -0.37572944164276123, + "step": 271 + }, + { + "epoch": 0.36, + "learning_rate": 4.977903716866511e-07, + "loss": 0.7196, + "step": 272 + }, + { + "epoch": 0.36, + "logps_train/chosen": -45.60688018798828, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -54.93670654296875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07514083385467529, + "rewards_train/margins": 0.27477964758872986, + "rewards_train/rejected": -0.34992048144340515, + "step": 272 + }, + { + "epoch": 0.36, + "logps_train/chosen": -79.77778625488281, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -85.91455078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0809035375714302, + "rewards_train/margins": 0.8574260398745537, + "rewards_train/rejected": -0.9383295774459839, + "step": 273 + }, + { + "epoch": 0.36, + "learning_rate": 4.977168347444725e-07, + "loss": 0.5162, + "step": 274 + }, + { + "epoch": 0.36, + "logps_train/chosen": -47.941383361816406, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -57.00628662109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.17245863378047943, + "rewards_train/margins": 0.5937950760126114, + "rewards_train/rejected": -0.7662537097930908, + "step": 274 + }, + { + "epoch": 0.37, + "logps_train/chosen": -75.7536392211914, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -76.83694458007812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4878634810447693, + "rewards_train/margins": -0.008856594562530518, + "rewards_train/rejected": -0.47900688648223877, + "step": 275 + }, + { + "epoch": 0.37, + "learning_rate": 4.976420996980598e-07, + "loss": 0.6649, + "step": 276 + }, + { + "epoch": 0.37, + "logps_train/chosen": -42.84344482421875, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -38.799224853515625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.25628066062927246, + "rewards_train/margins": 0.5717495977878571, + "rewards_train/rejected": -0.3154689371585846, + "step": 276 + }, + { + "epoch": 0.37, + "logps_train/chosen": -54.45808792114258, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -76.51191711425781, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12705844640731812, + "rewards_train/margins": 0.3819460868835449, + "rewards_train/rejected": -0.509004533290863, + "step": 277 + }, + { + "epoch": 0.37, + "learning_rate": 4.975661669088754e-07, + "loss": 0.5544, + "step": 278 + }, + { + "epoch": 0.37, + "logps_train/chosen": -60.03647232055664, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -72.93626403808594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.08489708602428436, + "rewards_train/margins": 0.6290422230958939, + "rewards_train/rejected": -0.7139393091201782, + "step": 278 + }, + { + "epoch": 0.37, + "logps_train/chosen": -56.25327682495117, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -63.97766876220703, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03639087826013565, + "rewards_train/margins": 0.4865014925599098, + "rewards_train/rejected": -0.45011061429977417, + "step": 279 + }, + { + "epoch": 0.37, + "learning_rate": 4.974890367441752e-07, + "loss": 0.5193, + "step": 280 + }, + { + "epoch": 0.37, + "logps_train/chosen": -72.85212707519531, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -77.4080810546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.14801473915576935, + "rewards_train/margins": 0.6396040171384811, + "rewards_train/rejected": -0.4915892779827118, + "step": 280 + }, + { + "epoch": 0.37, + "logps_train/chosen": -62.62663269042969, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -71.96431732177734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0982101559638977, + "rewards_train/margins": 0.49509674310684204, + "rewards_train/rejected": -0.5933068990707397, + "step": 281 + }, + { + "epoch": 0.37, + "learning_rate": 4.974107095770059e-07, + "loss": 0.5013, + "step": 282 + }, + { + "epoch": 0.37, + "logps_train/chosen": -45.46825408935547, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -47.06733322143555, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.16098734736442566, + "rewards_train/margins": 0.6063923537731171, + "rewards_train/rejected": -0.4454050064086914, + "step": 282 + }, + { + "epoch": 0.38, + "logps_train/chosen": -60.39244842529297, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -59.32325744628906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.009557027369737625, + "rewards_train/margins": 0.445034246891737, + "rewards_train/rejected": -0.4545912742614746, + "step": 283 + }, + { + "epoch": 0.38, + "learning_rate": 4.973311857862036e-07, + "loss": 0.5232, + "step": 284 + }, + { + "epoch": 0.38, + "logps_train/chosen": -70.64202880859375, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -66.93512725830078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.18032830953598022, + "rewards_train/margins": 0.8500127792358398, + "rewards_train/rejected": -0.6696844696998596, + "step": 284 + }, + { + "epoch": 0.38, + "logps_train/chosen": -67.68699645996094, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -86.08522033691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04016879200935364, + "rewards_train/margins": 0.299603670835495, + "rewards_train/rejected": -0.33977246284484863, + "step": 285 + }, + { + "epoch": 0.38, + "learning_rate": 4.972504657563922e-07, + "loss": 0.6041, + "step": 286 + }, + { + "epoch": 0.38, + "logps_train/chosen": -63.0327033996582, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -91.54096221923828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.05795776844024658, + "rewards_train/margins": 1.0820763111114502, + "rewards_train/rejected": -1.1400340795516968, + "step": 286 + }, + { + "epoch": 0.38, + "logps_train/chosen": -56.81370162963867, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -87.04418182373047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.32331734895706177, + "rewards_train/margins": 1.3011727929115295, + "rewards_train/rejected": -0.9778554439544678, + "step": 287 + }, + { + "epoch": 0.38, + "learning_rate": 4.971685498779813e-07, + "loss": 0.3884, + "step": 288 + }, + { + "epoch": 0.38, + "logps_train/chosen": -24.749740600585938, + "logps_train/ref_chosen": -27.5, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -40.481956481933594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2730727791786194, + "rewards_train/margins": 0.765408992767334, + "rewards_train/rejected": -0.4923362135887146, + "step": 288 + }, + { + "epoch": 0.38, + "logps_train/chosen": -62.213356018066406, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -54.45060729980469, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2290196269750595, + "rewards_train/margins": 0.08283768594264984, + "rewards_train/rejected": -0.31185731291770935, + "step": 289 + }, + { + "epoch": 0.39, + "learning_rate": 4.970854385471642e-07, + "loss": 0.6327, + "step": 290 + }, + { + "epoch": 0.39, + "logps_train/chosen": -63.23533630371094, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -65.18846893310547, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.09492029249668121, + "rewards_train/margins": 0.31572385132312775, + "rewards_train/rejected": -0.41064414381980896, + "step": 290 + }, + { + "epoch": 0.39, + "logps_train/chosen": -60.477413177490234, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -72.69172668457031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.028821326792240143, + "rewards_train/margins": 0.66518185287714, + "rewards_train/rejected": -0.6363605260848999, + "step": 291 + }, + { + "epoch": 0.39, + "learning_rate": 4.97001132165916e-07, + "loss": 0.5527, + "step": 292 + }, + { + "epoch": 0.39, + "logps_train/chosen": -80.474853515625, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -68.58543395996094, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.5334223508834839, + "rewards_train/margins": -0.02644103765487671, + "rewards_train/rejected": -0.5069813132286072, + "step": 292 + }, + { + "epoch": 0.39, + "logps_train/chosen": -73.7633056640625, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -81.98731994628906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.19039292633533478, + "rewards_train/margins": 0.6599016040563583, + "rewards_train/rejected": -0.8502945303916931, + "step": 293 + }, + { + "epoch": 0.39, + "learning_rate": 4.969156311419921e-07, + "loss": 0.6576, + "step": 294 + }, + { + "epoch": 0.39, + "logps_train/chosen": -63.43901062011719, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -63.561363220214844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2873491942882538, + "rewards_train/margins": 0.6466104388237, + "rewards_train/rejected": -0.35926124453544617, + "step": 294 + }, + { + "epoch": 0.39, + "logps_train/chosen": -62.90460205078125, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -61.785762786865234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.15608543157577515, + "rewards_train/margins": 0.555303156375885, + "rewards_train/rejected": -0.7113885879516602, + "step": 295 + }, + { + "epoch": 0.39, + "learning_rate": 4.968289358889256e-07, + "loss": 0.5243, + "step": 296 + }, + { + "epoch": 0.39, + "logps_train/chosen": -47.86812973022461, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -57.66960906982422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.11415675282478333, + "rewards_train/margins": 0.3715541958808899, + "rewards_train/rejected": -0.4857109487056732, + "step": 296 + }, + { + "epoch": 0.39, + "logps_train/chosen": -70.55889129638672, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -93.11337280273438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.20198270678520203, + "rewards_train/margins": 0.23122942447662354, + "rewards_train/rejected": -0.43321213126182556, + "step": 297 + }, + { + "epoch": 0.4, + "learning_rate": 4.967410468260258e-07, + "loss": 0.6547, + "step": 298 + }, + { + "epoch": 0.4, + "logps_train/chosen": -68.3116455078125, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -74.50334930419922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.03568052500486374, + "rewards_train/margins": 0.6055467054247856, + "rewards_train/rejected": -0.5698661804199219, + "step": 298 + }, + { + "epoch": 0.4, + "logps_train/chosen": -70.03067016601562, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -75.57959747314453, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2819732427597046, + "rewards_train/margins": 0.19043996930122375, + "rewards_train/rejected": -0.47241321206092834, + "step": 299 + }, + { + "epoch": 0.4, + "learning_rate": 4.966519643783757e-07, + "loss": 0.6075, + "step": 300 + }, + { + "epoch": 0.4, + "logps_train/chosen": -63.84186553955078, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -99.1169204711914, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5275319218635559, + "rewards_train/margins": 1.7415676712989807, + "rewards_train/rejected": -1.2140357494354248, + "step": 300 + }, + { + "epoch": 0.4, + "logps_train/chosen": -47.97945785522461, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -57.391029357910156, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.0356481559574604, + "rewards_train/margins": 0.41518085077404976, + "rewards_train/rejected": -0.37953269481658936, + "step": 301 + }, + { + "epoch": 0.4, + "learning_rate": 4.965616889768307e-07, + "loss": 0.4504, + "step": 302 + }, + { + "epoch": 0.4, + "logps_train/chosen": -77.11512756347656, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -97.99443054199219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.22598733007907867, + "rewards_train/margins": 1.2152746468782425, + "rewards_train/rejected": -0.9892873167991638, + "step": 302 + }, + { + "epoch": 0.4, + "logps_train/chosen": -75.13955688476562, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -68.55375671386719, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.20692503452301025, + "rewards_train/margins": 0.06837305426597595, + "rewards_train/rejected": -0.2752980887889862, + "step": 303 + }, + { + "epoch": 0.4, + "learning_rate": 4.964702210580154e-07, + "loss": 0.5417, + "step": 304 + }, + { + "epoch": 0.4, + "logps_train/chosen": -77.9247817993164, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -62.123497009277344, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.11845962703227997, + "rewards_train/margins": 0.4901845306158066, + "rewards_train/rejected": -0.3717249035835266, + "step": 304 + }, + { + "epoch": 0.41, + "logps_train/chosen": -80.21438598632812, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -69.56214141845703, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2601107656955719, + "rewards_train/margins": -0.014052674174308777, + "rewards_train/rejected": -0.24605809152126312, + "step": 305 + }, + { + "epoch": 0.41, + "learning_rate": 4.963775610643226e-07, + "loss": 0.6385, + "step": 306 + }, + { + "epoch": 0.41, + "logps_train/chosen": -68.18052673339844, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -59.258644104003906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.06085354834794998, + "rewards_train/margins": 0.29843688756227493, + "rewards_train/rejected": -0.23758333921432495, + "step": 306 + }, + { + "epoch": 0.41, + "logps_train/chosen": -89.0968017578125, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -99.31160736083984, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.6780396699905396, + "rewards_train/margins": 0.31874561309814453, + "rewards_train/rejected": -0.9967852830886841, + "step": 307 + }, + { + "epoch": 0.41, + "learning_rate": 4.962837094439104e-07, + "loss": 0.6843, + "step": 308 + }, + { + "epoch": 0.41, + "logps_train/chosen": -61.58351135253906, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -74.03178405761719, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.047636453062295914, + "rewards_train/margins": 0.603549424558878, + "rewards_train/rejected": -0.555912971496582, + "step": 308 + }, + { + "epoch": 0.41, + "logps_train/chosen": -77.00076293945312, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -69.06582641601562, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08445076644420624, + "rewards_train/margins": 0.7791628986597061, + "rewards_train/rejected": -0.8636136651039124, + "step": 309 + }, + { + "epoch": 0.41, + "learning_rate": 4.961886666507005e-07, + "loss": 0.5575, + "step": 310 + }, + { + "epoch": 0.41, + "logps_train/chosen": -47.377220153808594, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -47.27829360961914, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": 0.047824859619140625, + "rewards_train/margins": 0.379560649394989, + "rewards_train/rejected": -0.3317357897758484, + "step": 310 + }, + { + "epoch": 0.41, + "logps_train/chosen": -79.80460357666016, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -89.61198425292969, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.10467958450317383, + "rewards_train/margins": 0.1393318474292755, + "rewards_train/rejected": -0.24401143193244934, + "step": 311 + }, + { + "epoch": 0.41, + "learning_rate": 4.960924331443757e-07, + "loss": 0.6267, + "step": 312 + }, + { + "epoch": 0.41, + "logps_train/chosen": -56.72797775268555, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -77.94141387939453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08423339575529099, + "rewards_train/margins": 0.7916556522250175, + "rewards_train/rejected": -0.7074222564697266, + "step": 312 + }, + { + "epoch": 0.42, + "logps_train/chosen": -85.17864990234375, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -111.50273132324219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3915092945098877, + "rewards_train/margins": 1.9886574745178223, + "rewards_train/rejected": -1.5971481800079346, + "step": 313 + }, + { + "epoch": 0.42, + "learning_rate": 4.959950093903778e-07, + "loss": 0.3627, + "step": 314 + }, + { + "epoch": 0.42, + "logps_train/chosen": -61.4759407043457, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -64.37782287597656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.4038441777229309, + "rewards_train/margins": 0.06831297278404236, + "rewards_train/rejected": -0.47215715050697327, + "step": 314 + }, + { + "epoch": 0.42, + "logps_train/chosen": -46.53078079223633, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -48.23991394042969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.27348440885543823, + "rewards_train/margins": 0.7396632432937622, + "rewards_train/rejected": -0.466178834438324, + "step": 315 + }, + { + "epoch": 0.42, + "learning_rate": 4.958963958599051e-07, + "loss": 0.6122, + "step": 316 + }, + { + "epoch": 0.42, + "logps_train/chosen": -85.80899047851562, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -72.19682312011719, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.22535136342048645, + "rewards_train/margins": 0.8200333416461945, + "rewards_train/rejected": -0.594681978225708, + "step": 316 + }, + { + "epoch": 0.42, + "logps_train/chosen": -66.50135040283203, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -60.84355926513672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.05638488382101059, + "rewards_train/margins": 0.779533363878727, + "rewards_train/rejected": -0.8359182476997375, + "step": 317 + }, + { + "epoch": 0.42, + "learning_rate": 4.957965930299111e-07, + "loss": 0.4619, + "step": 318 + }, + { + "epoch": 0.42, + "logps_train/chosen": -42.19713592529297, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -49.122840881347656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.06463585048913956, + "rewards_train/margins": 0.2484295293688774, + "rewards_train/rejected": -0.31306537985801697, + "step": 318 + }, + { + "epoch": 0.42, + "logps_train/chosen": -34.947113037109375, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -63.90047073364258, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.43302297592163086, + "rewards_train/margins": 1.2254139184951782, + "rewards_train/rejected": -0.7923909425735474, + "step": 319 + }, + { + "epoch": 0.42, + "learning_rate": 4.956956013831006e-07, + "loss": 0.5162, + "step": 320 + }, + { + "epoch": 0.42, + "logps_train/chosen": -42.32313537597656, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -62.905433654785156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.20831172168254852, + "rewards_train/margins": 0.8730739802122116, + "rewards_train/rejected": -0.6647622585296631, + "step": 320 + }, + { + "epoch": 0.43, + "logps_train/chosen": -77.35755920410156, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -86.58023071289062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.11934994906187057, + "rewards_train/margins": 0.780079610645771, + "rewards_train/rejected": -0.8994295597076416, + "step": 321 + }, + { + "epoch": 0.43, + "learning_rate": 4.955934214079287e-07, + "loss": 0.4413, + "step": 322 + }, + { + "epoch": 0.43, + "logps_train/chosen": -37.126502990722656, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -36.14399719238281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.13109973073005676, + "rewards_train/margins": 0.08143697679042816, + "rewards_train/rejected": 0.0496627539396286, + "step": 322 + }, + { + "epoch": 0.43, + "logps_train/chosen": -68.67786407470703, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -88.44883728027344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.1494007557630539, + "rewards_train/margins": 1.0177215188741684, + "rewards_train/rejected": -0.8683207631111145, + "step": 323 + }, + { + "epoch": 0.43, + "learning_rate": 4.954900535985977e-07, + "loss": 0.5993, + "step": 324 + }, + { + "epoch": 0.43, + "logps_train/chosen": -68.79336547851562, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -68.96935272216797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.015274770557880402, + "rewards_train/margins": 0.8582229539752007, + "rewards_train/rejected": -0.873497724533081, + "step": 324 + }, + { + "epoch": 0.43, + "logps_train/chosen": -107.9514389038086, + "logps_train/ref_chosen": -115.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -110.89006042480469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.6939187049865723, + "rewards_train/margins": 1.861049771308899, + "rewards_train/rejected": -1.1671310663223267, + "step": 325 + }, + { + "epoch": 0.43, + "learning_rate": 4.953854984550552e-07, + "loss": 0.3781, + "step": 326 + }, + { + "epoch": 0.43, + "logps_train/chosen": -37.977882385253906, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -45.1030158996582, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.17252424359321594, + "rewards_train/margins": 0.7726697027683258, + "rewards_train/rejected": -0.6001454591751099, + "step": 326 + }, + { + "epoch": 0.43, + "logps_train/chosen": -53.901161193847656, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -60.99518585205078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.23230373859405518, + "rewards_train/margins": 0.1328398585319519, + "rewards_train/rejected": -0.3651435971260071, + "step": 327 + }, + { + "epoch": 0.44, + "learning_rate": 4.952797564829914e-07, + "loss": 0.6096, + "step": 328 + }, + { + "epoch": 0.44, + "logps_train/chosen": -78.5294418334961, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -76.37509155273438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3529440760612488, + "rewards_train/margins": 0.21737802028656006, + "rewards_train/rejected": -0.5703220963478088, + "step": 328 + }, + { + "epoch": 0.44, + "logps_train/chosen": -81.8572006225586, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -99.53673553466797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.07556366920471191, + "rewards_train/margins": 1.1124851703643799, + "rewards_train/rejected": -1.1880488395690918, + "step": 329 + }, + { + "epoch": 0.44, + "learning_rate": 4.951728281938364e-07, + "loss": 0.5967, + "step": 330 + }, + { + "epoch": 0.44, + "logps_train/chosen": -49.91069030761719, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -73.83796691894531, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.2737748920917511, + "rewards_train/margins": 1.2731963098049164, + "rewards_train/rejected": -0.9994214177131653, + "step": 330 + }, + { + "epoch": 0.44, + "logps_train/chosen": -27.490650177001953, + "logps_train/ref_chosen": -28.375, + "logps_train/ref_rejected": -32.0, + "logps_train/rejected": -35.97441101074219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.09077858179807663, + "rewards_train/margins": 0.47845368832349777, + "rewards_train/rejected": -0.38767510652542114, + "step": 331 + }, + { + "epoch": 0.44, + "learning_rate": 4.950647141047585e-07, + "loss": 0.4559, + "step": 332 + }, + { + "epoch": 0.44, + "logps_train/chosen": -70.85807800292969, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -70.3080062866211, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0858074426651001, + "rewards_train/margins": 0.37936830520629883, + "rewards_train/rejected": -0.4651757478713989, + "step": 332 + }, + { + "epoch": 0.44, + "logps_train/chosen": -75.27079772949219, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -83.43508911132812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5645802021026611, + "rewards_train/margins": 0.29142916202545166, + "rewards_train/rejected": -0.8560093641281128, + "step": 333 + }, + { + "epoch": 0.44, + "learning_rate": 4.94955414738661e-07, + "loss": 0.6423, + "step": 334 + }, + { + "epoch": 0.44, + "logps_train/chosen": -70.99369812011719, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -85.11231231689453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.18218286335468292, + "rewards_train/margins": 1.3665484637022018, + "rewards_train/rejected": -1.5487313270568848, + "step": 334 + }, + { + "epoch": 0.44, + "logps_train/chosen": -43.42034149169922, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -40.67958450317383, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32984113693237305, + "rewards_train/margins": 0.8313934803009033, + "rewards_train/rejected": -0.5015523433685303, + "step": 335 + }, + { + "epoch": 0.45, + "learning_rate": 4.948449306241797e-07, + "loss": 0.3684, + "step": 336 + }, + { + "epoch": 0.45, + "logps_train/chosen": -61.599510192871094, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -76.62318420410156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.43145498633384705, + "rewards_train/margins": 0.9359607398509979, + "rewards_train/rejected": -0.5045057535171509, + "step": 336 + }, + { + "epoch": 0.45, + "logps_train/chosen": -56.99884033203125, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -47.48579406738281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.07800908386707306, + "rewards_train/margins": 0.7662735432386398, + "rewards_train/rejected": -0.8442826271057129, + "step": 337 + }, + { + "epoch": 0.45, + "learning_rate": 4.947332622956807e-07, + "loss": 0.6157, + "step": 338 + }, + { + "epoch": 0.45, + "logps_train/chosen": -59.299381256103516, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -52.84434509277344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.10650087147951126, + "rewards_train/margins": 0.5584022924304008, + "rewards_train/rejected": -0.6649031639099121, + "step": 338 + }, + { + "epoch": 0.45, + "logps_train/chosen": -79.95851135253906, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -68.88861846923828, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5856958031654358, + "rewards_train/margins": 0.3781662583351135, + "rewards_train/rejected": -0.9638620615005493, + "step": 339 + }, + { + "epoch": 0.45, + "learning_rate": 4.94620410293258e-07, + "loss": 0.6119, + "step": 340 + }, + { + "epoch": 0.45, + "logps_train/chosen": -105.9596176147461, + "logps_train/ref_chosen": -109.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -104.4390869140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31966346502304077, + "rewards_train/margins": 1.447946846485138, + "rewards_train/rejected": -1.1282833814620972, + "step": 340 + }, + { + "epoch": 0.45, + "logps_train/chosen": -65.1340560913086, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -70.03223419189453, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.30378198623657227, + "rewards_train/margins": 0.8163807392120361, + "rewards_train/rejected": -0.5125987529754639, + "step": 341 + }, + { + "epoch": 0.45, + "learning_rate": 4.945063751627299e-07, + "loss": 0.3778, + "step": 342 + }, + { + "epoch": 0.45, + "logps_train/chosen": -81.37132263183594, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -80.34870910644531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.05739879608154297, + "rewards_train/margins": 0.8703944683074951, + "rewards_train/rejected": -0.8129956722259521, + "step": 342 + }, + { + "epoch": 0.46, + "logps_train/chosen": -89.68537139892578, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -102.48727416992188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4041192829608917, + "rewards_train/margins": 1.5481591522693634, + "rewards_train/rejected": -1.1440398693084717, + "step": 343 + }, + { + "epoch": 0.46, + "learning_rate": 4.943911574556375e-07, + "loss": 0.449, + "step": 344 + }, + { + "epoch": 0.46, + "logps_train/chosen": -91.92752075195312, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -81.88960266113281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.30412301421165466, + "rewards_train/margins": 1.4133960902690887, + "rewards_train/rejected": -1.109273076057434, + "step": 344 + }, + { + "epoch": 0.46, + "logps_train/chosen": -48.4801025390625, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -51.764827728271484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.014416344463825226, + "rewards_train/margins": 0.4800353869795799, + "rewards_train/rejected": -0.49445173144340515, + "step": 345 + }, + { + "epoch": 0.46, + "learning_rate": 4.942747577292414e-07, + "loss": 0.5624, + "step": 346 + }, + { + "epoch": 0.46, + "logps_train/chosen": -58.10334777832031, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -64.0764389038086, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06730224192142487, + "rewards_train/margins": 0.6444771438837051, + "rewards_train/rejected": -0.5771749019622803, + "step": 346 + }, + { + "epoch": 0.46, + "logps_train/chosen": -48.601993560791016, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -74.01139068603516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.48296457529067993, + "rewards_train/margins": 1.589670479297638, + "rewards_train/rejected": -1.106705904006958, + "step": 347 + }, + { + "epoch": 0.46, + "learning_rate": 4.941571765465189e-07, + "loss": 0.425, + "step": 348 + }, + { + "epoch": 0.46, + "logps_train/chosen": -57.15316390991211, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -83.12677001953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4104648530483246, + "rewards_train/margins": 1.683297485113144, + "rewards_train/rejected": -1.2728326320648193, + "step": 348 + }, + { + "epoch": 0.46, + "logps_train/chosen": -76.943115234375, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -92.99237060546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.28850117325782776, + "rewards_train/margins": 1.4932072460651398, + "rewards_train/rejected": -1.204706072807312, + "step": 349 + }, + { + "epoch": 0.46, + "learning_rate": 4.940384144761619e-07, + "loss": 0.2798, + "step": 350 + }, + { + "epoch": 0.46, + "logps_train/chosen": -44.18724060058594, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -61.78169631958008, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.0042709968984127045, + "rewards_train/margins": 0.6879613734781742, + "rewards_train/rejected": -0.6922323703765869, + "step": 350 + }, + { + "epoch": 0.47, + "logps_train/chosen": -82.18058013916016, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -75.33881378173828, + "rewards_train/accuracies": 0.25, + "rewards_train/chosen": -0.18290191888809204, + "rewards_train/margins": 0.12285453081130981, + "rewards_train/rejected": -0.30575644969940186, + "step": 351 + }, + { + "epoch": 0.47, + "learning_rate": 4.939184720925734e-07, + "loss": 0.6843, + "step": 352 + }, + { + "epoch": 0.47, + "logps_train/chosen": -69.00310516357422, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -63.12940979003906, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.46046683192253113, + "rewards_train/margins": -0.21979157626628876, + "rewards_train/rejected": -0.24067525565624237, + "step": 352 + }, + { + "epoch": 0.47, + "logps_train/chosen": -66.58317565917969, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -61.301902770996094, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.28956806659698486, + "rewards_train/margins": 0.10624682903289795, + "rewards_train/rejected": -0.3958148956298828, + "step": 353 + }, + { + "epoch": 0.47, + "learning_rate": 4.937973499758656e-07, + "loss": 0.7655, + "step": 354 + }, + { + "epoch": 0.47, + "logps_train/chosen": -48.65770721435547, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -69.75018310546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1936039924621582, + "rewards_train/margins": 1.5748724937438965, + "rewards_train/rejected": -1.3812685012817383, + "step": 354 + }, + { + "epoch": 0.47, + "logps_train/chosen": -56.52252960205078, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -74.19181060791016, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.08023122698068619, + "rewards_train/margins": 0.788950227200985, + "rewards_train/rejected": -0.8691814541816711, + "step": 355 + }, + { + "epoch": 0.47, + "learning_rate": 4.93675048711856e-07, + "loss": 0.3794, + "step": 356 + }, + { + "epoch": 0.47, + "logps_train/chosen": -58.07610321044922, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -81.87864685058594, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.011139392852783203, + "rewards_train/margins": 1.0685354471206665, + "rewards_train/rejected": -1.0573960542678833, + "step": 356 + }, + { + "epoch": 0.47, + "logps_train/chosen": -67.69930267333984, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -93.54232788085938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4394451677799225, + "rewards_train/margins": 1.7280526459217072, + "rewards_train/rejected": -1.2886074781417847, + "step": 357 + }, + { + "epoch": 0.48, + "learning_rate": 4.935515688920653e-07, + "loss": 0.412, + "step": 358 + }, + { + "epoch": 0.48, + "logps_train/chosen": -68.78108215332031, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -83.67660522460938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.04420631378889084, + "rewards_train/margins": 0.30014824122190475, + "rewards_train/rejected": -0.2559419274330139, + "step": 358 + }, + { + "epoch": 0.48, + "logps_train/chosen": -46.6857795715332, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -58.41501235961914, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.16540639102458954, + "rewards_train/margins": 0.8850326985120773, + "rewards_train/rejected": -0.7196263074874878, + "step": 359 + }, + { + "epoch": 0.48, + "learning_rate": 4.934269111137148e-07, + "loss": 0.5129, + "step": 360 + }, + { + "epoch": 0.48, + "logps_train/chosen": -78.81183624267578, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -76.78553771972656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.5985039472579956, + "rewards_train/margins": 1.4020576477050781, + "rewards_train/rejected": -0.8035537004470825, + "step": 360 + }, + { + "epoch": 0.48, + "logps_train/chosen": -113.30257415771484, + "logps_train/ref_chosen": -116.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -102.77424621582031, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.3189617395401001, + "rewards_train/margins": 0.519042506814003, + "rewards_train/rejected": -0.2000807672739029, + "step": 361 + }, + { + "epoch": 0.48, + "learning_rate": 4.933010759797227e-07, + "loss": 0.4731, + "step": 362 + }, + { + "epoch": 0.48, + "logps_train/chosen": -32.92317199707031, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -34.44102478027344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.23668691515922546, + "rewards_train/margins": 0.6995397508144379, + "rewards_train/rejected": -0.4628528356552124, + "step": 362 + }, + { + "epoch": 0.48, + "logps_train/chosen": -46.1734733581543, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -56.22780227661133, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1868787705898285, + "rewards_train/margins": 0.1780889928340912, + "rewards_train/rejected": -0.3649677634239197, + "step": 363 + }, + { + "epoch": 0.48, + "learning_rate": 4.931740640987015e-07, + "loss": 0.5609, + "step": 364 + }, + { + "epoch": 0.48, + "logps_train/chosen": -72.57083129882812, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -94.20811462402344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.044582709670066833, + "rewards_train/margins": 0.6231044083833694, + "rewards_train/rejected": -0.6676871180534363, + "step": 364 + }, + { + "epoch": 0.48, + "logps_train/chosen": -51.9361572265625, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -83.75634765625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.33841538429260254, + "rewards_train/margins": 1.1796748638153076, + "rewards_train/rejected": -0.8412594795227051, + "step": 365 + }, + { + "epoch": 0.49, + "learning_rate": 4.930458760849557e-07, + "loss": 0.5417, + "step": 366 + }, + { + "epoch": 0.49, + "logps_train/chosen": -63.89988708496094, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -54.842193603515625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.01782386004924774, + "rewards_train/margins": 0.37235577404499054, + "rewards_train/rejected": -0.3545319139957428, + "step": 366 + }, + { + "epoch": 0.49, + "logps_train/chosen": -44.214874267578125, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -81.29801940917969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11367510259151459, + "rewards_train/margins": 1.0380017906427383, + "rewards_train/rejected": -1.151676893234253, + "step": 367 + }, + { + "epoch": 0.49, + "learning_rate": 4.929165125584775e-07, + "loss": 0.5475, + "step": 368 + }, + { + "epoch": 0.49, + "logps_train/chosen": -51.261497497558594, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -67.07583618164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4597877860069275, + "rewards_train/margins": 1.0126840472221375, + "rewards_train/rejected": -0.55289626121521, + "step": 368 + }, + { + "epoch": 0.49, + "logps_train/chosen": -29.50927734375, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -31.0, + "logps_train/rejected": -36.309104919433594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.0846189558506012, + "rewards_train/margins": 0.6108417809009552, + "rewards_train/rejected": -0.526222825050354, + "step": 369 + }, + { + "epoch": 0.49, + "learning_rate": 4.92785974144945e-07, + "loss": 0.4209, + "step": 370 + }, + { + "epoch": 0.49, + "logps_train/chosen": -89.02214050292969, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -112.5206069946289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.28216132521629333, + "rewards_train/margins": 1.4850035607814789, + "rewards_train/rejected": -1.2028422355651855, + "step": 370 + }, + { + "epoch": 0.49, + "logps_train/chosen": -62.36957550048828, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -67.99769592285156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19898012280464172, + "rewards_train/margins": 0.6932806968688965, + "rewards_train/rejected": -0.49430057406425476, + "step": 371 + }, + { + "epoch": 0.49, + "learning_rate": 4.92654261475719e-07, + "loss": 0.4435, + "step": 372 + }, + { + "epoch": 0.49, + "logps_train/chosen": -48.92306137084961, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -65.15996551513672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.02644379436969757, + "rewards_train/margins": 0.7869715243577957, + "rewards_train/rejected": -0.7605277299880981, + "step": 372 + }, + { + "epoch": 0.5, + "logps_train/chosen": -72.53135681152344, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -80.98039245605469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5171771049499512, + "rewards_train/margins": 1.6464661359786987, + "rewards_train/rejected": -1.1292890310287476, + "step": 373 + }, + { + "epoch": 0.5, + "learning_rate": 4.925213751878392e-07, + "loss": 0.4706, + "step": 374 + }, + { + "epoch": 0.5, + "logps_train/chosen": -74.22731018066406, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -73.24191284179688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.30851906538009644, + "rewards_train/margins": 1.0670854449272156, + "rewards_train/rejected": -0.7585663795471191, + "step": 374 + }, + { + "epoch": 0.5, + "logps_train/chosen": -61.54717254638672, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -72.13079833984375, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.15471746027469635, + "rewards_train/margins": 0.031800076365470886, + "rewards_train/rejected": -0.18651753664016724, + "step": 375 + }, + { + "epoch": 0.5, + "learning_rate": 4.923873159240218e-07, + "loss": 0.605, + "step": 376 + }, + { + "epoch": 0.5, + "logps_train/chosen": -46.21941375732422, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -54.117191314697266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.11633986979722977, + "rewards_train/margins": 1.2749341204762459, + "rewards_train/rejected": -1.1585942506790161, + "step": 376 + }, + { + "epoch": 0.5, + "logps_train/chosen": -73.43480682373047, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -92.40774536132812, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.058081869035959244, + "rewards_train/margins": 1.2426066435873508, + "rewards_train/rejected": -1.1845247745513916, + "step": 377 + }, + { + "epoch": 0.5, + "learning_rate": 4.922520843326562e-07, + "loss": 0.5104, + "step": 378 + }, + { + "epoch": 0.5, + "logps_train/chosen": -61.36712646484375, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -74.85710144042969, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.31641197204589844, + "rewards_train/margins": 0.6255594193935394, + "rewards_train/rejected": -0.309147447347641, + "step": 378 + }, + { + "epoch": 0.5, + "logps_train/chosen": -53.90198516845703, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -55.54674530029297, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.12542665004730225, + "rewards_train/margins": 0.7019765377044678, + "rewards_train/rejected": -0.5765498876571655, + "step": 379 + }, + { + "epoch": 0.5, + "learning_rate": 4.921156810678019e-07, + "loss": 0.592, + "step": 380 + }, + { + "epoch": 0.5, + "logps_train/chosen": -52.57203674316406, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -77.65272521972656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.1776473969221115, + "rewards_train/margins": 1.2010255306959152, + "rewards_train/rejected": -1.0233781337738037, + "step": 380 + }, + { + "epoch": 0.51, + "logps_train/chosen": -103.65930938720703, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -113.58880615234375, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.18780572712421417, + "rewards_train/margins": 1.203105852007866, + "rewards_train/rejected": -1.39091157913208, + "step": 381 + }, + { + "epoch": 0.51, + "learning_rate": 4.919781067891853e-07, + "loss": 0.5634, + "step": 382 + }, + { + "epoch": 0.51, + "logps_train/chosen": -57.32777404785156, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -96.48308563232422, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3375351130962372, + "rewards_train/margins": 1.1202187836170197, + "rewards_train/rejected": -0.7826836705207825, + "step": 382 + }, + { + "epoch": 0.51, + "logps_train/chosen": -54.46477508544922, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -69.27169036865234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.08397741615772247, + "rewards_train/margins": 0.9650664776563644, + "rewards_train/rejected": -1.049043893814087, + "step": 383 + }, + { + "epoch": 0.51, + "learning_rate": 4.918393621621964e-07, + "loss": 0.4372, + "step": 384 + }, + { + "epoch": 0.51, + "logps_train/chosen": -67.64324951171875, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -80.74447631835938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3679015338420868, + "rewards_train/margins": 0.7657865583896637, + "rewards_train/rejected": -0.3978850245475769, + "step": 384 + }, + { + "epoch": 0.51, + "logps_train/chosen": -55.20287322998047, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -75.34164428710938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.4390876889228821, + "rewards_train/margins": 1.5216899514198303, + "rewards_train/rejected": -1.0826022624969482, + "step": 385 + }, + { + "epoch": 0.51, + "learning_rate": 4.916994478578859e-07, + "loss": 0.454, + "step": 386 + }, + { + "epoch": 0.51, + "logps_train/chosen": -58.48170852661133, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -96.11143493652344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.4533919095993042, + "rewards_train/margins": 1.6254730224609375, + "rewards_train/rejected": -1.1720811128616333, + "step": 386 + }, + { + "epoch": 0.51, + "logps_train/chosen": -51.650306701660156, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -60.75696563720703, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.3501867949962616, + "rewards_train/margins": -0.10456836223602295, + "rewards_train/rejected": -0.24561843276023865, + "step": 387 + }, + { + "epoch": 0.52, + "learning_rate": 4.915583645529615e-07, + "loss": 0.6997, + "step": 388 + }, + { + "epoch": 0.52, + "logps_train/chosen": -59.05073547363281, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -44.821739196777344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0691358894109726, + "rewards_train/margins": 0.33647577464580536, + "rewards_train/rejected": -0.40561166405677795, + "step": 388 + }, + { + "epoch": 0.52, + "logps_train/chosen": -98.45674896240234, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -124.94722747802734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.24098728597164154, + "rewards_train/margins": 1.051001325249672, + "rewards_train/rejected": -1.2919886112213135, + "step": 389 + }, + { + "epoch": 0.52, + "learning_rate": 4.914161129297852e-07, + "loss": 0.5328, + "step": 390 + }, + { + "epoch": 0.52, + "logps_train/chosen": -68.8272476196289, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -79.08497619628906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7243063449859619, + "rewards_train/margins": 2.165616273880005, + "rewards_train/rejected": -1.441309928894043, + "step": 390 + }, + { + "epoch": 0.52, + "logps_train/chosen": -85.1076889038086, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -101.42495727539062, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.21936267614364624, + "rewards_train/margins": 0.6028200387954712, + "rewards_train/rejected": -0.8221827149391174, + "step": 391 + }, + { + "epoch": 0.52, + "learning_rate": 4.912726936763692e-07, + "loss": 0.4175, + "step": 392 + }, + { + "epoch": 0.52, + "logps_train/chosen": -49.854393005371094, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -72.9985122680664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.46846693754196167, + "rewards_train/margins": 1.4745680689811707, + "rewards_train/rejected": -1.006101131439209, + "step": 392 + }, + { + "epoch": 0.52, + "logps_train/chosen": -32.078857421875, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -63.90654754638672, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.18117690086364746, + "rewards_train/margins": 1.3562067747116089, + "rewards_train/rejected": -1.1750298738479614, + "step": 393 + }, + { + "epoch": 0.52, + "learning_rate": 4.911281074863735e-07, + "loss": 0.3996, + "step": 394 + }, + { + "epoch": 0.52, + "logps_train/chosen": -67.24622344970703, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -71.84961700439453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.30506497621536255, + "rewards_train/margins": 1.1228389143943787, + "rewards_train/rejected": -0.8177739381790161, + "step": 394 + }, + { + "epoch": 0.52, + "logps_train/chosen": -55.591651916503906, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -80.74789428710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8548972606658936, + "rewards_train/margins": 2.823436141014099, + "rewards_train/rejected": -1.9685388803482056, + "step": 395 + }, + { + "epoch": 0.53, + "learning_rate": 4.909823550591018e-07, + "loss": 0.2353, + "step": 396 + }, + { + "epoch": 0.53, + "logps_train/chosen": -64.13090515136719, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -79.94658660888672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.17597264051437378, + "rewards_train/margins": 1.6503190398216248, + "rewards_train/rejected": -1.474346399307251, + "step": 396 + }, + { + "epoch": 0.53, + "logps_train/chosen": -61.688507080078125, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -67.70826721191406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4420871138572693, + "rewards_train/margins": 1.6894757151603699, + "rewards_train/rejected": -1.2473886013031006, + "step": 397 + }, + { + "epoch": 0.53, + "learning_rate": 4.908354370994987e-07, + "loss": 0.3082, + "step": 398 + }, + { + "epoch": 0.53, + "logps_train/chosen": -83.1932373046875, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -104.8465576171875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.15994855761528015, + "rewards_train/margins": 1.0153322517871857, + "rewards_train/rejected": -1.1752808094024658, + "step": 398 + }, + { + "epoch": 0.53, + "logps_train/chosen": -61.557281494140625, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -80.44500732421875, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.07760316133499146, + "rewards_train/margins": 0.5653344988822937, + "rewards_train/rejected": -0.6429376602172852, + "step": 399 + }, + { + "epoch": 0.53, + "learning_rate": 4.906873543181456e-07, + "loss": 0.5385, + "step": 400 + }, + { + "epoch": 0.53, + "logps_train/chosen": -46.099403381347656, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -57.333839416503906, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1619347631931305, + "rewards_train/margins": 1.1343809068202972, + "rewards_train/rejected": -0.9724461436271667, + "step": 400 + }, + { + "epoch": 0.53, + "logps_train/chosen": -61.619873046875, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -46.24669647216797, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.041137948632240295, + "rewards_train/margins": 0.24080754816532135, + "rewards_train/rejected": -0.19966959953308105, + "step": 401 + }, + { + "epoch": 0.53, + "learning_rate": 4.90538107431258e-07, + "loss": 0.5621, + "step": 402 + }, + { + "epoch": 0.53, + "logps_train/chosen": -73.45378112792969, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -109.92085266113281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.018034063279628754, + "rewards_train/margins": 1.397488035261631, + "rewards_train/rejected": -1.4155220985412598, + "step": 402 + }, + { + "epoch": 0.54, + "logps_train/chosen": -84.02957916259766, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -86.36756896972656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5454791784286499, + "rewards_train/margins": 0.9900488555431366, + "rewards_train/rejected": -0.4445696771144867, + "step": 403 + }, + { + "epoch": 0.54, + "learning_rate": 4.903876971606817e-07, + "loss": 0.4382, + "step": 404 + }, + { + "epoch": 0.54, + "logps_train/chosen": -68.48152160644531, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -72.48524475097656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.28465989232063293, + "rewards_train/margins": 1.394121676683426, + "rewards_train/rejected": -1.109461784362793, + "step": 404 + }, + { + "epoch": 0.54, + "logps_train/chosen": -79.98915100097656, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -67.21395874023438, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.19110223650932312, + "rewards_train/margins": 0.39162173867225647, + "rewards_train/rejected": -0.5827239751815796, + "step": 405 + }, + { + "epoch": 0.54, + "learning_rate": 4.902361242338889e-07, + "loss": 0.4773, + "step": 406 + }, + { + "epoch": 0.54, + "logps_train/chosen": -94.2977066040039, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -112.37322998046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.13116693496704102, + "rewards_train/margins": 1.7677087783813477, + "rewards_train/rejected": -1.6365418434143066, + "step": 406 + }, + { + "epoch": 0.54, + "logps_train/chosen": -70.03573608398438, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -59.45812225341797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.20267634093761444, + "rewards_train/margins": 1.0203634351491928, + "rewards_train/rejected": -0.8176870942115784, + "step": 407 + }, + { + "epoch": 0.54, + "learning_rate": 4.900833893839756e-07, + "loss": 0.3619, + "step": 408 + }, + { + "epoch": 0.54, + "logps_train/chosen": -40.740821838378906, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -70.22329711914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42591768503189087, + "rewards_train/margins": 1.8652393221855164, + "rewards_train/rejected": -1.4393216371536255, + "step": 408 + }, + { + "epoch": 0.54, + "logps_train/chosen": -53.683387756347656, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -60.01963806152344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.10822384059429169, + "rewards_train/margins": 1.1664374321699142, + "rewards_train/rejected": -1.0582135915756226, + "step": 409 + }, + { + "epoch": 0.54, + "learning_rate": 4.899294933496571e-07, + "loss": 0.3035, + "step": 410 + }, + { + "epoch": 0.54, + "logps_train/chosen": -75.93618774414062, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -97.18470764160156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.1305999755859375, + "rewards_train/margins": 1.514695644378662, + "rewards_train/rejected": -1.3840956687927246, + "step": 410 + }, + { + "epoch": 0.55, + "logps_train/chosen": -66.03938293457031, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -95.2296142578125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.544499397277832, + "rewards_train/margins": 2.1408989429473877, + "rewards_train/rejected": -1.5963995456695557, + "step": 411 + }, + { + "epoch": 0.55, + "learning_rate": 4.897744368752655e-07, + "loss": 0.3635, + "step": 412 + }, + { + "epoch": 0.55, + "logps_train/chosen": -43.639827728271484, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -37.0, + "logps_train/rejected": -42.41300582885742, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.5869936943054199, + "rewards_train/margins": 1.12360680103302, + "rewards_train/rejected": -0.5366131067276001, + "step": 412 + }, + { + "epoch": 0.55, + "logps_train/chosen": -86.75970458984375, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -89.42134094238281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.29159581661224365, + "rewards_train/margins": 0.6411628723144531, + "rewards_train/rejected": -0.9327586889266968, + "step": 413 + }, + { + "epoch": 0.55, + "learning_rate": 4.896182207107446e-07, + "loss": 0.5745, + "step": 414 + }, + { + "epoch": 0.55, + "logps_train/chosen": -54.34934997558594, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -54.63396453857422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.19118517637252808, + "rewards_train/margins": 0.25502413511276245, + "rewards_train/rejected": -0.4462093114852905, + "step": 414 + }, + { + "epoch": 0.55, + "logps_train/chosen": -58.31639862060547, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -80.00607299804688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5683600306510925, + "rewards_train/margins": 1.5939671397209167, + "rewards_train/rejected": -1.0256071090698242, + "step": 415 + }, + { + "epoch": 0.55, + "learning_rate": 4.894608456116479e-07, + "loss": 0.4535, + "step": 416 + }, + { + "epoch": 0.55, + "logps_train/chosen": -86.6167984008789, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -95.35467529296875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.20863284170627594, + "rewards_train/margins": 1.4565999656915665, + "rewards_train/rejected": -1.2479671239852905, + "step": 416 + }, + { + "epoch": 0.55, + "logps_train/chosen": -46.06257629394531, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -85.15351867675781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.35311707854270935, + "rewards_train/margins": 1.7778445184230804, + "rewards_train/rejected": -1.424727439880371, + "step": 417 + }, + { + "epoch": 0.56, + "learning_rate": 4.893023123391337e-07, + "loss": 0.4398, + "step": 418 + }, + { + "epoch": 0.56, + "logps_train/chosen": -73.14177703857422, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -97.67268371582031, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.2333184778690338, + "rewards_train/margins": 0.7245757281780243, + "rewards_train/rejected": -0.9578942060470581, + "step": 418 + }, + { + "epoch": 0.56, + "logps_train/chosen": -61.0244140625, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -71.52328491210938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.26201167702674866, + "rewards_train/margins": 1.4994970858097076, + "rewards_train/rejected": -1.237485408782959, + "step": 419 + }, + { + "epoch": 0.56, + "learning_rate": 4.891426216599623e-07, + "loss": 0.5166, + "step": 420 + }, + { + "epoch": 0.56, + "logps_train/chosen": -76.00321960449219, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -73.52159118652344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.28938475251197815, + "rewards_train/margins": 0.634649246931076, + "rewards_train/rejected": -0.9240339994430542, + "step": 420 + }, + { + "epoch": 0.56, + "logps_train/chosen": -62.26316833496094, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -71.09805297851562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.06694143265485764, + "rewards_train/margins": 0.6334886029362679, + "rewards_train/rejected": -0.7004300355911255, + "step": 421 + }, + { + "epoch": 0.56, + "learning_rate": 4.889817743464916e-07, + "loss": 0.5631, + "step": 422 + }, + { + "epoch": 0.56, + "logps_train/chosen": -54.261268615722656, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -73.1705093383789, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.12143951654434204, + "rewards_train/margins": 1.2471739649772644, + "rewards_train/rejected": -1.3686134815216064, + "step": 422 + }, + { + "epoch": 0.56, + "logps_train/chosen": -40.85841369628906, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -57.837738037109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.143846333026886, + "rewards_train/margins": 0.8303542137145996, + "rewards_train/rejected": -0.6865078806877136, + "step": 423 + }, + { + "epoch": 0.56, + "learning_rate": 4.888197711766736e-07, + "loss": 0.4953, + "step": 424 + }, + { + "epoch": 0.56, + "logps_train/chosen": -48.88616180419922, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -64.90988159179688, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3746649920940399, + "rewards_train/margins": 1.9347938001155853, + "rewards_train/rejected": -1.5601288080215454, + "step": 424 + }, + { + "epoch": 0.56, + "logps_train/chosen": -39.213130950927734, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -50.036067962646484, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.011937960982322693, + "rewards_train/margins": 0.6338563114404678, + "rewards_train/rejected": -0.6457942724227905, + "step": 425 + }, + { + "epoch": 0.57, + "learning_rate": 4.886566129340512e-07, + "loss": 0.486, + "step": 426 + }, + { + "epoch": 0.57, + "logps_train/chosen": -48.407718658447266, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -71.31959533691406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09594693779945374, + "rewards_train/margins": 1.4154061377048492, + "rewards_train/rejected": -1.3194591999053955, + "step": 426 + }, + { + "epoch": 0.57, + "logps_train/chosen": -72.96916198730469, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -78.33598327636719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.23129120469093323, + "rewards_train/margins": 0.38121286034584045, + "rewards_train/rejected": -0.6125040650367737, + "step": 427 + }, + { + "epoch": 0.57, + "learning_rate": 4.884923004077534e-07, + "loss": 0.5291, + "step": 428 + }, + { + "epoch": 0.57, + "logps_train/chosen": -72.09370422363281, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -85.7626724243164, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.02187054604291916, + "rewards_train/margins": 0.2200218364596367, + "rewards_train/rejected": -0.24189238250255585, + "step": 428 + }, + { + "epoch": 0.57, + "logps_train/chosen": -87.04708862304688, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -107.77870178222656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.8859164118766785, + "rewards_train/margins": 1.9637873768806458, + "rewards_train/rejected": -1.0778709650039673, + "step": 429 + }, + { + "epoch": 0.57, + "learning_rate": 4.88326834392492e-07, + "loss": 0.5969, + "step": 430 + }, + { + "epoch": 0.57, + "logps_train/chosen": -67.62312316894531, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -67.78407287597656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.13887476921081543, + "rewards_train/margins": 0.2731262147426605, + "rewards_train/rejected": -0.41200098395347595, + "step": 430 + }, + { + "epoch": 0.57, + "logps_train/chosen": -84.49793243408203, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -104.53606414794922, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.11739447712898254, + "rewards_train/margins": 1.5178757011890411, + "rewards_train/rejected": -1.4004812240600586, + "step": 431 + }, + { + "epoch": 0.57, + "learning_rate": 4.881602156885582e-07, + "loss": 0.6548, + "step": 432 + }, + { + "epoch": 0.57, + "logps_train/chosen": -60.758872985839844, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -75.25013732910156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.36083099246025085, + "rewards_train/margins": 1.157719761133194, + "rewards_train/rejected": -0.7968887686729431, + "step": 432 + }, + { + "epoch": 0.58, + "logps_train/chosen": -64.21223449707031, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -90.84576416015625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.8115890622138977, + "rewards_train/margins": 1.600852906703949, + "rewards_train/rejected": -0.7892638444900513, + "step": 433 + }, + { + "epoch": 0.58, + "learning_rate": 4.87992445101818e-07, + "loss": 0.4658, + "step": 434 + }, + { + "epoch": 0.58, + "logps_train/chosen": -98.69319915771484, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -102.20870971679688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.06931973248720169, + "rewards_train/margins": 0.5703013464808464, + "rewards_train/rejected": -0.6396210789680481, + "step": 434 + }, + { + "epoch": 0.58, + "logps_train/chosen": -50.661983489990234, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -58.152793884277344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.08692646026611328, + "rewards_train/margins": 1.2787686586380005, + "rewards_train/rejected": -1.1918421983718872, + "step": 435 + }, + { + "epoch": 0.58, + "learning_rate": 4.878235234437083e-07, + "loss": 0.5462, + "step": 436 + }, + { + "epoch": 0.58, + "logps_train/chosen": -52.73915100097656, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -69.75190734863281, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.009852513670921326, + "rewards_train/margins": 0.29268230497837067, + "rewards_train/rejected": -0.302534818649292, + "step": 436 + }, + { + "epoch": 0.58, + "logps_train/chosen": -66.21002197265625, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -64.44618225097656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.45009124279022217, + "rewards_train/margins": 1.8634592294692993, + "rewards_train/rejected": -1.4133679866790771, + "step": 437 + }, + { + "epoch": 0.58, + "learning_rate": 4.876534515312337e-07, + "loss": 0.4549, + "step": 438 + }, + { + "epoch": 0.58, + "logps_train/chosen": -37.91630554199219, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -30.625, + "logps_train/rejected": -33.68592834472656, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.027900636196136475, + "rewards_train/margins": 0.3359465003013611, + "rewards_train/rejected": -0.3080458641052246, + "step": 438 + }, + { + "epoch": 0.58, + "logps_train/chosen": -44.24502182006836, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -60.00847625732422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12334935367107391, + "rewards_train/margins": 1.4113061875104904, + "rewards_train/rejected": -1.2879568338394165, + "step": 439 + }, + { + "epoch": 0.58, + "learning_rate": 4.87482230186962e-07, + "loss": 0.4918, + "step": 440 + }, + { + "epoch": 0.58, + "logps_train/chosen": -93.13862609863281, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -78.01073455810547, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7580126523971558, + "rewards_train/margins": 1.4481490850448608, + "rewards_train/rejected": -0.6901364326477051, + "step": 440 + }, + { + "epoch": 0.59, + "logps_train/chosen": -61.005889892578125, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -75.66497039794922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1103547215461731, + "rewards_train/margins": 0.7170796990394592, + "rewards_train/rejected": -0.8274344205856323, + "step": 441 + }, + { + "epoch": 0.59, + "learning_rate": 4.873098602390202e-07, + "loss": 0.4477, + "step": 442 + }, + { + "epoch": 0.59, + "logps_train/chosen": -60.18880844116211, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -61.91142272949219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.36705663800239563, + "rewards_train/margins": 1.0706988871097565, + "rewards_train/rejected": -0.7036422491073608, + "step": 442 + }, + { + "epoch": 0.59, + "logps_train/chosen": -53.55289840698242, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -71.93734741210938, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.03260091692209244, + "rewards_train/margins": 1.4294608011841774, + "rewards_train/rejected": -1.396859884262085, + "step": 443 + }, + { + "epoch": 0.59, + "learning_rate": 4.871363425210907e-07, + "loss": 0.4319, + "step": 444 + }, + { + "epoch": 0.59, + "logps_train/chosen": -43.11422348022461, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -52.44865417480469, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.11435892432928085, + "rewards_train/margins": 0.9166460260748863, + "rewards_train/rejected": -0.8022871017456055, + "step": 444 + }, + { + "epoch": 0.59, + "logps_train/chosen": -51.13971710205078, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -61.27005386352539, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.32274702191352844, + "rewards_train/margins": 0.973190039396286, + "rewards_train/rejected": -0.6504430174827576, + "step": 445 + }, + { + "epoch": 0.59, + "learning_rate": 4.869616778724073e-07, + "loss": 0.5021, + "step": 446 + }, + { + "epoch": 0.59, + "logps_train/chosen": -43.573551177978516, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -58.84613037109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.16516748070716858, + "rewards_train/margins": 0.9741333425045013, + "rewards_train/rejected": -1.13930082321167, + "step": 446 + }, + { + "epoch": 0.59, + "logps_train/chosen": -64.185302734375, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -69.64631652832031, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.40022003650665283, + "rewards_train/margins": 0.8390710055828094, + "rewards_train/rejected": -0.4388509690761566, + "step": 447 + }, + { + "epoch": 0.59, + "learning_rate": 4.867858671377508e-07, + "loss": 0.4529, + "step": 448 + }, + { + "epoch": 0.59, + "logps_train/chosen": -57.2574348449707, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -68.23394012451172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.08746234327554703, + "rewards_train/margins": 0.857806883752346, + "rewards_train/rejected": -0.9452692270278931, + "step": 448 + }, + { + "epoch": 0.6, + "logps_train/chosen": -56.99577331542969, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -63.00306701660156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.6629226207733154, + "rewards_train/margins": 1.1366665363311768, + "rewards_train/rejected": -0.47374391555786133, + "step": 449 + }, + { + "epoch": 0.6, + "learning_rate": 4.866089111674452e-07, + "loss": 0.4792, + "step": 450 + }, + { + "epoch": 0.6, + "logps_train/chosen": -73.94206237792969, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -112.22356414794922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5557940006256104, + "rewards_train/margins": 2.2969000339508057, + "rewards_train/rejected": -1.7411060333251953, + "step": 450 + }, + { + "epoch": 0.6, + "logps_train/chosen": -58.695640563964844, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -72.12869262695312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.15231075882911682, + "rewards_train/margins": 0.9808046519756317, + "rewards_train/rejected": -0.8284938931465149, + "step": 451 + }, + { + "epoch": 0.6, + "learning_rate": 4.864308108173538e-07, + "loss": 0.3683, + "step": 452 + }, + { + "epoch": 0.6, + "logps_train/chosen": -86.11430358886719, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -81.59931945800781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.34033703804016113, + "rewards_train/margins": 0.1844393014907837, + "rewards_train/rejected": -0.5247763395309448, + "step": 452 + }, + { + "epoch": 0.6, + "logps_train/chosen": -73.1890869140625, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -99.24513244628906, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7810919284820557, + "rewards_train/margins": 1.422011375427246, + "rewards_train/rejected": -0.6409194469451904, + "step": 453 + }, + { + "epoch": 0.6, + "learning_rate": 4.862515669488744e-07, + "loss": 0.6111, + "step": 454 + }, + { + "epoch": 0.6, + "logps_train/chosen": -60.15624237060547, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -72.10064697265625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.36875075101852417, + "rewards_train/margins": 1.6006905436515808, + "rewards_train/rejected": -1.2319397926330566, + "step": 454 + }, + { + "epoch": 0.6, + "logps_train/chosen": -74.43499755859375, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -93.25853729248047, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7174383997917175, + "rewards_train/margins": 1.777666985988617, + "rewards_train/rejected": -1.0602285861968994, + "step": 455 + }, + { + "epoch": 0.61, + "learning_rate": 4.86071180428936e-07, + "loss": 0.3871, + "step": 456 + }, + { + "epoch": 0.61, + "logps_train/chosen": -67.68987274169922, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -92.97109985351562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.41695016622543335, + "rewards_train/margins": 1.7437474131584167, + "rewards_train/rejected": -1.3267972469329834, + "step": 456 + }, + { + "epoch": 0.61, + "logps_train/chosen": -62.74115753173828, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -95.05789947509766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8883844017982483, + "rewards_train/margins": 2.741049349308014, + "rewards_train/rejected": -1.8526649475097656, + "step": 457 + }, + { + "epoch": 0.61, + "learning_rate": 4.858896521299934e-07, + "loss": 0.2323, + "step": 458 + }, + { + "epoch": 0.61, + "logps_train/chosen": -63.582603454589844, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -65.38656616210938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.09798998385667801, + "rewards_train/margins": 0.6616466119885445, + "rewards_train/rejected": -0.5636566281318665, + "step": 458 + }, + { + "epoch": 0.61, + "logps_train/chosen": -89.062255859375, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -106.24118041992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4718993306159973, + "rewards_train/margins": 2.10383003950119, + "rewards_train/rejected": -1.6319307088851929, + "step": 459 + }, + { + "epoch": 0.61, + "learning_rate": 4.857069829300246e-07, + "loss": 0.3481, + "step": 460 + }, + { + "epoch": 0.61, + "logps_train/chosen": -61.54225158691406, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -93.00715637207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.19108706712722778, + "rewards_train/margins": 1.9386783242225647, + "rewards_train/rejected": -1.747591257095337, + "step": 460 + }, + { + "epoch": 0.61, + "logps_train/chosen": -51.11946105957031, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -89.09141540527344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3427412807941437, + "rewards_train/margins": 1.519070416688919, + "rewards_train/rejected": -1.1763291358947754, + "step": 461 + }, + { + "epoch": 0.61, + "learning_rate": 4.855231737125249e-07, + "loss": 0.3517, + "step": 462 + }, + { + "epoch": 0.61, + "logps_train/chosen": -50.43305206298828, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -73.903076171875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4086480736732483, + "rewards_train/margins": 1.6661434769630432, + "rewards_train/rejected": -1.257495403289795, + "step": 462 + }, + { + "epoch": 0.61, + "logps_train/chosen": -48.7312126159668, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -64.36223602294922, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.21320679783821106, + "rewards_train/margins": 1.1892743408679962, + "rewards_train/rejected": -0.9760675430297852, + "step": 463 + }, + { + "epoch": 0.62, + "learning_rate": 4.85338225366504e-07, + "loss": 0.4167, + "step": 464 + }, + { + "epoch": 0.62, + "logps_train/chosen": -76.38656616210938, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -72.82841491699219, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.9097812175750732, + "rewards_train/margins": 2.301215887069702, + "rewards_train/rejected": -1.391434669494629, + "step": 464 + }, + { + "epoch": 0.62, + "logps_train/chosen": -55.65221405029297, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -52.41108703613281, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.33240920305252075, + "rewards_train/margins": 0.3176838159561157, + "rewards_train/rejected": -0.6500930190086365, + "step": 465 + }, + { + "epoch": 0.62, + "learning_rate": 4.851521387864806e-07, + "loss": 0.5332, + "step": 466 + }, + { + "epoch": 0.62, + "logps_train/chosen": -95.60820007324219, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -118.2801513671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5579298734664917, + "rewards_train/margins": 2.7578204870224, + "rewards_train/rejected": -2.199890613555908, + "step": 466 + }, + { + "epoch": 0.62, + "logps_train/chosen": -48.76190948486328, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -52.718875885009766, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.19494125247001648, + "rewards_train/margins": 0.5378837287425995, + "rewards_train/rejected": -0.732824981212616, + "step": 467 + }, + { + "epoch": 0.62, + "learning_rate": 4.849649148724789e-07, + "loss": 0.4533, + "step": 468 + }, + { + "epoch": 0.62, + "logps_train/chosen": -55.786354064941406, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -77.19148254394531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.8362081050872803, + "rewards_train/margins": 1.538950800895691, + "rewards_train/rejected": -0.7027426958084106, + "step": 468 + }, + { + "epoch": 0.62, + "logps_train/chosen": -75.44213104248047, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -87.67589569091797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.6971932053565979, + "rewards_train/margins": 1.175720751285553, + "rewards_train/rejected": -0.4785275459289551, + "step": 469 + }, + { + "epoch": 0.62, + "learning_rate": 4.847765545300238e-07, + "loss": 0.377, + "step": 470 + }, + { + "epoch": 0.62, + "logps_train/chosen": -72.38058471679688, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -78.2701187133789, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.1575891077518463, + "rewards_train/margins": 1.2209852635860443, + "rewards_train/rejected": -1.3785743713378906, + "step": 470 + }, + { + "epoch": 0.63, + "logps_train/chosen": -80.75613403320312, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -77.01436614990234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6556369662284851, + "rewards_train/margins": 1.0344175100326538, + "rewards_train/rejected": -0.3787805438041687, + "step": 471 + }, + { + "epoch": 0.63, + "learning_rate": 4.845870586701367e-07, + "loss": 0.4686, + "step": 472 + }, + { + "epoch": 0.63, + "logps_train/chosen": -42.87356948852539, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -66.19390869140625, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2454555183649063, + "rewards_train/margins": 1.0968775898218155, + "rewards_train/rejected": -0.8514220714569092, + "step": 472 + }, + { + "epoch": 0.63, + "logps_train/chosen": -45.4461669921875, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -61.40019607543945, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18194574117660522, + "rewards_train/margins": 1.5563400387763977, + "rewards_train/rejected": -1.3743942975997925, + "step": 473 + }, + { + "epoch": 0.63, + "learning_rate": 4.84396428209331e-07, + "loss": 0.4146, + "step": 474 + }, + { + "epoch": 0.63, + "logps_train/chosen": -86.17939758300781, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -84.92279052734375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4413773715496063, + "rewards_train/margins": 0.2665269672870636, + "rewards_train/rejected": -0.7079043388366699, + "step": 474 + }, + { + "epoch": 0.63, + "logps_train/chosen": -42.84635925292969, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -48.5110969543457, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23216086626052856, + "rewards_train/margins": 1.226239264011383, + "rewards_train/rejected": -0.9940783977508545, + "step": 475 + }, + { + "epoch": 0.63, + "learning_rate": 4.842046640696075e-07, + "loss": 0.5151, + "step": 476 + }, + { + "epoch": 0.63, + "logps_train/chosen": -70.34722900390625, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -85.06442260742188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5965275168418884, + "rewards_train/margins": 1.881094515323639, + "rewards_train/rejected": -1.2845669984817505, + "step": 476 + }, + { + "epoch": 0.63, + "logps_train/chosen": -70.40995788574219, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -79.18315887451172, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.21326139569282532, + "rewards_train/margins": 1.004273384809494, + "rewards_train/rejected": -1.2175347805023193, + "step": 477 + }, + { + "epoch": 0.63, + "learning_rate": 4.840117671784504e-07, + "loss": 0.3941, + "step": 478 + }, + { + "epoch": 0.63, + "logps_train/chosen": -65.40243530273438, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -77.40702819824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7550691962242126, + "rewards_train/margins": 1.822334110736847, + "rewards_train/rejected": -1.0672649145126343, + "step": 478 + }, + { + "epoch": 0.64, + "logps_train/chosen": -84.39859771728516, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -119.66218566894531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 1.0163898468017578, + "rewards_train/margins": 2.5365145206451416, + "rewards_train/rejected": -1.5201246738433838, + "step": 479 + }, + { + "epoch": 0.64, + "learning_rate": 4.838177384688225e-07, + "loss": 0.2242, + "step": 480 + }, + { + "epoch": 0.64, + "logps_train/chosen": -74.55684661865234, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -108.84405517578125, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.39431503415107727, + "rewards_train/margins": 0.969345360994339, + "rewards_train/rejected": -0.5750303268432617, + "step": 480 + }, + { + "epoch": 0.64, + "logps_train/chosen": -74.06786346435547, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -77.08515167236328, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.04389568418264389, + "rewards_train/margins": 0.8724319115281105, + "rewards_train/rejected": -0.9163275957107544, + "step": 481 + }, + { + "epoch": 0.64, + "learning_rate": 4.836225788791606e-07, + "loss": 0.5093, + "step": 482 + }, + { + "epoch": 0.64, + "logps_train/chosen": -78.9788818359375, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -97.8614501953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2822626829147339, + "rewards_train/margins": 0.9585698843002319, + "rewards_train/rejected": -1.2408325672149658, + "step": 482 + }, + { + "epoch": 0.64, + "logps_train/chosen": -39.03144836425781, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -50.362998962402344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5556445121765137, + "rewards_train/margins": 1.6556164026260376, + "rewards_train/rejected": -1.099971890449524, + "step": 483 + }, + { + "epoch": 0.64, + "learning_rate": 4.834262893533713e-07, + "loss": 0.3714, + "step": 484 + }, + { + "epoch": 0.64, + "logps_train/chosen": -103.36849975585938, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -126.58375549316406, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.12513086199760437, + "rewards_train/margins": 0.9703533947467804, + "rewards_train/rejected": -1.0954842567443848, + "step": 484 + }, + { + "epoch": 0.64, + "logps_train/chosen": -91.72317504882812, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -84.20274353027344, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.03794288635253906, + "rewards_train/margins": 0.865924596786499, + "rewards_train/rejected": -0.9038674831390381, + "step": 485 + }, + { + "epoch": 0.65, + "learning_rate": 4.83228870840826e-07, + "loss": 0.6328, + "step": 486 + }, + { + "epoch": 0.65, + "logps_train/chosen": -67.49491119384766, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -89.57960510253906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.36160022020339966, + "rewards_train/margins": 0.3533911108970642, + "rewards_train/rejected": -0.7149913311004639, + "step": 486 + }, + { + "epoch": 0.65, + "logps_train/chosen": -55.67702102661133, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -59.93408203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.13503268361091614, + "rewards_train/margins": 0.6487534940242767, + "rewards_train/rejected": -0.5137208104133606, + "step": 487 + }, + { + "epoch": 0.65, + "learning_rate": 4.830303242963569e-07, + "loss": 0.6315, + "step": 488 + }, + { + "epoch": 0.65, + "logps_train/chosen": -57.108734130859375, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -71.58772277832031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3375643491744995, + "rewards_train/margins": 1.7150697708129883, + "rewards_train/rejected": -1.3775054216384888, + "step": 488 + }, + { + "epoch": 0.65, + "logps_train/chosen": -81.05008697509766, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -100.58329010009766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.416866272687912, + "rewards_train/margins": 1.5095703303813934, + "rewards_train/rejected": -1.0927040576934814, + "step": 489 + }, + { + "epoch": 0.65, + "learning_rate": 4.828306506802516e-07, + "loss": 0.3959, + "step": 490 + }, + { + "epoch": 0.65, + "logps_train/chosen": -43.09886169433594, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -54.933494567871094, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.30730152130126953, + "rewards_train/margins": 1.191275954246521, + "rewards_train/rejected": -0.8839744329452515, + "step": 490 + }, + { + "epoch": 0.65, + "logps_train/chosen": -38.58753204345703, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -56.39495086669922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.060315392911434174, + "rewards_train/margins": 0.8994922414422035, + "rewards_train/rejected": -0.9598076343536377, + "step": 491 + }, + { + "epoch": 0.65, + "learning_rate": 4.826298509582492e-07, + "loss": 0.4602, + "step": 492 + }, + { + "epoch": 0.65, + "logps_train/chosen": -46.844520568847656, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -64.6305160522461, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.24304601550102234, + "rewards_train/margins": 0.1325054168701172, + "rewards_train/rejected": -0.3755514323711395, + "step": 492 + }, + { + "epoch": 0.65, + "logps_train/chosen": -86.06369018554688, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -116.0888900756836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7295681238174438, + "rewards_train/margins": 2.379863142967224, + "rewards_train/rejected": -1.6502950191497803, + "step": 493 + }, + { + "epoch": 0.66, + "learning_rate": 4.824279261015352e-07, + "loss": 0.4359, + "step": 494 + }, + { + "epoch": 0.66, + "logps_train/chosen": -48.86855697631836, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -87.00870513916016, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.10814464092254639, + "rewards_train/margins": 1.6630383729934692, + "rewards_train/rejected": -1.7711830139160156, + "step": 494 + }, + { + "epoch": 0.66, + "logps_train/chosen": -54.31206512451172, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -76.25498962402344, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.376605749130249, + "rewards_train/margins": 0.7646042108535767, + "rewards_train/rejected": -0.38799846172332764, + "step": 495 + }, + { + "epoch": 0.66, + "learning_rate": 4.82224877086737e-07, + "loss": 0.4099, + "step": 496 + }, + { + "epoch": 0.66, + "logps_train/chosen": -66.955810546875, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -103.06253814697266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.5559815168380737, + "rewards_train/margins": 1.7966104745864868, + "rewards_train/rejected": -1.240628957748413, + "step": 496 + }, + { + "epoch": 0.66, + "logps_train/chosen": -42.07504653930664, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -71.94281768798828, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2909327745437622, + "rewards_train/margins": 1.5977145433425903, + "rewards_train/rejected": -1.3067817687988281, + "step": 497 + }, + { + "epoch": 0.66, + "learning_rate": 4.820207048959186e-07, + "loss": 0.3154, + "step": 498 + }, + { + "epoch": 0.66, + "logps_train/chosen": -69.0484848022461, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -66.33747100830078, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6654641628265381, + "rewards_train/margins": 1.6960864067077637, + "rewards_train/rejected": -1.0306222438812256, + "step": 498 + }, + { + "epoch": 0.66, + "logps_train/chosen": -58.36433029174805, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -83.72349548339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10262942314147949, + "rewards_train/margins": 1.709354281425476, + "rewards_train/rejected": -1.6067248582839966, + "step": 499 + }, + { + "epoch": 0.66, + "learning_rate": 4.818154105165772e-07, + "loss": 0.3195, + "step": 500 + }, + { + "epoch": 0.66, + "logps_train/chosen": -64.72539520263672, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -82.59965515136719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.03722620755434036, + "rewards_train/margins": 1.0901601389050484, + "rewards_train/rejected": -1.052933931350708, + "step": 500 + }, + { + "epoch": 0.67, + "logps_train/chosen": -78.46614837646484, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -116.8248062133789, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.8393229246139526, + "rewards_train/margins": 2.340553641319275, + "rewards_train/rejected": -1.5012307167053223, + "step": 501 + }, + { + "epoch": 0.67, + "learning_rate": 4.816089949416369e-07, + "loss": 0.3819, + "step": 502 + }, + { + "epoch": 0.67, + "logps_train/chosen": -44.76847457885742, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -56.458106994628906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.05284015089273453, + "rewards_train/margins": 0.9736508503556252, + "rewards_train/rejected": -0.9208106994628906, + "step": 502 + }, + { + "epoch": 0.67, + "logps_train/chosen": -68.35040283203125, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -107.94994354248047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.04464678838849068, + "rewards_train/margins": 1.8271412141621113, + "rewards_train/rejected": -1.7824944257736206, + "step": 503 + }, + { + "epoch": 0.67, + "learning_rate": 4.814014591694448e-07, + "loss": 0.4376, + "step": 504 + }, + { + "epoch": 0.67, + "logps_train/chosen": -35.27831268310547, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -55.747501373291016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.544044017791748, + "rewards_train/margins": 1.4187942147254944, + "rewards_train/rejected": -0.8747501969337463, + "step": 504 + }, + { + "epoch": 0.67, + "logps_train/chosen": -66.682861328125, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -114.08990478515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.49577608704566956, + "rewards_train/margins": 1.9907038509845734, + "rewards_train/rejected": -1.4949277639389038, + "step": 505 + }, + { + "epoch": 0.67, + "learning_rate": 4.811928042037658e-07, + "loss": 0.3063, + "step": 506 + }, + { + "epoch": 0.67, + "logps_train/chosen": -60.73814010620117, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -92.0196304321289, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.04649835824966431, + "rewards_train/margins": 1.5498284697532654, + "rewards_train/rejected": -1.503330111503601, + "step": 506 + }, + { + "epoch": 0.67, + "logps_train/chosen": -61.18023681640625, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -82.28202819824219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.599945068359375, + "rewards_train/margins": 1.6086167097091675, + "rewards_train/rejected": -1.0086716413497925, + "step": 507 + }, + { + "epoch": 0.67, + "learning_rate": 4.809830310537781e-07, + "loss": 0.4885, + "step": 508 + }, + { + "epoch": 0.67, + "logps_train/chosen": -79.5849609375, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -91.89620208740234, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.2649410367012024, + "rewards_train/margins": 1.0717484951019287, + "rewards_train/rejected": -0.8068074584007263, + "step": 508 + }, + { + "epoch": 0.68, + "logps_train/chosen": -74.50267028808594, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -75.73983764648438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 1.0059832334518433, + "rewards_train/margins": 2.048716425895691, + "rewards_train/rejected": -1.0427331924438477, + "step": 509 + }, + { + "epoch": 0.68, + "learning_rate": 4.807721407340679e-07, + "loss": 0.4407, + "step": 510 + }, + { + "epoch": 0.68, + "logps_train/chosen": -101.51117706298828, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -145.41726684570312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.36450710892677307, + "rewards_train/margins": 1.9242017567157745, + "rewards_train/rejected": -1.5596946477890015, + "step": 510 + }, + { + "epoch": 0.68, + "logps_train/chosen": -94.90898895263672, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -119.2308578491211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.002851039171218872, + "rewards_train/margins": 1.7821872532367706, + "rewards_train/rejected": -1.7793362140655518, + "step": 511 + }, + { + "epoch": 0.68, + "learning_rate": 4.805601342646248e-07, + "loss": 0.4452, + "step": 512 + }, + { + "epoch": 0.68, + "logps_train/chosen": -97.69891357421875, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -124.135009765625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.2769832909107208, + "rewards_train/margins": 1.5795471966266632, + "rewards_train/rejected": -1.3025639057159424, + "step": 512 + }, + { + "epoch": 0.68, + "logps_train/chosen": -63.78382110595703, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -70.51071166992188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.012756690382957458, + "rewards_train/margins": 0.8789393454790115, + "rewards_train/rejected": -0.891696035861969, + "step": 513 + }, + { + "epoch": 0.68, + "learning_rate": 4.803470126708366e-07, + "loss": 0.5079, + "step": 514 + }, + { + "epoch": 0.68, + "logps_train/chosen": -51.104488372802734, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -66.3725357055664, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5988103151321411, + "rewards_train/margins": 1.5354778170585632, + "rewards_train/rejected": -0.9366675019264221, + "step": 514 + }, + { + "epoch": 0.68, + "logps_train/chosen": -49.2669792175293, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -53.96672058105469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4954700469970703, + "rewards_train/margins": 1.2980016469955444, + "rewards_train/rejected": -0.8025315999984741, + "step": 515 + }, + { + "epoch": 0.69, + "learning_rate": 4.801327769834847e-07, + "loss": 0.3506, + "step": 516 + }, + { + "epoch": 0.69, + "logps_train/chosen": -70.37693786621094, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -71.87669372558594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2626972496509552, + "rewards_train/margins": 0.9868898689746857, + "rewards_train/rejected": -0.7241926193237305, + "step": 516 + }, + { + "epoch": 0.69, + "logps_train/chosen": -43.73329162597656, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.25, + "logps_train/rejected": -48.40955352783203, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.025889843702316284, + "rewards_train/margins": 0.7465327084064484, + "rewards_train/rejected": -0.7206428647041321, + "step": 517 + }, + { + "epoch": 0.69, + "learning_rate": 4.79917428238739e-07, + "loss": 0.4547, + "step": 518 + }, + { + "epoch": 0.69, + "logps_train/chosen": -43.598175048828125, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -66.93354797363281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6667448878288269, + "rewards_train/margins": 1.8444743752479553, + "rewards_train/rejected": -1.1777294874191284, + "step": 518 + }, + { + "epoch": 0.69, + "logps_train/chosen": -43.813804626464844, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -71.97845458984375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2615884244441986, + "rewards_train/margins": 1.4438091218471527, + "rewards_train/rejected": -1.182220697402954, + "step": 519 + }, + { + "epoch": 0.69, + "learning_rate": 4.797009674781523e-07, + "loss": 0.3965, + "step": 520 + }, + { + "epoch": 0.69, + "logps_train/chosen": -39.76426696777344, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -63.16413116455078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.47044843435287476, + "rewards_train/margins": 1.7993616461753845, + "rewards_train/rejected": -1.3289132118225098, + "step": 520 + }, + { + "epoch": 0.69, + "logps_train/chosen": -36.2171745300293, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -49.16302490234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.15289199352264404, + "rewards_train/margins": 1.0125537514686584, + "rewards_train/rejected": -0.8596617579460144, + "step": 521 + }, + { + "epoch": 0.69, + "learning_rate": 4.794833957486562e-07, + "loss": 0.3429, + "step": 522 + }, + { + "epoch": 0.69, + "logps_train/chosen": -89.44718933105469, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -113.53695678710938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.4443441331386566, + "rewards_train/margins": 2.4218682944774628, + "rewards_train/rejected": -1.9775241613388062, + "step": 522 + }, + { + "epoch": 0.69, + "logps_train/chosen": -65.70970916748047, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -92.09840393066406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 1.1571540832519531, + "rewards_train/margins": 2.8185572624206543, + "rewards_train/rejected": -1.6614031791687012, + "step": 523 + }, + { + "epoch": 0.7, + "learning_rate": 4.792647141025557e-07, + "loss": 0.2876, + "step": 524 + }, + { + "epoch": 0.7, + "logps_train/chosen": -85.04306030273438, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -118.11507415771484, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.9925685524940491, + "rewards_train/margins": 2.74313884973526, + "rewards_train/rejected": -1.750570297241211, + "step": 524 + }, + { + "epoch": 0.7, + "logps_train/chosen": -47.077213287353516, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -64.46700286865234, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.05098319053649902, + "rewards_train/margins": 0.4019668400287628, + "rewards_train/rejected": -0.45295003056526184, + "step": 525 + }, + { + "epoch": 0.7, + "learning_rate": 4.790449235975235e-07, + "loss": 0.5276, + "step": 526 + }, + { + "epoch": 0.7, + "logps_train/chosen": -65.16095733642578, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -65.19416809082031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.42966973781585693, + "rewards_train/margins": 0.9237319231033325, + "rewards_train/rejected": -1.3534016609191895, + "step": 526 + }, + { + "epoch": 0.7, + "logps_train/chosen": -58.66972351074219, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -94.27264404296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6892780065536499, + "rewards_train/margins": 2.7020891904830933, + "rewards_train/rejected": -2.0128111839294434, + "step": 527 + }, + { + "epoch": 0.7, + "learning_rate": 4.788240252965957e-07, + "loss": 0.3182, + "step": 528 + }, + { + "epoch": 0.7, + "logps_train/chosen": -55.83493423461914, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -80.23184204101562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.444430947303772, + "rewards_train/margins": 1.1334413290023804, + "rewards_train/rejected": -1.5778722763061523, + "step": 528 + }, + { + "epoch": 0.7, + "logps_train/chosen": -47.8358039855957, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -59.1377067565918, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5781385898590088, + "rewards_train/margins": 1.2555812001228333, + "rewards_train/rejected": -0.6774426102638245, + "step": 529 + }, + { + "epoch": 0.7, + "learning_rate": 4.786020202681666e-07, + "loss": 0.4616, + "step": 530 + }, + { + "epoch": 0.7, + "logps_train/chosen": -60.454132080078125, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -86.97035217285156, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.436618447303772, + "rewards_train/margins": 1.9867793321609497, + "rewards_train/rejected": -1.5501608848571777, + "step": 530 + }, + { + "epoch": 0.71, + "logps_train/chosen": -60.42911911010742, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -81.11143493652344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.17740082740783691, + "rewards_train/margins": 1.4744820594787598, + "rewards_train/rejected": -1.2970812320709229, + "step": 531 + }, + { + "epoch": 0.71, + "learning_rate": 4.783789095859828e-07, + "loss": 0.3121, + "step": 532 + }, + { + "epoch": 0.71, + "logps_train/chosen": -53.50497817993164, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -66.90718078613281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.21434569358825684, + "rewards_train/margins": 1.379673719406128, + "rewards_train/rejected": -1.165328025817871, + "step": 532 + }, + { + "epoch": 0.71, + "logps_train/chosen": -55.165855407714844, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -58.01957702636719, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.24630485475063324, + "rewards_train/margins": 1.100215658545494, + "rewards_train/rejected": -0.8539108037948608, + "step": 533 + }, + { + "epoch": 0.71, + "learning_rate": 4.781546943291387e-07, + "loss": 0.454, + "step": 534 + }, + { + "epoch": 0.71, + "logps_train/chosen": -81.78358459472656, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -103.94087219238281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.08460818231105804, + "rewards_train/margins": 1.2782291322946548, + "rewards_train/rejected": -1.362837314605713, + "step": 534 + }, + { + "epoch": 0.71, + "logps_train/chosen": -58.39218521118164, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -62.64170455932617, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4240626096725464, + "rewards_train/margins": 1.6991705894470215, + "rewards_train/rejected": -1.275107979774475, + "step": 535 + }, + { + "epoch": 0.71, + "learning_rate": 4.779293755820712e-07, + "loss": 0.4349, + "step": 536 + }, + { + "epoch": 0.71, + "logps_train/chosen": -61.329803466796875, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -90.47566223144531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.02204272150993347, + "rewards_train/margins": 1.566148191690445, + "rewards_train/rejected": -1.5881909132003784, + "step": 536 + }, + { + "epoch": 0.71, + "logps_train/chosen": -80.10034942626953, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -81.34458923339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.47277745604515076, + "rewards_train/margins": 1.5916114747524261, + "rewards_train/rejected": -1.1188340187072754, + "step": 537 + }, + { + "epoch": 0.71, + "learning_rate": 4.777029544345543e-07, + "loss": 0.302, + "step": 538 + }, + { + "epoch": 0.71, + "logps_train/chosen": -62.506080627441406, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -94.4066390991211, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.8689229488372803, + "rewards_train/margins": 2.4548988342285156, + "rewards_train/rejected": -1.5859758853912354, + "step": 538 + }, + { + "epoch": 0.72, + "logps_train/chosen": -53.01947021484375, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -61.01969909667969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.0628846287727356, + "rewards_train/margins": 1.023460328578949, + "rewards_train/rejected": -1.0863449573516846, + "step": 539 + }, + { + "epoch": 0.72, + "learning_rate": 4.774754319816936e-07, + "loss": 0.3846, + "step": 540 + }, + { + "epoch": 0.72, + "logps_train/chosen": -86.12532043457031, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -99.66450500488281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5593426823616028, + "rewards_train/margins": 2.1148563027381897, + "rewards_train/rejected": -1.555513620376587, + "step": 540 + }, + { + "epoch": 0.72, + "logps_train/chosen": -89.7861328125, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -121.96415710449219, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.15673795342445374, + "rewards_train/margins": 1.1709276139736176, + "rewards_train/rejected": -1.3276655673980713, + "step": 541 + }, + { + "epoch": 0.72, + "learning_rate": 4.772468093239214e-07, + "loss": 0.3335, + "step": 542 + }, + { + "epoch": 0.72, + "logps_train/chosen": -65.43281555175781, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -88.52119445800781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8801558017730713, + "rewards_train/margins": 2.388525128364563, + "rewards_train/rejected": -1.5083693265914917, + "step": 542 + }, + { + "epoch": 0.72, + "logps_train/chosen": -100.86902618408203, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -112.65705871582031, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.17127755284309387, + "rewards_train/margins": 0.7600526511669159, + "rewards_train/rejected": -0.9313302040100098, + "step": 543 + }, + { + "epoch": 0.72, + "learning_rate": 4.770170875669915e-07, + "loss": 0.4514, + "step": 544 + }, + { + "epoch": 0.72, + "logps_train/chosen": -75.40251159667969, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -103.71463775634766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.41287341713905334, + "rewards_train/margins": 2.479649692773819, + "rewards_train/rejected": -2.0667762756347656, + "step": 544 + }, + { + "epoch": 0.72, + "logps_train/chosen": -78.18917846679688, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -67.2279281616211, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.4767308235168457, + "rewards_train/margins": 0.06949913501739502, + "rewards_train/rejected": -0.5462299585342407, + "step": 545 + }, + { + "epoch": 0.73, + "learning_rate": 4.767862678219731e-07, + "loss": 0.5378, + "step": 546 + }, + { + "epoch": 0.73, + "logps_train/chosen": -86.35397338867188, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -106.66950988769531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.055226948112249374, + "rewards_train/margins": 2.0128040574491024, + "rewards_train/rejected": -1.957577109336853, + "step": 546 + }, + { + "epoch": 0.73, + "logps_train/chosen": -34.413246154785156, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -50.791778564453125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.25437861680984497, + "rewards_train/margins": 0.895861029624939, + "rewards_train/rejected": -0.641482412815094, + "step": 547 + }, + { + "epoch": 0.73, + "learning_rate": 4.765543512052463e-07, + "loss": 0.4256, + "step": 548 + }, + { + "epoch": 0.73, + "logps_train/chosen": -64.68608093261719, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -61.228912353515625, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -0.7217331528663635, + "rewards_train/margins": -0.45431074500083923, + "rewards_train/rejected": -0.2674224078655243, + "step": 548 + }, + { + "epoch": 0.73, + "logps_train/chosen": -86.13417053222656, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -89.27899169921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2092394232749939, + "rewards_train/margins": 1.8133100867271423, + "rewards_train/rejected": -1.6040706634521484, + "step": 549 + }, + { + "epoch": 0.73, + "learning_rate": 4.7632133883849623e-07, + "loss": 0.699, + "step": 550 + }, + { + "epoch": 0.73, + "logps_train/chosen": -77.55008697509766, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -79.2755126953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.18420809507369995, + "rewards_train/margins": 0.9898279309272766, + "rewards_train/rejected": -1.1740360260009766, + "step": 550 + }, + { + "epoch": 0.73, + "logps_train/chosen": -53.07927703857422, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -74.06660461425781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2993340790271759, + "rewards_train/margins": 1.4542015492916107, + "rewards_train/rejected": -1.7535356283187866, + "step": 551 + }, + { + "epoch": 0.73, + "learning_rate": 4.7608723184870757e-07, + "loss": 0.4229, + "step": 552 + }, + { + "epoch": 0.73, + "logps_train/chosen": -63.72083282470703, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -93.95362854003906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.35447901487350464, + "rewards_train/margins": 2.702966630458832, + "rewards_train/rejected": -2.348487615585327, + "step": 552 + }, + { + "epoch": 0.73, + "logps_train/chosen": -63.52777862548828, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -75.32018280029297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.13168418407440186, + "rewards_train/margins": 1.4268964529037476, + "rewards_train/rejected": -1.5585806369781494, + "step": 553 + }, + { + "epoch": 0.74, + "learning_rate": 4.7585203136815945e-07, + "loss": 0.2584, + "step": 554 + }, + { + "epoch": 0.74, + "logps_train/chosen": -75.23867797851562, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -94.06930541992188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15738201141357422, + "rewards_train/margins": 2.759624719619751, + "rewards_train/rejected": -2.6022427082061768, + "step": 554 + }, + { + "epoch": 0.74, + "logps_train/chosen": -57.15555191040039, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -84.36032104492188, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.6344447731971741, + "rewards_train/margins": 1.9454770684242249, + "rewards_train/rejected": -1.3110322952270508, + "step": 555 + }, + { + "epoch": 0.74, + "learning_rate": 4.756157385344195e-07, + "loss": 0.2589, + "step": 556 + }, + { + "epoch": 0.74, + "logps_train/chosen": -50.42674255371094, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -39.773258209228516, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.03486183285713196, + "rewards_train/margins": 0.3577960431575775, + "rewards_train/rejected": -0.3926578760147095, + "step": 556 + }, + { + "epoch": 0.74, + "logps_train/chosen": -44.924530029296875, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -66.36564636230469, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.443484902381897, + "rewards_train/margins": 1.3737996220588684, + "rewards_train/rejected": -0.9303147196769714, + "step": 557 + }, + { + "epoch": 0.74, + "learning_rate": 4.75378354490339e-07, + "loss": 0.4999, + "step": 558 + }, + { + "epoch": 0.74, + "logps_train/chosen": -26.142358779907227, + "logps_train/ref_chosen": -30.5, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -44.81050109863281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.43498295545578003, + "rewards_train/margins": 1.6269704699516296, + "rewards_train/rejected": -1.1919875144958496, + "step": 558 + }, + { + "epoch": 0.74, + "logps_train/chosen": -38.286808013916016, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -56.23208236694336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.034930914640426636, + "rewards_train/margins": 1.289449006319046, + "rewards_train/rejected": -1.3243799209594727, + "step": 559 + }, + { + "epoch": 0.74, + "learning_rate": 4.7513988038404653e-07, + "loss": 0.3509, + "step": 560 + }, + { + "epoch": 0.74, + "logps_train/chosen": -48.065032958984375, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -52.52865982055664, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2533780634403229, + "rewards_train/margins": 0.9471441209316254, + "rewards_train/rejected": -1.2005221843719482, + "step": 560 + }, + { + "epoch": 0.75, + "logps_train/chosen": -51.579593658447266, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -62.28582000732422, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2532718777656555, + "rewards_train/margins": 0.3842945694923401, + "rewards_train/rejected": -0.6375664472579956, + "step": 561 + }, + { + "epoch": 0.75, + "learning_rate": 4.749003173689432e-07, + "loss": 0.5397, + "step": 562 + }, + { + "epoch": 0.75, + "logps_train/chosen": -72.37049102783203, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -105.22444152832031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.19107604026794434, + "rewards_train/margins": 2.619770050048828, + "rewards_train/rejected": -2.428694009780884, + "step": 562 + }, + { + "epoch": 0.75, + "logps_train/chosen": -88.8067855834961, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -90.50834655761719, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4817650318145752, + "rewards_train/margins": 0.8550066947937012, + "rewards_train/rejected": -1.3367717266082764, + "step": 563 + }, + { + "epoch": 0.75, + "learning_rate": 4.746596666036964e-07, + "loss": 0.3867, + "step": 564 + }, + { + "epoch": 0.75, + "logps_train/chosen": -92.42796325683594, + "logps_train/ref_chosen": -97.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -92.03213500976562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5228289365768433, + "rewards_train/margins": 1.8104172945022583, + "rewards_train/rejected": -1.287588357925415, + "step": 564 + }, + { + "epoch": 0.75, + "logps_train/chosen": -65.4759292602539, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -69.63304138183594, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.053969502449035645, + "rewards_train/margins": 0.7282111644744873, + "rewards_train/rejected": -0.6742416620254517, + "step": 565 + }, + { + "epoch": 0.75, + "learning_rate": 4.744179292522349e-07, + "loss": 0.4414, + "step": 566 + }, + { + "epoch": 0.75, + "logps_train/chosen": -38.7037239074707, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -64.80675506591797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.059435099363327026, + "rewards_train/margins": 1.1368655264377594, + "rewards_train/rejected": -1.1963006258010864, + "step": 566 + }, + { + "epoch": 0.75, + "logps_train/chosen": -57.235206604003906, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -77.59497833251953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2553855776786804, + "rewards_train/margins": 1.805508315563202, + "rewards_train/rejected": -1.5501227378845215, + "step": 567 + }, + { + "epoch": 0.75, + "learning_rate": 4.741751064837426e-07, + "loss": 0.3489, + "step": 568 + }, + { + "epoch": 0.75, + "logps_train/chosen": -79.77310180664062, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -95.32447814941406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09768963605165482, + "rewards_train/margins": 1.492636926472187, + "rewards_train/rejected": -1.3949472904205322, + "step": 568 + }, + { + "epoch": 0.76, + "logps_train/chosen": -77.84187316894531, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -102.27454376220703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22518739104270935, + "rewards_train/margins": 1.972954124212265, + "rewards_train/rejected": -1.7477667331695557, + "step": 569 + }, + { + "epoch": 0.76, + "learning_rate": 4.7393119947265303e-07, + "loss": 0.3305, + "step": 570 + }, + { + "epoch": 0.76, + "logps_train/chosen": -51.39000701904297, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -68.27716827392578, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3172490894794464, + "rewards_train/margins": 1.2996536791324615, + "rewards_train/rejected": -0.9824045896530151, + "step": 570 + }, + { + "epoch": 0.76, + "logps_train/chosen": -55.66794967651367, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -77.81556701660156, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3058575987815857, + "rewards_train/margins": 1.2678866982460022, + "rewards_train/rejected": -1.573744297027588, + "step": 571 + }, + { + "epoch": 0.76, + "learning_rate": 4.7368620939864395e-07, + "loss": 0.49, + "step": 572 + }, + { + "epoch": 0.76, + "logps_train/chosen": -55.084468841552734, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -67.74577331542969, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.1571781039237976, + "rewards_train/margins": 1.2743340134620667, + "rewards_train/rejected": -1.117155909538269, + "step": 572 + }, + { + "epoch": 0.76, + "logps_train/chosen": -38.189327239990234, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -46.556495666503906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.5357547998428345, + "rewards_train/margins": 1.4703106880187988, + "rewards_train/rejected": -0.9345558881759644, + "step": 573 + }, + { + "epoch": 0.76, + "learning_rate": 4.7344013744663137e-07, + "loss": 0.3834, + "step": 574 + }, + { + "epoch": 0.76, + "logps_train/chosen": -79.90370178222656, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -95.92884826660156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5768181085586548, + "rewards_train/margins": 2.593140721321106, + "rewards_train/rejected": -2.016322612762451, + "step": 574 + }, + { + "epoch": 0.76, + "logps_train/chosen": -68.4347152709961, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -73.60333251953125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22115738689899445, + "rewards_train/margins": 1.2821438759565353, + "rewards_train/rejected": -1.5033012628555298, + "step": 575 + }, + { + "epoch": 0.76, + "learning_rate": 4.7319298480676393e-07, + "loss": 0.3204, + "step": 576 + }, + { + "epoch": 0.76, + "logps_train/chosen": -73.40341186523438, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -71.17469787597656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4700283408164978, + "rewards_train/margins": 0.9177539944648743, + "rewards_train/rejected": -1.387782335281372, + "step": 576 + }, + { + "epoch": 0.77, + "logps_train/chosen": -59.3175163269043, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -69.69569396972656, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.93175208568573, + "rewards_train/margins": 0.5768803358078003, + "rewards_train/rejected": -1.5086324214935303, + "step": 577 + }, + { + "epoch": 0.77, + "learning_rate": 4.7294475267441703e-07, + "loss": 0.5989, + "step": 578 + }, + { + "epoch": 0.77, + "logps_train/chosen": -41.835750579833984, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -58.98059844970703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3351374864578247, + "rewards_train/margins": 1.003546953201294, + "rewards_train/rejected": -1.3386844396591187, + "step": 578 + }, + { + "epoch": 0.77, + "logps_train/chosen": -49.31288146972656, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -74.24522399902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19175879657268524, + "rewards_train/margins": 1.8944060057401657, + "rewards_train/rejected": -1.7026472091674805, + "step": 579 + }, + { + "epoch": 0.77, + "learning_rate": 4.726954422501873e-07, + "loss": 0.3095, + "step": 580 + }, + { + "epoch": 0.77, + "logps_train/chosen": -51.600826263427734, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -62.637237548828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.41437941789627075, + "rewards_train/margins": 0.5985633730888367, + "rewards_train/rejected": -1.0129427909851074, + "step": 580 + }, + { + "epoch": 0.77, + "logps_train/chosen": -57.84700393676758, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -108.85273742675781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.551237165927887, + "rewards_train/margins": 2.521667182445526, + "rewards_train/rejected": -1.9704300165176392, + "step": 581 + }, + { + "epoch": 0.77, + "learning_rate": 4.724450547398864e-07, + "loss": 0.4508, + "step": 582 + }, + { + "epoch": 0.77, + "logps_train/chosen": -63.33903503417969, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -76.01646423339844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.16202855110168457, + "rewards_train/margins": 1.342742681503296, + "rewards_train/rejected": -1.5047712326049805, + "step": 582 + }, + { + "epoch": 0.77, + "logps_train/chosen": -68.54399108886719, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -75.78659057617188, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.29502445459365845, + "rewards_train/margins": 0.4758225083351135, + "rewards_train/rejected": -0.770846962928772, + "step": 583 + }, + { + "epoch": 0.78, + "learning_rate": 4.7219359135453554e-07, + "loss": 0.4989, + "step": 584 + }, + { + "epoch": 0.78, + "logps_train/chosen": -44.65009307861328, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -70.82889556884766, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4951471984386444, + "rewards_train/margins": 1.8256931006908417, + "rewards_train/rejected": -1.3305459022521973, + "step": 584 + }, + { + "epoch": 0.78, + "logps_train/chosen": -62.56684112548828, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -63.533782958984375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.04487829655408859, + "rewards_train/margins": 0.42325635999441147, + "rewards_train/rejected": -0.3783780634403229, + "step": 585 + }, + { + "epoch": 0.78, + "learning_rate": 4.719410533103595e-07, + "loss": 0.5448, + "step": 586 + }, + { + "epoch": 0.78, + "logps_train/chosen": -66.92024230957031, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -98.66958618164062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.49235033988952637, + "rewards_train/margins": 2.4733721017837524, + "rewards_train/rejected": -1.981021761894226, + "step": 586 + }, + { + "epoch": 0.78, + "logps_train/chosen": -56.734954833984375, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -77.53395080566406, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.04212941229343414, + "rewards_train/margins": 1.367399349808693, + "rewards_train/rejected": -1.3252699375152588, + "step": 587 + }, + { + "epoch": 0.78, + "learning_rate": 4.7168744182878065e-07, + "loss": 0.3397, + "step": 588 + }, + { + "epoch": 0.78, + "logps_train/chosen": -65.14064025878906, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -89.91858673095703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19062352180480957, + "rewards_train/margins": 2.379357099533081, + "rewards_train/rejected": -2.1887335777282715, + "step": 588 + }, + { + "epoch": 0.78, + "logps_train/chosen": -67.61961364746094, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -76.6443099975586, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.039305105805397034, + "rewards_train/margins": 1.3173136562108994, + "rewards_train/rejected": -1.3566187620162964, + "step": 589 + }, + { + "epoch": 0.78, + "learning_rate": 4.7143275813641336e-07, + "loss": 0.3752, + "step": 590 + }, + { + "epoch": 0.78, + "logps_train/chosen": -54.337440490722656, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -73.7213134765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.050931595265865326, + "rewards_train/margins": 1.2258866801857948, + "rewards_train/rejected": -1.2768182754516602, + "step": 590 + }, + { + "epoch": 0.78, + "logps_train/chosen": -57.397369384765625, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -63.53017807006836, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.06416883319616318, + "rewards_train/margins": 1.4984368309378624, + "rewards_train/rejected": -1.4342679977416992, + "step": 591 + }, + { + "epoch": 0.79, + "learning_rate": 4.711770034650575e-07, + "loss": 0.3846, + "step": 592 + }, + { + "epoch": 0.79, + "logps_train/chosen": -48.90729522705078, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -61.438758850097656, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.030363978818058968, + "rewards_train/margins": 1.6310763973742723, + "rewards_train/rejected": -1.6007124185562134, + "step": 592 + }, + { + "epoch": 0.79, + "logps_train/chosen": -69.3033447265625, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -90.70845794677734, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.29466527700424194, + "rewards_train/margins": 2.1362144351005554, + "rewards_train/rejected": -1.8415491580963135, + "step": 593 + }, + { + "epoch": 0.79, + "learning_rate": 4.7092017905169315e-07, + "loss": 0.4115, + "step": 594 + }, + { + "epoch": 0.79, + "logps_train/chosen": -50.840511322021484, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -60.229576110839844, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.39420759677886963, + "rewards_train/margins": 0.7975000143051147, + "rewards_train/rejected": -1.1917076110839844, + "step": 594 + }, + { + "epoch": 0.79, + "logps_train/chosen": -75.55046081542969, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -111.72606658935547, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0347975492477417, + "rewards_train/margins": 2.419904589653015, + "rewards_train/rejected": -2.3851070404052734, + "step": 595 + }, + { + "epoch": 0.79, + "learning_rate": 4.7066228613847405e-07, + "loss": 0.6544, + "step": 596 + }, + { + "epoch": 0.79, + "logps_train/chosen": -32.979820251464844, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -44.78044891357422, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.26998698711395264, + "rewards_train/margins": 1.061313509941101, + "rewards_train/rejected": -0.7913265228271484, + "step": 596 + }, + { + "epoch": 0.79, + "logps_train/chosen": -74.37362670898438, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -80.77278137207031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04908075928688049, + "rewards_train/margins": 0.9125727117061615, + "rewards_train/rejected": -0.961653470993042, + "step": 597 + }, + { + "epoch": 0.79, + "learning_rate": 4.704033259727219e-07, + "loss": 0.4005, + "step": 598 + }, + { + "epoch": 0.79, + "logps_train/chosen": -62.520835876464844, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -93.00614166259766, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.1364586502313614, + "rewards_train/margins": 0.7469682544469833, + "rewards_train/rejected": -0.8834269046783447, + "step": 598 + }, + { + "epoch": 0.8, + "logps_train/chosen": -46.40895080566406, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -100.99507904052734, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5051011443138123, + "rewards_train/margins": 2.7905468344688416, + "rewards_train/rejected": -2.2854456901550293, + "step": 599 + }, + { + "epoch": 0.8, + "learning_rate": 4.701432998069205e-07, + "loss": 0.3991, + "step": 600 + }, + { + "epoch": 0.8, + "logps_train/chosen": -44.63404083251953, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -73.91123962402344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3678460717201233, + "rewards_train/margins": 1.933970034122467, + "rewards_train/rejected": -1.5661239624023438, + "step": 600 + }, + { + "epoch": 0.8, + "logps_train/chosen": -53.41337203979492, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -64.40904235839844, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.011649668216705322, + "rewards_train/margins": 1.3730050921440125, + "rewards_train/rejected": -1.3846547603607178, + "step": 601 + }, + { + "epoch": 0.8, + "learning_rate": 4.69882208898709e-07, + "loss": 0.4513, + "step": 602 + }, + { + "epoch": 0.8, + "logps_train/chosen": -66.7323226928711, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -77.54193115234375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.1904197335243225, + "rewards_train/margins": 1.1668007969856262, + "rewards_train/rejected": -1.3572205305099487, + "step": 602 + }, + { + "epoch": 0.8, + "logps_train/chosen": -47.96442413330078, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -86.06922912597656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.667229175567627, + "rewards_train/margins": 1.8507150411605835, + "rewards_train/rejected": -1.1834858655929565, + "step": 603 + }, + { + "epoch": 0.8, + "learning_rate": 4.696200545108767e-07, + "loss": 0.3027, + "step": 604 + }, + { + "epoch": 0.8, + "logps_train/chosen": -46.41473388671875, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -60.637451171875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.07428610324859619, + "rewards_train/margins": 0.7341855764389038, + "rewards_train/rejected": -0.8084716796875, + "step": 604 + }, + { + "epoch": 0.8, + "logps_train/chosen": -64.93851470947266, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -88.19393920898438, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.9686485528945923, + "rewards_train/margins": 2.5130425691604614, + "rewards_train/rejected": -1.5443940162658691, + "step": 605 + }, + { + "epoch": 0.8, + "learning_rate": 4.693568379113562e-07, + "loss": 0.4029, + "step": 606 + }, + { + "epoch": 0.8, + "logps_train/chosen": -87.84767150878906, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -89.39312744140625, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.05820435285568237, + "rewards_train/margins": 1.8061079382896423, + "rewards_train/rejected": -1.8643122911453247, + "step": 606 + }, + { + "epoch": 0.81, + "logps_train/chosen": -75.93603515625, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -112.70449829101562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7161625623703003, + "rewards_train/margins": 3.1756755113601685, + "rewards_train/rejected": -2.459512948989868, + "step": 607 + }, + { + "epoch": 0.81, + "learning_rate": 4.6909256037321775e-07, + "loss": 0.3524, + "step": 608 + }, + { + "epoch": 0.81, + "logps_train/chosen": -84.64514923095703, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -114.46732330322266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4057977497577667, + "rewards_train/margins": 2.3462802469730377, + "rewards_train/rejected": -1.940482497215271, + "step": 608 + }, + { + "epoch": 0.81, + "logps_train/chosen": -70.9254150390625, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -96.82177734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4920283257961273, + "rewards_train/margins": 2.728503078222275, + "rewards_train/rejected": -2.2364747524261475, + "step": 609 + }, + { + "epoch": 0.81, + "learning_rate": 4.688272231746629e-07, + "loss": 0.2993, + "step": 610 + }, + { + "epoch": 0.81, + "logps_train/chosen": -67.98867797851562, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -74.17568969726562, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.22222566604614258, + "rewards_train/margins": 0.9976067543029785, + "rewards_train/rejected": -0.7753810882568359, + "step": 610 + }, + { + "epoch": 0.81, + "logps_train/chosen": -79.0730972290039, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -142.41741943359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8614403605461121, + "rewards_train/margins": 3.7375559210777283, + "rewards_train/rejected": -2.876115560531616, + "step": 611 + }, + { + "epoch": 0.81, + "learning_rate": 4.6856082759901825e-07, + "loss": 0.3766, + "step": 612 + }, + { + "epoch": 0.81, + "logps_train/chosen": -56.294185638427734, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -74.82308959960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07761275768280029, + "rewards_train/margins": 1.2661720514297485, + "rewards_train/rejected": -1.1885592937469482, + "step": 612 + }, + { + "epoch": 0.81, + "logps_train/chosen": -62.906375885009766, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -84.80770111083984, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.5328003168106079, + "rewards_train/margins": 2.062007784843445, + "rewards_train/rejected": -1.529207468032837, + "step": 613 + }, + { + "epoch": 0.82, + "learning_rate": 4.682933749347296e-07, + "loss": 0.3217, + "step": 614 + }, + { + "epoch": 0.82, + "logps_train/chosen": -47.23320007324219, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -53.342132568359375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.014179930090904236, + "rewards_train/margins": 1.314409002661705, + "rewards_train/rejected": -1.3002290725708008, + "step": 614 + }, + { + "epoch": 0.82, + "logps_train/chosen": -47.72241973876953, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -73.51026153564453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9652582406997681, + "rewards_train/margins": 1.9389408230781555, + "rewards_train/rejected": -0.9736825823783875, + "step": 615 + }, + { + "epoch": 0.82, + "learning_rate": 4.6802486647535505e-07, + "loss": 0.4198, + "step": 616 + }, + { + "epoch": 0.82, + "logps_train/chosen": -59.407447814941406, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -84.48947143554688, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7764426469802856, + "rewards_train/margins": 2.597265601158142, + "rewards_train/rejected": -1.8208229541778564, + "step": 616 + }, + { + "epoch": 0.82, + "logps_train/chosen": -66.02926635742188, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -80.5020980834961, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3130892813205719, + "rewards_train/margins": 1.5679869949817657, + "rewards_train/rejected": -1.2548977136611938, + "step": 617 + }, + { + "epoch": 0.82, + "learning_rate": 4.677553035195594e-07, + "loss": 0.2156, + "step": 618 + }, + { + "epoch": 0.82, + "logps_train/chosen": -38.977821350097656, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -58.88930130004883, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.3470008671283722, + "rewards_train/margins": 0.7302102744579315, + "rewards_train/rejected": -1.0772111415863037, + "step": 618 + }, + { + "epoch": 0.82, + "logps_train/chosen": -51.70515060424805, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -73.06501007080078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2525317072868347, + "rewards_train/margins": 2.369188964366913, + "rewards_train/rejected": -2.116657257080078, + "step": 619 + }, + { + "epoch": 0.82, + "learning_rate": 4.6748468737110764e-07, + "loss": 0.482, + "step": 620 + }, + { + "epoch": 0.82, + "logps_train/chosen": -81.810791015625, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -103.40557861328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6767330169677734, + "rewards_train/margins": 2.3571348190307617, + "rewards_train/rejected": -1.6804018020629883, + "step": 620 + }, + { + "epoch": 0.82, + "logps_train/chosen": -61.70752716064453, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -96.13740539550781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5980952978134155, + "rewards_train/margins": 2.403046488761902, + "rewards_train/rejected": -1.8049511909484863, + "step": 621 + }, + { + "epoch": 0.83, + "learning_rate": 4.672130193388585e-07, + "loss": 0.2022, + "step": 622 + }, + { + "epoch": 0.83, + "logps_train/chosen": -62.559471130371094, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -66.36294555664062, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.03485351800918579, + "rewards_train/margins": 1.0897228121757507, + "rewards_train/rejected": -1.1245763301849365, + "step": 622 + }, + { + "epoch": 0.83, + "logps_train/chosen": -51.45391845703125, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -87.12033081054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6155455708503723, + "rewards_train/margins": 2.421328842639923, + "rewards_train/rejected": -1.8057832717895508, + "step": 623 + }, + { + "epoch": 0.83, + "learning_rate": 4.6694030073675826e-07, + "loss": 0.3773, + "step": 624 + }, + { + "epoch": 0.83, + "logps_train/chosen": -84.90255737304688, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -122.0093002319336, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1941189467906952, + "rewards_train/margins": 2.598174065351486, + "rewards_train/rejected": -2.404055118560791, + "step": 624 + }, + { + "epoch": 0.83, + "logps_train/chosen": -57.79358673095703, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -79.19303131103516, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.2270151674747467, + "rewards_train/margins": 1.1704131066799164, + "rewards_train/rejected": -1.397428274154663, + "step": 625 + }, + { + "epoch": 0.83, + "learning_rate": 4.666665328838344e-07, + "loss": 0.376, + "step": 626 + }, + { + "epoch": 0.83, + "logps_train/chosen": -38.393798828125, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -66.88373565673828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.30327117443084717, + "rewards_train/margins": 1.9635199308395386, + "rewards_train/rejected": -1.6602487564086914, + "step": 626 + }, + { + "epoch": 0.83, + "logps_train/chosen": -52.85305404663086, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -78.20223999023438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7006322145462036, + "rewards_train/margins": 2.7958565950393677, + "rewards_train/rejected": -2.095224380493164, + "step": 627 + }, + { + "epoch": 0.83, + "learning_rate": 4.663917171041893e-07, + "loss": 0.2327, + "step": 628 + }, + { + "epoch": 0.83, + "logps_train/chosen": -44.11957550048828, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -72.79949951171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.14194902777671814, + "rewards_train/margins": 1.2289305627346039, + "rewards_train/rejected": -1.0869815349578857, + "step": 628 + }, + { + "epoch": 0.84, + "logps_train/chosen": -65.86761474609375, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -74.30508422851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0835514068603516, + "rewards_train/margins": 2.748434543609619, + "rewards_train/rejected": -1.6648831367492676, + "step": 629 + }, + { + "epoch": 0.84, + "learning_rate": 4.6611585472699344e-07, + "loss": 0.2665, + "step": 630 + }, + { + "epoch": 0.84, + "logps_train/chosen": -60.44417953491211, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -85.952880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.388394296169281, + "rewards_train/margins": 2.166495144367218, + "rewards_train/rejected": -1.778100848197937, + "step": 630 + }, + { + "epoch": 0.84, + "logps_train/chosen": -61.03472137451172, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -87.6741943359375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7902776598930359, + "rewards_train/margins": 2.656915843486786, + "rewards_train/rejected": -1.86663818359375, + "step": 631 + }, + { + "epoch": 0.84, + "learning_rate": 4.658389470864796e-07, + "loss": 0.2038, + "step": 632 + }, + { + "epoch": 0.84, + "logps_train/chosen": -86.6244125366211, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -92.52574157714844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42037108540534973, + "rewards_train/margins": 2.318257600069046, + "rewards_train/rejected": -1.8978865146636963, + "step": 632 + }, + { + "epoch": 0.84, + "logps_train/chosen": -43.07469177246094, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -70.0228271484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.37612465023994446, + "rewards_train/margins": 1.9957411587238312, + "rewards_train/rejected": -1.6196165084838867, + "step": 633 + }, + { + "epoch": 0.84, + "learning_rate": 4.6556099552193583e-07, + "loss": 0.2876, + "step": 634 + }, + { + "epoch": 0.84, + "logps_train/chosen": -59.48136901855469, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -87.19058227539062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02217533439397812, + "rewards_train/margins": 1.0662338957190514, + "rewards_train/rejected": -1.0440585613250732, + "step": 634 + }, + { + "epoch": 0.84, + "logps_train/chosen": -61.226661682128906, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -87.68091583251953, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2226659059524536, + "rewards_train/margins": 1.1673005819320679, + "rewards_train/rejected": -1.3899664878845215, + "step": 635 + }, + { + "epoch": 0.84, + "learning_rate": 4.6528200137769935e-07, + "loss": 0.4699, + "step": 636 + }, + { + "epoch": 0.84, + "logps_train/chosen": -77.63619232177734, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -99.56551361083984, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.19106817245483398, + "rewards_train/margins": 1.1944947242736816, + "rewards_train/rejected": -1.0034265518188477, + "step": 636 + }, + { + "epoch": 0.85, + "logps_train/chosen": -43.92618179321289, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -71.39808654785156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5339444279670715, + "rewards_train/margins": 2.8354724049568176, + "rewards_train/rejected": -2.301527976989746, + "step": 637 + }, + { + "epoch": 0.85, + "learning_rate": 4.650019660031498e-07, + "loss": 0.3389, + "step": 638 + }, + { + "epoch": 0.85, + "logps_train/chosen": -85.78387451171875, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -122.40510559082031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4106752872467041, + "rewards_train/margins": 2.593374013900757, + "rewards_train/rejected": -2.1826987266540527, + "step": 638 + }, + { + "epoch": 0.85, + "logps_train/chosen": -78.40846252441406, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -91.34258270263672, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.2771225869655609, + "rewards_train/margins": 1.4066929519176483, + "rewards_train/rejected": -1.1295703649520874, + "step": 639 + }, + { + "epoch": 0.85, + "learning_rate": 4.6472089075270296e-07, + "loss": 0.3054, + "step": 640 + }, + { + "epoch": 0.85, + "logps_train/chosen": -67.42459106445312, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -93.22029113769531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.008517671376466751, + "rewards_train/margins": 1.6871870048344135, + "rewards_train/rejected": -1.6786693334579468, + "step": 640 + }, + { + "epoch": 0.85, + "logps_train/chosen": -56.728477478027344, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -84.64283752441406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12909775972366333, + "rewards_train/margins": 1.3039358258247375, + "rewards_train/rejected": -1.4330335855484009, + "step": 641 + }, + { + "epoch": 0.85, + "learning_rate": 4.6443877698580373e-07, + "loss": 0.3351, + "step": 642 + }, + { + "epoch": 0.85, + "logps_train/chosen": -45.55366897583008, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -77.58822631835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6110394597053528, + "rewards_train/margins": 2.5198622345924377, + "rewards_train/rejected": -1.908822774887085, + "step": 642 + }, + { + "epoch": 0.85, + "logps_train/chosen": -50.26631164550781, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -71.51134490966797, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.48274391889572144, + "rewards_train/margins": 1.5401285290718079, + "rewards_train/rejected": -1.0573846101760864, + "step": 643 + }, + { + "epoch": 0.86, + "learning_rate": 4.641556260669204e-07, + "loss": 0.2628, + "step": 644 + }, + { + "epoch": 0.86, + "logps_train/chosen": -59.88041687011719, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -79.37681579589844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.32445842027664185, + "rewards_train/margins": 1.2433894276618958, + "rewards_train/rejected": -0.9189310073852539, + "step": 644 + }, + { + "epoch": 0.86, + "logps_train/chosen": -58.12947463989258, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -87.97662353515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.36439618468284607, + "rewards_train/margins": 1.1671367585659027, + "rewards_train/rejected": -0.8027405738830566, + "step": 645 + }, + { + "epoch": 0.86, + "learning_rate": 4.638714393655372e-07, + "loss": 0.4209, + "step": 646 + }, + { + "epoch": 0.86, + "logps_train/chosen": -46.86067199707031, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -72.31005859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3084641098976135, + "rewards_train/margins": 2.6027517914772034, + "rewards_train/rejected": -2.29428768157959, + "step": 646 + }, + { + "epoch": 0.86, + "logps_train/chosen": -63.42823028564453, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -72.93348693847656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3899889588356018, + "rewards_train/margins": 1.8958377242088318, + "rewards_train/rejected": -1.50584876537323, + "step": 647 + }, + { + "epoch": 0.86, + "learning_rate": 4.63586218256148e-07, + "loss": 0.2343, + "step": 648 + }, + { + "epoch": 0.86, + "logps_train/chosen": -75.07272338867188, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -90.54558563232422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 1.2747584581375122, + "rewards_train/margins": 3.251191735267639, + "rewards_train/rejected": -1.976433277130127, + "step": 648 + }, + { + "epoch": 0.86, + "logps_train/chosen": -36.89295196533203, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -72.14701080322266, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.5247675776481628, + "rewards_train/margins": 1.6832186579704285, + "rewards_train/rejected": -1.1584510803222656, + "step": 649 + }, + { + "epoch": 0.86, + "learning_rate": 4.6329996411824967e-07, + "loss": 0.34, + "step": 650 + }, + { + "epoch": 0.86, + "logps_train/chosen": -77.40049743652344, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -90.66729736328125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.46698179841041565, + "rewards_train/margins": 2.3087117969989777, + "rewards_train/rejected": -1.841729998588562, + "step": 650 + }, + { + "epoch": 0.86, + "logps_train/chosen": -44.888511657714844, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -72.77568054199219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7658365964889526, + "rewards_train/margins": 2.4144986867904663, + "rewards_train/rejected": -1.6486620903015137, + "step": 651 + }, + { + "epoch": 0.87, + "learning_rate": 4.630126783363357e-07, + "loss": 0.3182, + "step": 652 + }, + { + "epoch": 0.87, + "logps_train/chosen": -58.66350555419922, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -92.73113250732422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.03666294366121292, + "rewards_train/margins": 2.4958256408572197, + "rewards_train/rejected": -2.5324885845184326, + "step": 652 + }, + { + "epoch": 0.87, + "logps_train/chosen": -50.652225494384766, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -84.22692108154297, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.7629026174545288, + "rewards_train/margins": 2.276610255241394, + "rewards_train/rejected": -1.5137076377868652, + "step": 653 + }, + { + "epoch": 0.87, + "learning_rate": 4.62724362299889e-07, + "loss": 0.2203, + "step": 654 + }, + { + "epoch": 0.87, + "logps_train/chosen": -51.455596923828125, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -74.4891586303711, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3560027480125427, + "rewards_train/margins": 1.6361684203147888, + "rewards_train/rejected": -1.280165672302246, + "step": 654 + }, + { + "epoch": 0.87, + "logps_train/chosen": -58.885311126708984, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -83.89421081542969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.47709405422210693, + "rewards_train/margins": 2.321202278137207, + "rewards_train/rejected": -1.8441082239151, + "step": 655 + }, + { + "epoch": 0.87, + "learning_rate": 4.6243501740337533e-07, + "loss": 0.3013, + "step": 656 + }, + { + "epoch": 0.87, + "logps_train/chosen": -76.16789245605469, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -95.82583618164062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5207105278968811, + "rewards_train/margins": 3.203294575214386, + "rewards_train/rejected": -2.682584047317505, + "step": 656 + }, + { + "epoch": 0.87, + "logps_train/chosen": -38.421749114990234, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -38.714141845703125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.7054815292358398, + "rewards_train/margins": 1.1366615295410156, + "rewards_train/rejected": -0.4311800003051758, + "step": 657 + }, + { + "epoch": 0.87, + "learning_rate": 4.621446450462366e-07, + "loss": 0.3042, + "step": 658 + }, + { + "epoch": 0.87, + "logps_train/chosen": -49.40616226196289, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -68.30242919921875, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.3336026072502136, + "rewards_train/margins": 2.0818141102790833, + "rewards_train/rejected": -1.7482115030288696, + "step": 658 + }, + { + "epoch": 0.88, + "logps_train/chosen": -92.52722930908203, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -123.93988037109375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.18709762394428253, + "rewards_train/margins": 2.013141080737114, + "rewards_train/rejected": -2.2002387046813965, + "step": 659 + }, + { + "epoch": 0.88, + "learning_rate": 4.618532466328845e-07, + "loss": 0.3679, + "step": 660 + }, + { + "epoch": 0.88, + "logps_train/chosen": -59.85950469970703, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -91.0672378540039, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.21600297093391418, + "rewards_train/margins": 2.184445410966873, + "rewards_train/rejected": -1.968442440032959, + "step": 660 + }, + { + "epoch": 0.88, + "logps_train/chosen": -74.84912872314453, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -120.09780883789062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2744620740413666, + "rewards_train/margins": 2.246742933988571, + "rewards_train/rejected": -1.9722808599472046, + "step": 661 + }, + { + "epoch": 0.88, + "learning_rate": 4.6156082357269277e-07, + "loss": 0.2698, + "step": 662 + }, + { + "epoch": 0.88, + "logps_train/chosen": -42.33638381958008, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -57.59506607055664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28979912400245667, + "rewards_train/margins": 1.525086909532547, + "rewards_train/rejected": -1.2352877855300903, + "step": 662 + }, + { + "epoch": 0.88, + "logps_train/chosen": -70.05482482910156, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -84.63400268554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10701710730791092, + "rewards_train/margins": 1.741121120750904, + "rewards_train/rejected": -1.6341040134429932, + "step": 663 + }, + { + "epoch": 0.88, + "learning_rate": 4.612673772799914e-07, + "loss": 0.2698, + "step": 664 + }, + { + "epoch": 0.88, + "logps_train/chosen": -50.864524841308594, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -77.48883056640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.21901603043079376, + "rewards_train/margins": 2.4303988963365555, + "rewards_train/rejected": -2.2113828659057617, + "step": 664 + }, + { + "epoch": 0.88, + "logps_train/chosen": -36.89140319824219, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -79.10737609863281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.26758599281311035, + "rewards_train/margins": 2.490823984146118, + "rewards_train/rejected": -2.223237991333008, + "step": 665 + }, + { + "epoch": 0.88, + "learning_rate": 4.609729091740592e-07, + "loss": 0.198, + "step": 666 + }, + { + "epoch": 0.88, + "logps_train/chosen": -42.206085205078125, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -56.44530487060547, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.516891598701477, + "rewards_train/margins": 1.7512658834457397, + "rewards_train/rejected": -1.2343742847442627, + "step": 666 + }, + { + "epoch": 0.89, + "logps_train/chosen": -55.27179718017578, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -100.38606262207031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.6599295139312744, + "rewards_train/margins": 3.447364091873169, + "rewards_train/rejected": -2.7874345779418945, + "step": 667 + }, + { + "epoch": 0.89, + "learning_rate": 4.6067742067911685e-07, + "loss": 0.2073, + "step": 668 + }, + { + "epoch": 0.89, + "logps_train/chosen": -53.89271545410156, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -73.7257080078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.23260357975959778, + "rewards_train/margins": 1.7473616898059845, + "rewards_train/rejected": -1.5147581100463867, + "step": 668 + }, + { + "epoch": 0.89, + "logps_train/chosen": -70.37216186523438, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -100.627685546875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2971585988998413, + "rewards_train/margins": 2.5364898443222046, + "rewards_train/rejected": -2.2393312454223633, + "step": 669 + }, + { + "epoch": 0.89, + "learning_rate": 4.603809132243205e-07, + "loss": 0.3615, + "step": 670 + }, + { + "epoch": 0.89, + "logps_train/chosen": -59.09383773803711, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -85.25665283203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.41329044103622437, + "rewards_train/margins": 1.0139370560646057, + "rewards_train/rejected": -1.42722749710083, + "step": 670 + }, + { + "epoch": 0.89, + "logps_train/chosen": -71.18547821044922, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -96.42425537109375, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.5794854760169983, + "rewards_train/margins": 1.5285653471946716, + "rewards_train/rejected": -2.10805082321167, + "step": 671 + }, + { + "epoch": 0.89, + "learning_rate": 4.6008338824375457e-07, + "loss": 0.4862, + "step": 672 + }, + { + "epoch": 0.89, + "logps_train/chosen": -102.10392761230469, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -121.85100555419922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2630443572998047, + "rewards_train/margins": 2.5442392826080322, + "rewards_train/rejected": -2.2811949253082275, + "step": 672 + }, + { + "epoch": 0.89, + "logps_train/chosen": -80.49946594238281, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -101.16230773925781, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.3679158091545105, + "rewards_train/margins": 1.3440185189247131, + "rewards_train/rejected": -1.7119343280792236, + "step": 673 + }, + { + "epoch": 0.9, + "learning_rate": 4.597848471764248e-07, + "loss": 0.3464, + "step": 674 + }, + { + "epoch": 0.9, + "logps_train/chosen": -40.62296676635742, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -72.34452819824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7470782995223999, + "rewards_train/margins": 2.914734721183777, + "rewards_train/rejected": -2.167656421661377, + "step": 674 + }, + { + "epoch": 0.9, + "logps_train/chosen": -86.5342788696289, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -104.23762512207031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08875927329063416, + "rewards_train/margins": 2.117208808660507, + "rewards_train/rejected": -2.028449535369873, + "step": 675 + }, + { + "epoch": 0.9, + "learning_rate": 4.5948529146625115e-07, + "loss": 0.2071, + "step": 676 + }, + { + "epoch": 0.9, + "logps_train/chosen": -37.710025787353516, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -63.450538635253906, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.24678394198417664, + "rewards_train/margins": 1.2150667011737823, + "rewards_train/rejected": -1.461850643157959, + "step": 676 + }, + { + "epoch": 0.9, + "logps_train/chosen": -90.51107025146484, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -127.83465576171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5727205276489258, + "rewards_train/margins": 2.931185722351074, + "rewards_train/rejected": -2.3584651947021484, + "step": 677 + }, + { + "epoch": 0.9, + "learning_rate": 4.591847225620612e-07, + "loss": 0.3685, + "step": 678 + }, + { + "epoch": 0.9, + "logps_train/chosen": -82.14286804199219, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -96.72637176513672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5700880289077759, + "rewards_train/margins": 3.014600396156311, + "rewards_train/rejected": -2.444512367248535, + "step": 678 + }, + { + "epoch": 0.9, + "logps_train/chosen": -70.38860321044922, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -82.55715942382812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08651609718799591, + "rewards_train/margins": 1.546543762087822, + "rewards_train/rejected": -1.6330598592758179, + "step": 679 + }, + { + "epoch": 0.9, + "learning_rate": 4.588831419175828e-07, + "loss": 0.2551, + "step": 680 + }, + { + "epoch": 0.9, + "logps_train/chosen": -55.20639419555664, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -96.28793334960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.40757089853286743, + "rewards_train/margins": 2.2351924777030945, + "rewards_train/rejected": -1.827621579170227, + "step": 680 + }, + { + "epoch": 0.9, + "logps_train/chosen": -47.29354476928711, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -74.71430969238281, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.369083046913147, + "rewards_train/margins": 2.21238911151886, + "rewards_train/rejected": -1.843306064605713, + "step": 681 + }, + { + "epoch": 0.91, + "learning_rate": 4.585805509914374e-07, + "loss": 0.3413, + "step": 682 + }, + { + "epoch": 0.91, + "logps_train/chosen": -66.68551635742188, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -107.8899917602539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3392612040042877, + "rewards_train/margins": 3.1876353323459625, + "rewards_train/rejected": -2.848374128341675, + "step": 682 + }, + { + "epoch": 0.91, + "logps_train/chosen": -57.441612243652344, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -94.74931335449219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.30744242668151855, + "rewards_train/margins": 2.2448320388793945, + "rewards_train/rejected": -2.552274465560913, + "step": 683 + }, + { + "epoch": 0.91, + "learning_rate": 4.582769512471324e-07, + "loss": 0.1799, + "step": 684 + }, + { + "epoch": 0.91, + "logps_train/chosen": -54.16114044189453, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -78.58146667480469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.49404197931289673, + "rewards_train/margins": 2.8709381222724915, + "rewards_train/rejected": -2.3768961429595947, + "step": 684 + }, + { + "epoch": 0.91, + "logps_train/chosen": -77.51919555664062, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -68.96359252929688, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": -0.3886374831199646, + "rewards_train/margins": 0.7249093651771545, + "rewards_train/rejected": -1.1135468482971191, + "step": 685 + }, + { + "epoch": 0.91, + "learning_rate": 4.579723441530547e-07, + "loss": 0.4346, + "step": 686 + }, + { + "epoch": 0.91, + "logps_train/chosen": -34.131839752197266, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -66.95238494873047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.48525357246398926, + "rewards_train/margins": 2.4324450492858887, + "rewards_train/rejected": -1.9471914768218994, + "step": 686 + }, + { + "epoch": 0.91, + "logps_train/chosen": -38.947418212890625, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -59.30021286010742, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09119536727666855, + "rewards_train/margins": 2.0321542248129845, + "rewards_train/rejected": -1.940958857536316, + "step": 687 + }, + { + "epoch": 0.91, + "learning_rate": 4.5766673118246333e-07, + "loss": 0.2346, + "step": 688 + }, + { + "epoch": 0.91, + "logps_train/chosen": -45.25869369506836, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -51.026123046875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.7766506671905518, + "rewards_train/margins": 0.6806492805480957, + "rewards_train/rejected": -1.4572999477386475, + "step": 688 + }, + { + "epoch": 0.92, + "logps_train/chosen": -78.32164001464844, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -148.12022399902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08528921008110046, + "rewards_train/margins": 3.92985799908638, + "rewards_train/rejected": -4.0151472091674805, + "step": 689 + }, + { + "epoch": 0.92, + "learning_rate": 4.573601138134823e-07, + "loss": 0.2725, + "step": 690 + }, + { + "epoch": 0.92, + "logps_train/chosen": -83.4992904663086, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -100.03118133544922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.10149146616458893, + "rewards_train/margins": 2.1328768581151962, + "rewards_train/rejected": -2.234368324279785, + "step": 690 + }, + { + "epoch": 0.92, + "logps_train/chosen": -74.4854736328125, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -93.02217102050781, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.36729687452316284, + "rewards_train/margins": 1.2177329659461975, + "rewards_train/rejected": -1.5850298404693604, + "step": 691 + }, + { + "epoch": 0.92, + "learning_rate": 4.570524935290934e-07, + "loss": 0.4091, + "step": 692 + }, + { + "epoch": 0.92, + "logps_train/chosen": -34.78716278076172, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -79.95111083984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10839307308197021, + "rewards_train/margins": 2.5113171339035034, + "rewards_train/rejected": -2.402924060821533, + "step": 692 + }, + { + "epoch": 0.92, + "logps_train/chosen": -74.54714965820312, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -63.44025421142578, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.4172154664993286, + "rewards_train/margins": 1.0455600023269653, + "rewards_train/rejected": -1.462775468826294, + "step": 693 + }, + { + "epoch": 0.92, + "learning_rate": 4.5674387181712904e-07, + "loss": 0.3039, + "step": 694 + }, + { + "epoch": 0.92, + "logps_train/chosen": -49.48997116088867, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -73.1671142578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45295602083206177, + "rewards_train/margins": 2.5520885586738586, + "rewards_train/rejected": -2.099132537841797, + "step": 694 + }, + { + "epoch": 0.92, + "logps_train/chosen": -84.90861511230469, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -119.9660415649414, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5013256072998047, + "rewards_train/margins": 3.685429811477661, + "rewards_train/rejected": -3.1841042041778564, + "step": 695 + }, + { + "epoch": 0.92, + "learning_rate": 4.564342501702654e-07, + "loss": 0.1737, + "step": 696 + }, + { + "epoch": 0.92, + "logps_train/chosen": -56.27851104736328, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -81.65951538085938, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.11535114049911499, + "rewards_train/margins": 1.8834131360054016, + "rewards_train/rejected": -1.9987642765045166, + "step": 696 + }, + { + "epoch": 0.93, + "logps_train/chosen": -61.96138000488281, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -109.97370147705078, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.052387937903404236, + "rewards_train/margins": 2.344982400536537, + "rewards_train/rejected": -2.3973703384399414, + "step": 697 + }, + { + "epoch": 0.93, + "learning_rate": 4.5612363008601474e-07, + "loss": 0.3469, + "step": 698 + }, + { + "epoch": 0.93, + "logps_train/chosen": -42.722694396972656, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -111.35914611816406, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.22616808116436005, + "rewards_train/margins": 3.968332216143608, + "rewards_train/rejected": -3.742164134979248, + "step": 698 + }, + { + "epoch": 0.93, + "logps_train/chosen": -50.69742202758789, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -53.74729537963867, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.053238146007061005, + "rewards_train/margins": 1.148834951221943, + "rewards_train/rejected": -1.202073097229004, + "step": 699 + }, + { + "epoch": 0.93, + "learning_rate": 4.5581201306671835e-07, + "loss": 0.2556, + "step": 700 + }, + { + "epoch": 0.93, + "logps_train/chosen": -80.26610565185547, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -100.37316131591797, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.9812982082366943, + "rewards_train/margins": 0.9841432571411133, + "rewards_train/rejected": -1.9654414653778076, + "step": 700 + }, + { + "epoch": 0.93, + "logps_train/chosen": -46.84473419189453, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -67.64947509765625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.14365153014659882, + "rewards_train/margins": 2.397661730647087, + "rewards_train/rejected": -2.2540102005004883, + "step": 701 + }, + { + "epoch": 0.93, + "learning_rate": 4.5549940061953934e-07, + "loss": 0.4381, + "step": 702 + }, + { + "epoch": 0.93, + "logps_train/chosen": -76.68373107910156, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -106.86540985107422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.018372446298599243, + "rewards_train/margins": 2.3791061341762543, + "rewards_train/rejected": -2.3974785804748535, + "step": 702 + }, + { + "epoch": 0.93, + "logps_train/chosen": -92.51097869873047, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -120.3010482788086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.14968341588974, + "rewards_train/margins": 3.161038815975189, + "rewards_train/rejected": -3.011355400085449, + "step": 703 + }, + { + "epoch": 0.93, + "learning_rate": 4.551857942564553e-07, + "loss": 0.3566, + "step": 704 + }, + { + "epoch": 0.93, + "logps_train/chosen": -64.683837890625, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -74.9739990234375, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.09763213992118835, + "rewards_train/margins": 1.050109714269638, + "rewards_train/rejected": -0.9524775743484497, + "step": 704 + }, + { + "epoch": 0.94, + "logps_train/chosen": -73.48666381835938, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -98.82264709472656, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.22015084326267242, + "rewards_train/margins": 2.1777390092611313, + "rewards_train/rejected": -2.3978898525238037, + "step": 705 + }, + { + "epoch": 0.94, + "learning_rate": 4.548711954942509e-07, + "loss": 0.397, + "step": 706 + }, + { + "epoch": 0.94, + "logps_train/chosen": -48.78385925292969, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -74.55316925048828, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.02786429226398468, + "rewards_train/margins": 1.659743383526802, + "rewards_train/rejected": -1.6318790912628174, + "step": 706 + }, + { + "epoch": 0.94, + "logps_train/chosen": -58.168251037597656, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -74.8335952758789, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.5363562107086182, + "rewards_train/margins": 1.3251283168792725, + "rewards_train/rejected": -1.8614845275878906, + "step": 707 + }, + { + "epoch": 0.94, + "learning_rate": 4.545556058545108e-07, + "loss": 0.4259, + "step": 708 + }, + { + "epoch": 0.94, + "logps_train/chosen": -123.55560302734375, + "logps_train/ref_chosen": -120.5, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -138.19094848632812, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2821231484413147, + "rewards_train/margins": 2.266659438610077, + "rewards_train/rejected": -2.5487825870513916, + "step": 708 + }, + { + "epoch": 0.94, + "logps_train/chosen": -56.70548629760742, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -93.85681915283203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.920857846736908, + "rewards_train/margins": 2.8893527388572693, + "rewards_train/rejected": -1.9684948921203613, + "step": 709 + }, + { + "epoch": 0.94, + "learning_rate": 4.542390268636119e-07, + "loss": 0.3146, + "step": 710 + }, + { + "epoch": 0.94, + "logps_train/chosen": -61.79959487915039, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -110.82610321044922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2856650650501251, + "rewards_train/margins": 3.362416058778763, + "rewards_train/rejected": -3.0767509937286377, + "step": 710 + }, + { + "epoch": 0.94, + "logps_train/chosen": -52.4349479675293, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -84.93408203125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.14400531351566315, + "rewards_train/margins": 1.994445100426674, + "rewards_train/rejected": -1.8504397869110107, + "step": 711 + }, + { + "epoch": 0.95, + "learning_rate": 4.5392146005271635e-07, + "loss": 0.2052, + "step": 712 + }, + { + "epoch": 0.95, + "logps_train/chosen": -50.81437683105469, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -64.52072143554688, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.06895294785499573, + "rewards_train/margins": 1.2913378775119781, + "rewards_train/rejected": -1.2223849296569824, + "step": 712 + }, + { + "epoch": 0.95, + "logps_train/chosen": -42.27362823486328, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -74.12132263183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3402157425880432, + "rewards_train/margins": 3.3289108872413635, + "rewards_train/rejected": -2.9886951446533203, + "step": 713 + }, + { + "epoch": 0.95, + "learning_rate": 4.536029069577639e-07, + "loss": 0.3486, + "step": 714 + }, + { + "epoch": 0.95, + "logps_train/chosen": -41.063987731933594, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -55.044532775878906, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.17172624170780182, + "rewards_train/margins": 1.1371169835329056, + "rewards_train/rejected": -0.9653907418251038, + "step": 714 + }, + { + "epoch": 0.95, + "logps_train/chosen": -87.18270874023438, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -100.08426666259766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04717755317687988, + "rewards_train/margins": 2.071796178817749, + "rewards_train/rejected": -2.118973731994629, + "step": 715 + }, + { + "epoch": 0.95, + "learning_rate": 4.532833691194647e-07, + "loss": 0.4849, + "step": 716 + }, + { + "epoch": 0.95, + "logps_train/chosen": -95.56340026855469, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -111.3342056274414, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.928034782409668, + "rewards_train/margins": 3.498955249786377, + "rewards_train/rejected": -2.570920467376709, + "step": 716 + }, + { + "epoch": 0.95, + "logps_train/chosen": -52.635711669921875, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -84.30975341796875, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.21611656248569489, + "rewards_train/margins": 2.9564668089151382, + "rewards_train/rejected": -2.7403502464294434, + "step": 717 + }, + { + "epoch": 0.95, + "learning_rate": 4.5296284808329146e-07, + "loss": 0.4621, + "step": 718 + }, + { + "epoch": 0.95, + "logps_train/chosen": -69.68258666992188, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -94.92339324951172, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.18017911911010742, + "rewards_train/margins": 2.367830753326416, + "rewards_train/rejected": -2.1876516342163086, + "step": 718 + }, + { + "epoch": 0.95, + "logps_train/chosen": -64.23617553710938, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -90.95034790039062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.24357017874717712, + "rewards_train/margins": 2.9589172303676605, + "rewards_train/rejected": -2.7153470516204834, + "step": 719 + }, + { + "epoch": 0.96, + "learning_rate": 4.526413453994723e-07, + "loss": 0.2166, + "step": 720 + }, + { + "epoch": 0.96, + "logps_train/chosen": -40.586029052734375, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -63.831321716308594, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3554595708847046, + "rewards_train/margins": 1.4214046001434326, + "rewards_train/rejected": -1.065945029258728, + "step": 720 + }, + { + "epoch": 0.96, + "logps_train/chosen": -84.15662384033203, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -108.63420867919922, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2937871813774109, + "rewards_train/margins": 1.6274459958076477, + "rewards_train/rejected": -1.9212331771850586, + "step": 721 + }, + { + "epoch": 0.96, + "learning_rate": 4.523188626229834e-07, + "loss": 0.5752, + "step": 722 + }, + { + "epoch": 0.96, + "logps_train/chosen": -61.15065383911133, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -84.80239868164062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.24104198813438416, + "rewards_train/margins": 1.2579480707645416, + "rewards_train/rejected": -1.4989900588989258, + "step": 722 + }, + { + "epoch": 0.96, + "logps_train/chosen": -46.31986999511719, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -84.28214263916016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4547317922115326, + "rewards_train/margins": 2.214196056127548, + "rewards_train/rejected": -1.7594642639160156, + "step": 723 + }, + { + "epoch": 0.96, + "learning_rate": 4.5199540131354075e-07, + "loss": 0.4115, + "step": 724 + }, + { + "epoch": 0.96, + "logps_train/chosen": -66.86719512939453, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -89.36840057373047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3429681062698364, + "rewards_train/margins": 2.328245520591736, + "rewards_train/rejected": -1.9852774143218994, + "step": 724 + }, + { + "epoch": 0.96, + "logps_train/chosen": -84.69651794433594, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -117.97013092041016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.12277615070343018, + "rewards_train/margins": 2.4367364645004272, + "rewards_train/rejected": -2.5595126152038574, + "step": 725 + }, + { + "epoch": 0.96, + "learning_rate": 4.5167096303559356e-07, + "loss": 0.2787, + "step": 726 + }, + { + "epoch": 0.96, + "logps_train/chosen": -48.453407287597656, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -74.62812805175781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08362184464931488, + "rewards_train/margins": 2.262785330414772, + "rewards_train/rejected": -2.346407175064087, + "step": 726 + }, + { + "epoch": 0.97, + "logps_train/chosen": -52.40443801879883, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -68.63796997070312, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4413379728794098, + "rewards_train/margins": 0.8755845725536346, + "rewards_train/rejected": -1.3169225454330444, + "step": 727 + }, + { + "epoch": 0.97, + "learning_rate": 4.5134554935831604e-07, + "loss": 0.3611, + "step": 728 + }, + { + "epoch": 0.97, + "logps_train/chosen": -87.60906982421875, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -121.25372314453125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.04840691387653351, + "rewards_train/margins": 2.3191534727811813, + "rewards_train/rejected": -2.367560386657715, + "step": 728 + }, + { + "epoch": 0.97, + "logps_train/chosen": -69.85812377929688, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -77.41719055175781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4936249852180481, + "rewards_train/margins": 1.409032166004181, + "rewards_train/rejected": -1.902657151222229, + "step": 729 + }, + { + "epoch": 0.97, + "learning_rate": 4.5101916185560005e-07, + "loss": 0.3447, + "step": 730 + }, + { + "epoch": 0.97, + "logps_train/chosen": -82.50523376464844, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -104.87754821777344, + "rewards_train/accuracies": 0.375, + "rewards_train/chosen": -1.1114604473114014, + "rewards_train/margins": -0.17214328050613403, + "rewards_train/rejected": -0.9393171668052673, + "step": 730 + }, + { + "epoch": 0.97, + "logps_train/chosen": -62.417701721191406, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -99.60670471191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8504173755645752, + "rewards_train/margins": 3.742337942123413, + "rewards_train/rejected": -2.891920566558838, + "step": 731 + }, + { + "epoch": 0.97, + "learning_rate": 4.506918021060474e-07, + "loss": 0.5334, + "step": 732 + }, + { + "epoch": 0.97, + "logps_train/chosen": -61.03002166748047, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -101.71241760253906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1899665743112564, + "rewards_train/margins": 3.1283959299325943, + "rewards_train/rejected": -2.938429355621338, + "step": 732 + }, + { + "epoch": 0.97, + "logps_train/chosen": -70.00407409667969, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -104.87008666992188, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.18396775424480438, + "rewards_train/margins": 2.7678515166044235, + "rewards_train/rejected": -2.583883762359619, + "step": 733 + }, + { + "epoch": 0.97, + "learning_rate": 4.5036347169296227e-07, + "loss": 0.2146, + "step": 734 + }, + { + "epoch": 0.97, + "logps_train/chosen": -70.09794616699219, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -91.58123779296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4808884263038635, + "rewards_train/margins": 1.6428598761558533, + "rewards_train/rejected": -2.123748302459717, + "step": 734 + }, + { + "epoch": 0.98, + "logps_train/chosen": -62.288578033447266, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -81.8724365234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7055174112319946, + "rewards_train/margins": 2.3990107774734497, + "rewards_train/rejected": -1.693493366241455, + "step": 735 + }, + { + "epoch": 0.98, + "learning_rate": 4.500341722043436e-07, + "loss": 0.2486, + "step": 736 + }, + { + "epoch": 0.98, + "logps_train/chosen": -66.16824340820312, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -85.70519256591797, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.31369999051094055, + "rewards_train/margins": 1.7849442660808563, + "rewards_train/rejected": -2.098644256591797, + "step": 736 + }, + { + "epoch": 0.98, + "logps_train/chosen": -34.342872619628906, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -64.12893676757812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5120015144348145, + "rewards_train/margins": 2.242473602294922, + "rewards_train/rejected": -1.7304720878601074, + "step": 737 + }, + { + "epoch": 0.98, + "learning_rate": 4.4970390523287714e-07, + "loss": 0.3299, + "step": 738 + }, + { + "epoch": 0.98, + "logps_train/chosen": -55.41838836669922, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -112.56553649902344, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.0659736692905426, + "rewards_train/margins": 2.5209650099277496, + "rewards_train/rejected": -2.454991340637207, + "step": 738 + }, + { + "epoch": 0.98, + "logps_train/chosen": -51.32921600341797, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -80.92022705078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8358285427093506, + "rewards_train/margins": 1.9247263669967651, + "rewards_train/rejected": -1.0888978242874146, + "step": 739 + }, + { + "epoch": 0.98, + "learning_rate": 4.493726723759284e-07, + "loss": 0.2085, + "step": 740 + }, + { + "epoch": 0.98, + "logps_train/chosen": -53.95283126831055, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -88.43611145019531, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.10315460711717606, + "rewards_train/margins": 1.8795777037739754, + "rewards_train/rejected": -1.7764230966567993, + "step": 740 + }, + { + "epoch": 0.98, + "logps_train/chosen": -60.53403854370117, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -117.7838134765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7934710383415222, + "rewards_train/margins": 3.574976861476898, + "rewards_train/rejected": -2.781505823135376, + "step": 741 + }, + { + "epoch": 0.99, + "learning_rate": 4.490404752355339e-07, + "loss": 0.1912, + "step": 742 + }, + { + "epoch": 0.99, + "logps_train/chosen": -55.65306854248047, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -64.404052734375, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08313048630952835, + "rewards_train/margins": 0.9758799150586128, + "rewards_train/rejected": -0.8927494287490845, + "step": 742 + }, + { + "epoch": 0.99, + "logps_train/chosen": -47.5407829284668, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -68.78717041015625, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.2595471143722534, + "rewards_train/margins": 1.4488579034805298, + "rewards_train/rejected": -1.7084050178527832, + "step": 743 + }, + { + "epoch": 0.99, + "learning_rate": 4.487073154183944e-07, + "loss": 0.4031, + "step": 744 + }, + { + "epoch": 0.99, + "logps_train/chosen": -74.14268493652344, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -69.08543395996094, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.22793997824192047, + "rewards_train/margins": -0.0350215882062912, + "rewards_train/rejected": -0.19291839003562927, + "step": 744 + }, + { + "epoch": 0.99, + "logps_train/chosen": -78.53152465820312, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -104.10023498535156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.20622184872627258, + "rewards_train/margins": 2.5287459194660187, + "rewards_train/rejected": -2.322524070739746, + "step": 745 + }, + { + "epoch": 0.99, + "learning_rate": 4.4837319453586664e-07, + "loss": 0.6077, + "step": 746 + }, + { + "epoch": 0.99, + "logps_train/chosen": -65.99571228027344, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -101.71450805664062, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.31246185302734375, + "rewards_train/margins": 2.214848041534424, + "rewards_train/rejected": -2.5273098945617676, + "step": 746 + }, + { + "epoch": 0.99, + "logps_train/chosen": -67.29641723632812, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -74.00518798828125, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.2702668309211731, + "rewards_train/margins": 1.7958771586418152, + "rewards_train/rejected": -2.0661439895629883, + "step": 747 + }, + { + "epoch": 0.99, + "learning_rate": 4.4803811420395566e-07, + "loss": 0.3117, + "step": 748 + }, + { + "epoch": 0.99, + "logps_train/chosen": -84.3549575805664, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -129.38941955566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6551293134689331, + "rewards_train/margins": 3.9331332445144653, + "rewards_train/rejected": -3.2780039310455322, + "step": 748 + }, + { + "epoch": 0.99, + "logps_train/chosen": -39.928504943847656, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -63.72937774658203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09933708608150482, + "rewards_train/margins": 2.127352699637413, + "rewards_train/rejected": -2.028015613555908, + "step": 749 + }, + { + "epoch": 1.0, + "learning_rate": 4.477020760433069e-07, + "loss": 0.1322, + "step": 750 + }, + { + "epoch": 1.0, + "logps_train/chosen": -62.46615219116211, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -93.07612609863281, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": 0.00807209312915802, + "rewards_train/margins": 1.75006003677845, + "rewards_train/rejected": -1.741987943649292, + "step": 750 + }, + { + "epoch": 1.0, + "logps_train/chosen": -48.001670837402344, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -73.4656982421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.021707601845264435, + "rewards_train/margins": 2.780777521431446, + "rewards_train/rejected": -2.7590699195861816, + "step": 751 + }, + { + "epoch": 1.0, + "learning_rate": 4.473650816791984e-07, + "loss": 0.2478, + "step": 752 + }, + { + "epoch": 1.0, + "logps_train/chosen": -70.86872863769531, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -102.60623931884766, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.20609530806541443, + "rewards_train/margins": 2.0167196691036224, + "rewards_train/rejected": -1.810624361038208, + "step": 752 + }, + { + "epoch": 1.0, + "logps_train/chosen": -72.06849670410156, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -123.32846069335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1486189365386963, + "rewards_train/margins": 4.213496208190918, + "rewards_train/rejected": -3.0648772716522217, + "step": 753 + }, + { + "epoch": 1.0, + "learning_rate": 4.47027132741533e-07, + "loss": 0.2285, + "step": 754 + }, + { + "epoch": 1.0, + "logps_train/chosen": -89.99444580078125, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -135.48133850097656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20211690664291382, + "rewards_train/margins": 3.2643126845359802, + "rewards_train/rejected": -3.0621957778930664, + "step": 754 + }, + { + "epoch": 1.0, + "logps_train/chosen": -82.13720703125, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -136.76634216308594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7550290822982788, + "rewards_train/margins": 4.859788298606873, + "rewards_train/rejected": -4.104759216308594, + "step": 755 + }, + { + "epoch": 1.0, + "learning_rate": 4.4668823086483056e-07, + "loss": 0.1162, + "step": 756 + }, + { + "epoch": 1.0, + "logps_train/chosen": -65.73158264160156, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -94.82227325439453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2627795934677124, + "rewards_train/margins": 3.087194323539734, + "rewards_train/rejected": -2.8244147300720215, + "step": 756 + }, + { + "epoch": 1.01, + "logps_train/chosen": -58.29868698120117, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -66.34871673583984, + "rewards_train/accuracies": 0.5, + "rewards_train/chosen": 0.29981881380081177, + "rewards_train/margins": 1.3237531781196594, + "rewards_train/rejected": -1.0239343643188477, + "step": 757 + }, + { + "epoch": 1.01, + "learning_rate": 4.4634837768821963e-07, + "loss": 0.2896, + "step": 758 + }, + { + "epoch": 1.01, + "logps_train/chosen": -68.19380187988281, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -84.99763488769531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.13500526547431946, + "rewards_train/margins": 1.7491337954998016, + "rewards_train/rejected": -1.884139060974121, + "step": 758 + }, + { + "epoch": 1.01, + "logps_train/chosen": -58.27888870239258, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -100.84491729736328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3580487370491028, + "rewards_train/margins": 3.1316030621528625, + "rewards_train/rejected": -2.7735543251037598, + "step": 759 + }, + { + "epoch": 1.01, + "learning_rate": 4.4600757485543006e-07, + "loss": 0.1651, + "step": 760 + }, + { + "epoch": 1.01, + "logps_train/chosen": -88.1324234008789, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -115.2786865234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2055070400238037, + "rewards_train/margins": 3.667750120162964, + "rewards_train/rejected": -2.46224308013916, + "step": 760 + }, + { + "epoch": 1.01, + "logps_train/chosen": -70.9462661743164, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -101.74241638183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09013915061950684, + "rewards_train/margins": 2.7737557888031006, + "rewards_train/rejected": -2.6836166381835938, + "step": 761 + }, + { + "epoch": 1.01, + "learning_rate": 4.456658240147846e-07, + "loss": 0.1019, + "step": 762 + }, + { + "epoch": 1.01, + "logps_train/chosen": -57.915916442871094, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -79.24769592285156, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.21034187078475952, + "rewards_train/margins": 2.0753656029701233, + "rewards_train/rejected": -2.285707473754883, + "step": 762 + }, + { + "epoch": 1.01, + "logps_train/chosen": -46.743648529052734, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -79.69302368164062, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3154788315296173, + "rewards_train/margins": 2.466812163591385, + "rewards_train/rejected": -2.1513333320617676, + "step": 763 + }, + { + "epoch": 1.01, + "learning_rate": 4.4532312681919127e-07, + "loss": 0.2739, + "step": 764 + }, + { + "epoch": 1.01, + "logps_train/chosen": -50.862674713134766, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -83.14360809326172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1151735782623291, + "rewards_train/margins": 2.1413750648498535, + "rewards_train/rejected": -2.2565486431121826, + "step": 764 + }, + { + "epoch": 1.02, + "logps_train/chosen": -55.300506591796875, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -91.79389953613281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2855740189552307, + "rewards_train/margins": 2.6727762818336487, + "rewards_train/rejected": -2.387202262878418, + "step": 765 + }, + { + "epoch": 1.02, + "learning_rate": 4.449794849261351e-07, + "loss": 0.2408, + "step": 766 + }, + { + "epoch": 1.02, + "logps_train/chosen": -50.523902893066406, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -79.38288879394531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.014890104532241821, + "rewards_train/margins": 2.151524156332016, + "rewards_train/rejected": -2.166414260864258, + "step": 766 + }, + { + "epoch": 1.02, + "logps_train/chosen": -75.68093872070312, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -108.38290405273438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.22253058850765228, + "rewards_train/margins": 2.5108210891485214, + "rewards_train/rejected": -2.288290500640869, + "step": 767 + }, + { + "epoch": 1.02, + "learning_rate": 4.4463489999767047e-07, + "loss": 0.2223, + "step": 768 + }, + { + "epoch": 1.02, + "logps_train/chosen": -48.434234619140625, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -74.44261169433594, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": -0.4074857831001282, + "rewards_train/margins": 2.009432017803192, + "rewards_train/rejected": -2.4169178009033203, + "step": 768 + }, + { + "epoch": 1.02, + "logps_train/chosen": -42.807106018066406, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -60.74381637573242, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.15053941309452057, + "rewards_train/margins": 1.4507022053003311, + "rewards_train/rejected": -1.3001627922058105, + "step": 769 + }, + { + "epoch": 1.02, + "learning_rate": 4.442893737004124e-07, + "loss": 0.3038, + "step": 770 + }, + { + "epoch": 1.02, + "logps_train/chosen": -37.18920135498047, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -60.47882080078125, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.31232985854148865, + "rewards_train/margins": 1.7059153020381927, + "rewards_train/rejected": -1.393585443496704, + "step": 770 + }, + { + "epoch": 1.02, + "logps_train/chosen": -42.942779541015625, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -61.62098693847656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10572205483913422, + "rewards_train/margins": 2.1170397847890854, + "rewards_train/rejected": -2.011317729949951, + "step": 771 + }, + { + "epoch": 1.03, + "learning_rate": 4.439429077055294e-07, + "loss": 0.3675, + "step": 772 + }, + { + "epoch": 1.03, + "logps_train/chosen": -50.876564025878906, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -85.4740219116211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1873438060283661, + "rewards_train/margins": 2.519121140241623, + "rewards_train/rejected": -2.331777334213257, + "step": 772 + }, + { + "epoch": 1.03, + "logps_train/chosen": -65.5296859741211, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -72.11442565917969, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4720316231250763, + "rewards_train/margins": 1.8662865459918976, + "rewards_train/rejected": -1.3942549228668213, + "step": 773 + }, + { + "epoch": 1.03, + "learning_rate": 4.4359550368873463e-07, + "loss": 0.2622, + "step": 774 + }, + { + "epoch": 1.03, + "logps_train/chosen": -82.28711700439453, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -107.65868377685547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6009758710861206, + "rewards_train/margins": 2.4785631895065308, + "rewards_train/rejected": -1.8775873184204102, + "step": 774 + }, + { + "epoch": 1.03, + "logps_train/chosen": -65.40483093261719, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -102.3169174194336, + "rewards_train/accuracies": 0.75, + "rewards_train/chosen": 0.26303285360336304, + "rewards_train/margins": 1.7728497385978699, + "rewards_train/rejected": -1.5098168849945068, + "step": 775 + }, + { + "epoch": 1.03, + "learning_rate": 4.4324716333027814e-07, + "loss": 0.3129, + "step": 776 + }, + { + "epoch": 1.03, + "logps_train/chosen": -64.79135131835938, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -126.04985046386719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.48102080821990967, + "rewards_train/margins": 3.9922555685043335, + "rewards_train/rejected": -3.511234760284424, + "step": 776 + }, + { + "epoch": 1.03, + "logps_train/chosen": -34.996681213378906, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -62.42568588256836, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.209706649184227, + "rewards_train/margins": 2.2413378804922104, + "rewards_train/rejected": -2.0316312313079834, + "step": 777 + }, + { + "epoch": 1.03, + "learning_rate": 4.428978883149386e-07, + "loss": 0.1648, + "step": 778 + }, + { + "epoch": 1.03, + "logps_train/chosen": -36.58696746826172, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -60.58026123046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04520928114652634, + "rewards_train/margins": 1.5059699937701225, + "rewards_train/rejected": -1.4607607126235962, + "step": 778 + }, + { + "epoch": 1.03, + "logps_train/chosen": -62.581119537353516, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -94.9194564819336, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.47548168897628784, + "rewards_train/margins": 2.951802670955658, + "rewards_train/rejected": -2.47632098197937, + "step": 779 + }, + { + "epoch": 1.04, + "learning_rate": 4.425476803320153e-07, + "loss": 0.2512, + "step": 780 + }, + { + "epoch": 1.04, + "logps_train/chosen": -58.24970626831055, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -95.43296813964844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6062796115875244, + "rewards_train/margins": 2.908951997756958, + "rewards_train/rejected": -2.3026723861694336, + "step": 780 + }, + { + "epoch": 1.04, + "logps_train/chosen": -44.945281982421875, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -95.61111450195312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.007815830409526825, + "rewards_train/margins": 2.7704892084002495, + "rewards_train/rejected": -2.7626733779907227, + "step": 781 + }, + { + "epoch": 1.04, + "learning_rate": 4.421965410753201e-07, + "loss": 0.1053, + "step": 782 + }, + { + "epoch": 1.04, + "logps_train/chosen": -48.387107849121094, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -68.50040435791016, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.9409765005111694, + "rewards_train/margins": 3.0230478048324585, + "rewards_train/rejected": -2.082071304321289, + "step": 782 + }, + { + "epoch": 1.04, + "logps_train/chosen": -74.5488052368164, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -110.64103698730469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5529320240020752, + "rewards_train/margins": 2.904536485671997, + "rewards_train/rejected": -2.351604461669922, + "step": 783 + }, + { + "epoch": 1.04, + "learning_rate": 4.418444722431687e-07, + "loss": 0.164, + "step": 784 + }, + { + "epoch": 1.04, + "logps_train/chosen": -40.978477478027344, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -74.71595001220703, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4229532480239868, + "rewards_train/margins": 2.5535322427749634, + "rewards_train/rejected": -2.1305789947509766, + "step": 784 + }, + { + "epoch": 1.04, + "logps_train/chosen": -66.73492431640625, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -95.45710754394531, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.0086488276720047, + "rewards_train/margins": 1.5128434151411057, + "rewards_train/rejected": -1.5214922428131104, + "step": 785 + }, + { + "epoch": 1.04, + "learning_rate": 4.41491475538373e-07, + "loss": 0.2578, + "step": 786 + }, + { + "epoch": 1.04, + "logps_train/chosen": -65.47206115722656, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -74.70218658447266, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4887310266494751, + "rewards_train/margins": 1.338637351989746, + "rewards_train/rejected": -0.849906325340271, + "step": 786 + }, + { + "epoch": 1.05, + "logps_train/chosen": -38.711307525634766, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -64.75392150878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.39468950033187866, + "rewards_train/margins": 2.166956841945648, + "rewards_train/rejected": -1.7722673416137695, + "step": 787 + }, + { + "epoch": 1.05, + "learning_rate": 4.411375526682326e-07, + "loss": 0.3052, + "step": 788 + }, + { + "epoch": 1.05, + "logps_train/chosen": -56.16551208496094, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -70.11296844482422, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.054832443594932556, + "rewards_train/margins": 1.6361520439386368, + "rewards_train/rejected": -1.6909844875335693, + "step": 788 + }, + { + "epoch": 1.05, + "logps_train/chosen": -36.383453369140625, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -76.16281127929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.235611230134964, + "rewards_train/margins": 1.9259819686412811, + "rewards_train/rejected": -2.161593198776245, + "step": 789 + }, + { + "epoch": 1.05, + "learning_rate": 4.4078270534452644e-07, + "loss": 0.2719, + "step": 790 + }, + { + "epoch": 1.05, + "logps_train/chosen": -83.35650634765625, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -94.11077880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08544264733791351, + "rewards_train/margins": 2.3142934888601303, + "rewards_train/rejected": -2.228850841522217, + "step": 790 + }, + { + "epoch": 1.05, + "logps_train/chosen": -48.94058609008789, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -80.95195007324219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1692226231098175, + "rewards_train/margins": 3.1191051304340363, + "rewards_train/rejected": -2.9498825073242188, + "step": 791 + }, + { + "epoch": 1.05, + "learning_rate": 4.4042693528350487e-07, + "loss": 0.1622, + "step": 792 + }, + { + "epoch": 1.05, + "logps_train/chosen": -58.48086166381836, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -59.32624053955078, + "rewards_train/accuracies": 0.625, + "rewards_train/chosen": -0.694961428642273, + "rewards_train/margins": 0.8486001491546631, + "rewards_train/rejected": -1.543561577796936, + "step": 792 + }, + { + "epoch": 1.05, + "logps_train/chosen": -59.933555603027344, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -78.39978790283203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2878941297531128, + "rewards_train/margins": 2.332560420036316, + "rewards_train/rejected": -2.044666290283203, + "step": 793 + }, + { + "epoch": 1.05, + "learning_rate": 4.4007024420588103e-07, + "loss": 0.3822, + "step": 794 + }, + { + "epoch": 1.05, + "logps_train/chosen": -93.20623779296875, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -111.58499145507812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.017498541623353958, + "rewards_train/margins": 2.68006319925189, + "rewards_train/rejected": -2.697561740875244, + "step": 794 + }, + { + "epoch": 1.06, + "logps_train/chosen": -63.818111419677734, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -94.49789428710938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4624147117137909, + "rewards_train/margins": 2.370016783475876, + "rewards_train/rejected": -1.907602071762085, + "step": 795 + }, + { + "epoch": 1.06, + "learning_rate": 4.397126338368227e-07, + "loss": 0.2426, + "step": 796 + }, + { + "epoch": 1.06, + "logps_train/chosen": -39.92469787597656, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -70.1273193359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12723538279533386, + "rewards_train/margins": 1.8843248188495636, + "rewards_train/rejected": -2.0115602016448975, + "step": 796 + }, + { + "epoch": 1.06, + "logps_train/chosen": -84.03326416015625, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -136.01007080078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1310492902994156, + "rewards_train/margins": 4.457055702805519, + "rewards_train/rejected": -4.3260064125061035, + "step": 797 + }, + { + "epoch": 1.06, + "learning_rate": 4.393541059059437e-07, + "loss": 0.15, + "step": 798 + }, + { + "epoch": 1.06, + "logps_train/chosen": -45.70124053955078, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -67.06489562988281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10643841326236725, + "rewards_train/margins": 1.7265999466180801, + "rewards_train/rejected": -1.620161533355713, + "step": 798 + }, + { + "epoch": 1.06, + "logps_train/chosen": -40.22917175292969, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -52.668575286865234, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9114576578140259, + "rewards_train/margins": 2.1478464603424072, + "rewards_train/rejected": -1.2363888025283813, + "step": 799 + }, + { + "epoch": 1.06, + "learning_rate": 4.38994662147296e-07, + "loss": 0.2226, + "step": 800 + }, + { + "epoch": 1.06, + "logps_train/chosen": -78.17513275146484, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -92.73866271972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2918616831302643, + "rewards_train/margins": 3.039556175470352, + "rewards_train/rejected": -2.747694492340088, + "step": 800 + }, + { + "epoch": 1.06, + "logps_train/chosen": -65.75807189941406, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -68.25862121582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19763082265853882, + "rewards_train/margins": 1.5164613127708435, + "rewards_train/rejected": -1.3188304901123047, + "step": 801 + }, + { + "epoch": 1.07, + "learning_rate": 4.3863430429936087e-07, + "loss": 0.1679, + "step": 802 + }, + { + "epoch": 1.07, + "logps_train/chosen": -66.94666290283203, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -103.9609375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4467398524284363, + "rewards_train/margins": 2.7303337454795837, + "rewards_train/rejected": -2.2835938930511475, + "step": 802 + }, + { + "epoch": 1.07, + "logps_train/chosen": -68.72746276855469, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -121.6444091796875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7897540926933289, + "rewards_train/margins": 3.685445487499237, + "rewards_train/rejected": -2.895691394805908, + "step": 803 + }, + { + "epoch": 1.07, + "learning_rate": 4.382730341050408e-07, + "loss": 0.1476, + "step": 804 + }, + { + "epoch": 1.07, + "logps_train/chosen": -21.226465225219727, + "logps_train/ref_chosen": -25.0, + "logps_train/ref_rejected": -31.5, + "logps_train/rejected": -53.17919158935547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.37774407863616943, + "rewards_train/margins": 2.546249270439148, + "rewards_train/rejected": -2.1685051918029785, + "step": 804 + }, + { + "epoch": 1.07, + "logps_train/chosen": -76.6697998046875, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -116.48828887939453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4963010847568512, + "rewards_train/margins": 3.3318497240543365, + "rewards_train/rejected": -2.8355486392974854, + "step": 805 + }, + { + "epoch": 1.07, + "learning_rate": 4.379108533116507e-07, + "loss": 0.1468, + "step": 806 + }, + { + "epoch": 1.07, + "logps_train/chosen": -69.28903198242188, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -119.68470764160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.0029266774654388428, + "rewards_train/margins": 3.4421073496341705, + "rewards_train/rejected": -3.4450340270996094, + "step": 806 + }, + { + "epoch": 1.07, + "logps_train/chosen": -34.827537536621094, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -66.54564666748047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5211523771286011, + "rewards_train/margins": 2.8805512189865112, + "rewards_train/rejected": -2.35939884185791, + "step": 807 + }, + { + "epoch": 1.07, + "learning_rate": 4.3754776367090974e-07, + "loss": 0.1153, + "step": 808 + }, + { + "epoch": 1.07, + "logps_train/chosen": -52.845703125, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -95.40000915527344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19902321696281433, + "rewards_train/margins": 2.624961346387863, + "rewards_train/rejected": -2.425938129425049, + "step": 808 + }, + { + "epoch": 1.07, + "logps_train/chosen": -101.28802490234375, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -113.39967346191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9540099501609802, + "rewards_train/margins": 4.187727391719818, + "rewards_train/rejected": -3.233717441558838, + "step": 809 + }, + { + "epoch": 1.08, + "learning_rate": 4.37183766938933e-07, + "loss": 0.12, + "step": 810 + }, + { + "epoch": 1.08, + "logps_train/chosen": -61.20787811279297, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -103.71305847167969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6956185102462769, + "rewards_train/margins": 3.5073541402816772, + "rewards_train/rejected": -2.8117356300354004, + "step": 810 + }, + { + "epoch": 1.08, + "logps_train/chosen": -80.150390625, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -105.05763244628906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10183099657297134, + "rewards_train/margins": 2.36579779535532, + "rewards_train/rejected": -2.2639667987823486, + "step": 811 + }, + { + "epoch": 1.08, + "learning_rate": 4.368188648762227e-07, + "loss": 0.1604, + "step": 812 + }, + { + "epoch": 1.08, + "logps_train/chosen": -58.470069885253906, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -86.8299331665039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2936176657676697, + "rewards_train/margins": 3.0047358870506287, + "rewards_train/rejected": -2.711118221282959, + "step": 812 + }, + { + "epoch": 1.08, + "logps_train/chosen": -51.30886459350586, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -77.62238311767578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32926976680755615, + "rewards_train/margins": 2.5910202264785767, + "rewards_train/rejected": -2.2617504596710205, + "step": 813 + }, + { + "epoch": 1.08, + "learning_rate": 4.364530592476595e-07, + "loss": 0.1232, + "step": 814 + }, + { + "epoch": 1.08, + "logps_train/chosen": -48.05463409423828, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -87.86077117919922, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.8900444507598877, + "rewards_train/margins": 3.1332504749298096, + "rewards_train/rejected": -2.243206024169922, + "step": 814 + }, + { + "epoch": 1.08, + "logps_train/chosen": -52.04695129394531, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -87.9521484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6781174540519714, + "rewards_train/margins": 3.2077078223228455, + "rewards_train/rejected": -2.529590368270874, + "step": 815 + }, + { + "epoch": 1.08, + "learning_rate": 4.3608635182249465e-07, + "loss": 0.1106, + "step": 816 + }, + { + "epoch": 1.08, + "logps_train/chosen": -39.63652038574219, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -84.72727966308594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.33439499139785767, + "rewards_train/margins": 3.0946224331855774, + "rewards_train/rejected": -2.7602274417877197, + "step": 816 + }, + { + "epoch": 1.08, + "logps_train/chosen": -72.6854476928711, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -103.39086151123047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.006455421447754, + "rewards_train/margins": 3.720541477203369, + "rewards_train/rejected": -2.7140860557556152, + "step": 817 + }, + { + "epoch": 1.09, + "learning_rate": 4.3571874437434074e-07, + "loss": 0.134, + "step": 818 + }, + { + "epoch": 1.09, + "logps_train/chosen": -44.015933990478516, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -85.70500946044922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.014422163367271423, + "rewards_train/margins": 2.806798204779625, + "rewards_train/rejected": -2.7923760414123535, + "step": 818 + }, + { + "epoch": 1.09, + "logps_train/chosen": -70.45325469970703, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -90.65718078613281, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08592414855957031, + "rewards_train/margins": 2.2282047271728516, + "rewards_train/rejected": -2.1422805786132812, + "step": 819 + }, + { + "epoch": 1.09, + "learning_rate": 4.3535023868116363e-07, + "loss": 0.1947, + "step": 820 + }, + { + "epoch": 1.09, + "logps_train/chosen": -43.031463623046875, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -72.33087921142578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.24060384929180145, + "rewards_train/margins": 2.478379413485527, + "rewards_train/rejected": -2.2377755641937256, + "step": 820 + }, + { + "epoch": 1.09, + "logps_train/chosen": -51.21966552734375, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -78.81327056884766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.021966993808746338, + "rewards_train/margins": 2.6968597769737244, + "rewards_train/rejected": -2.7188267707824707, + "step": 821 + }, + { + "epoch": 1.09, + "learning_rate": 4.349808365252733e-07, + "loss": 0.1072, + "step": 822 + }, + { + "epoch": 1.09, + "logps_train/chosen": -70.05642700195312, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -96.88026428222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8474821448326111, + "rewards_train/margins": 3.1808213591575623, + "rewards_train/rejected": -2.333339214324951, + "step": 822 + }, + { + "epoch": 1.09, + "logps_train/chosen": -91.50241088867188, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -118.50920104980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12602216005325317, + "rewards_train/margins": 2.7553674578666687, + "rewards_train/rejected": -2.881389617919922, + "step": 823 + }, + { + "epoch": 1.09, + "learning_rate": 4.3461053969331573e-07, + "loss": 0.1176, + "step": 824 + }, + { + "epoch": 1.09, + "logps_train/chosen": -51.377254486083984, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -92.90863800048828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1153994798660278, + "rewards_train/margins": 3.503137946128845, + "rewards_train/rejected": -2.3877384662628174, + "step": 824 + }, + { + "epoch": 1.1, + "logps_train/chosen": -65.79637145996094, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -144.54788208007812, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.6281752586364746, + "rewards_train/margins": 3.4423394203186035, + "rewards_train/rejected": -2.814164161682129, + "step": 825 + }, + { + "epoch": 1.1, + "learning_rate": 4.3423934997626426e-07, + "loss": 0.287, + "step": 826 + }, + { + "epoch": 1.1, + "logps_train/chosen": -41.493492126464844, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -69.26010131835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3498693108558655, + "rewards_train/margins": 2.947754681110382, + "rewards_train/rejected": -2.5978853702545166, + "step": 826 + }, + { + "epoch": 1.1, + "logps_train/chosen": -42.88385772705078, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -67.99557495117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9366143345832825, + "rewards_train/margins": 2.9814847111701965, + "rewards_train/rejected": -2.044870376586914, + "step": 827 + }, + { + "epoch": 1.1, + "learning_rate": 4.338672691694104e-07, + "loss": 0.1421, + "step": 828 + }, + { + "epoch": 1.1, + "logps_train/chosen": -49.87078094482422, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -61.25836181640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3457341492176056, + "rewards_train/margins": 2.4739144146442413, + "rewards_train/rejected": -2.1281802654266357, + "step": 828 + }, + { + "epoch": 1.1, + "logps_train/chosen": -57.14312744140625, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -92.68283081054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8138123750686646, + "rewards_train/margins": 3.8305329084396362, + "rewards_train/rejected": -3.0167205333709717, + "step": 829 + }, + { + "epoch": 1.1, + "learning_rate": 4.334942990723558e-07, + "loss": 0.1216, + "step": 830 + }, + { + "epoch": 1.1, + "logps_train/chosen": -84.87217712402344, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -151.1685791015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20750823616981506, + "rewards_train/margins": 3.9384283125400543, + "rewards_train/rejected": -3.7309200763702393, + "step": 830 + }, + { + "epoch": 1.1, + "logps_train/chosen": -32.686134338378906, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -67.4171142578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5641993284225464, + "rewards_train/margins": 2.791848063468933, + "rewards_train/rejected": -2.2276487350463867, + "step": 831 + }, + { + "epoch": 1.1, + "learning_rate": 4.3312044148900293e-07, + "loss": 0.1063, + "step": 832 + }, + { + "epoch": 1.1, + "logps_train/chosen": -39.43988800048828, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -70.92292785644531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7673392295837402, + "rewards_train/margins": 3.0377566814422607, + "rewards_train/rejected": -2.2704174518585205, + "step": 832 + }, + { + "epoch": 1.11, + "logps_train/chosen": -53.904075622558594, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -98.24420928955078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4009988307952881, + "rewards_train/margins": 3.741044521331787, + "rewards_train/rejected": -3.340045690536499, + "step": 833 + }, + { + "epoch": 1.11, + "learning_rate": 4.327456982275469e-07, + "loss": 0.0821, + "step": 834 + }, + { + "epoch": 1.11, + "logps_train/chosen": -53.978431701660156, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -104.92790222167969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.713094174861908, + "rewards_train/margins": 3.704321801662445, + "rewards_train/rejected": -2.991227626800537, + "step": 834 + }, + { + "epoch": 1.11, + "logps_train/chosen": -65.21916198730469, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -75.34265899658203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4640212059020996, + "rewards_train/margins": 2.6803178787231445, + "rewards_train/rejected": -2.216296672821045, + "step": 835 + }, + { + "epoch": 1.11, + "learning_rate": 4.323700711004665e-07, + "loss": 0.0967, + "step": 836 + }, + { + "epoch": 1.11, + "logps_train/chosen": -89.44677734375, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -162.52613830566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42719754576683044, + "rewards_train/margins": 4.5673108994960785, + "rewards_train/rejected": -4.140113353729248, + "step": 836 + }, + { + "epoch": 1.11, + "logps_train/chosen": -59.180702209472656, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -91.57246398925781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8491173982620239, + "rewards_train/margins": 3.270426392555237, + "rewards_train/rejected": -2.421308994293213, + "step": 837 + }, + { + "epoch": 1.11, + "learning_rate": 4.319935619245153e-07, + "loss": 0.0547, + "step": 838 + }, + { + "epoch": 1.11, + "logps_train/chosen": -36.24166488647461, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -61.67096710205078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4555209279060364, + "rewards_train/margins": 3.0151954293251038, + "rewards_train/rejected": -2.5596745014190674, + "step": 838 + }, + { + "epoch": 1.11, + "logps_train/chosen": -51.77582931518555, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -92.5146713256836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7142292261123657, + "rewards_train/margins": 3.675071597099304, + "rewards_train/rejected": -2.9608423709869385, + "step": 839 + }, + { + "epoch": 1.12, + "learning_rate": 4.31616172520713e-07, + "loss": 0.0824, + "step": 840 + }, + { + "epoch": 1.12, + "logps_train/chosen": -47.10211944580078, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -92.96401977539062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0233820676803589, + "rewards_train/margins": 3.482284188270569, + "rewards_train/rejected": -2.45890212059021, + "step": 840 + }, + { + "epoch": 1.12, + "logps_train/chosen": -68.681396484375, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -85.32242584228516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7521727085113525, + "rewards_train/margins": 3.278165578842163, + "rewards_train/rejected": -2.5259928703308105, + "step": 841 + }, + { + "epoch": 1.12, + "learning_rate": 4.312379047143365e-07, + "loss": 0.0508, + "step": 842 + }, + { + "epoch": 1.12, + "logps_train/chosen": -43.5899658203125, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -70.38139343261719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5937379598617554, + "rewards_train/margins": 2.970548987388611, + "rewards_train/rejected": -2.3768110275268555, + "step": 842 + }, + { + "epoch": 1.12, + "logps_train/chosen": -52.637733459472656, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -78.90478515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.8252888321876526, + "rewards_train/margins": 3.165767252445221, + "rewards_train/rejected": -2.3404784202575684, + "step": 843 + }, + { + "epoch": 1.12, + "learning_rate": 4.3085876033491146e-07, + "loss": 0.1228, + "step": 844 + }, + { + "epoch": 1.12, + "logps_train/chosen": -35.52029800415039, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -60.43755340576172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0276578664779663, + "rewards_train/margins": 3.487038016319275, + "rewards_train/rejected": -2.4593801498413086, + "step": 844 + }, + { + "epoch": 1.12, + "logps_train/chosen": -39.0677604675293, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -80.95925903320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4244741201400757, + "rewards_train/margins": 3.4446195363998413, + "rewards_train/rejected": -3.0201454162597656, + "step": 845 + }, + { + "epoch": 1.12, + "learning_rate": 4.3047874121620284e-07, + "loss": 0.0585, + "step": 846 + }, + { + "epoch": 1.12, + "logps_train/chosen": -82.25259399414062, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -141.22845458984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.3055022954940796, + "rewards_train/margins": 4.597878575325012, + "rewards_train/rejected": -3.2923762798309326, + "step": 846 + }, + { + "epoch": 1.12, + "logps_train/chosen": -59.74720764160156, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -107.6315689086914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28934168815612793, + "rewards_train/margins": 3.738436222076416, + "rewards_train/rejected": -3.449094533920288, + "step": 847 + }, + { + "epoch": 1.13, + "learning_rate": 4.3009784919620655e-07, + "loss": 0.1233, + "step": 848 + }, + { + "epoch": 1.13, + "logps_train/chosen": -41.59056854248047, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -103.746826171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7096932530403137, + "rewards_train/margins": 4.278126060962677, + "rewards_train/rejected": -3.5684328079223633, + "step": 848 + }, + { + "epoch": 1.13, + "logps_train/chosen": -72.16027069091797, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -116.64436340332031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.8222544193267822, + "rewards_train/margins": 5.135128736495972, + "rewards_train/rejected": -3.3128743171691895, + "step": 849 + }, + { + "epoch": 1.13, + "learning_rate": 4.2971608611714026e-07, + "loss": 0.0486, + "step": 850 + }, + { + "epoch": 1.13, + "logps_train/chosen": -69.67103576660156, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -104.18919372558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.651646614074707, + "rewards_train/margins": 3.6025967597961426, + "rewards_train/rejected": -2.9509501457214355, + "step": 850 + }, + { + "epoch": 1.13, + "logps_train/chosen": -51.434364318847656, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -109.86626434326172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.07546746730804443, + "rewards_train/margins": 3.0947519540786743, + "rewards_train/rejected": -3.1702194213867188, + "step": 851 + }, + { + "epoch": 1.13, + "learning_rate": 4.2933345382543474e-07, + "loss": 0.1412, + "step": 852 + }, + { + "epoch": 1.13, + "logps_train/chosen": -55.59050750732422, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -71.9244384765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6682929992675781, + "rewards_train/margins": 2.223236560821533, + "rewards_train/rejected": -1.554943561553955, + "step": 852 + }, + { + "epoch": 1.13, + "logps_train/chosen": -56.12427520751953, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -80.46771240234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.053978681564331, + "rewards_train/margins": 4.1577818393707275, + "rewards_train/rejected": -3.1038031578063965, + "step": 853 + }, + { + "epoch": 1.13, + "learning_rate": 4.2894995417172463e-07, + "loss": 0.0992, + "step": 854 + }, + { + "epoch": 1.13, + "logps_train/chosen": -109.62224578857422, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -142.90023803710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.16590020060539246, + "rewards_train/margins": 4.015298575162888, + "rewards_train/rejected": -3.849398374557495, + "step": 854 + }, + { + "epoch": 1.14, + "logps_train/chosen": -56.15511703491211, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -110.76558685302734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.41573822498321533, + "rewards_train/margins": 3.667296528816223, + "rewards_train/rejected": -3.251558303833008, + "step": 855 + }, + { + "epoch": 1.14, + "learning_rate": 4.2856558901083966e-07, + "loss": 0.0626, + "step": 856 + }, + { + "epoch": 1.14, + "logps_train/chosen": -41.163368225097656, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -84.08399963378906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0316120386123657, + "rewards_train/margins": 3.74313747882843, + "rewards_train/rejected": -2.7115254402160645, + "step": 856 + }, + { + "epoch": 1.14, + "logps_train/chosen": -70.54540252685547, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -138.29885864257812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6470222473144531, + "rewards_train/margins": 4.305031776428223, + "rewards_train/rejected": -3.6580095291137695, + "step": 857 + }, + { + "epoch": 1.14, + "learning_rate": 4.281803602017957e-07, + "loss": 0.0722, + "step": 858 + }, + { + "epoch": 1.14, + "logps_train/chosen": -92.97169494628906, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -101.36845397949219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.270798921585083, + "rewards_train/margins": 4.256472110748291, + "rewards_train/rejected": -2.985673189163208, + "step": 858 + }, + { + "epoch": 1.14, + "logps_train/chosen": -73.10415649414062, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -98.9078369140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9216163754463196, + "rewards_train/margins": 4.317087471485138, + "rewards_train/rejected": -3.3954710960388184, + "step": 859 + }, + { + "epoch": 1.14, + "learning_rate": 4.2779426960778587e-07, + "loss": 0.0547, + "step": 860 + }, + { + "epoch": 1.14, + "logps_train/chosen": -27.713716506958008, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -56.51048278808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9958158731460571, + "rewards_train/margins": 2.969520688056946, + "rewards_train/rejected": -1.9737048149108887, + "step": 860 + }, + { + "epoch": 1.14, + "logps_train/chosen": -38.01380920410156, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -76.73248291015625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.5540881156921387, + "rewards_train/margins": 3.2945237159729004, + "rewards_train/rejected": -2.7404356002807617, + "step": 861 + }, + { + "epoch": 1.14, + "learning_rate": 4.27407319096171e-07, + "loss": 0.1109, + "step": 862 + }, + { + "epoch": 1.14, + "logps_train/chosen": -48.41714096069336, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -89.53971862792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38377442955970764, + "rewards_train/margins": 3.9854022562503815, + "rewards_train/rejected": -3.601627826690674, + "step": 862 + }, + { + "epoch": 1.15, + "logps_train/chosen": -46.59536361694336, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -90.42716979980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1974947154521942, + "rewards_train/margins": 3.5042745769023895, + "rewards_train/rejected": -3.3067798614501953, + "step": 863 + }, + { + "epoch": 1.15, + "learning_rate": 4.270195105384714e-07, + "loss": 0.0837, + "step": 864 + }, + { + "epoch": 1.15, + "logps_train/chosen": -64.23978424072266, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -97.1863784790039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6635218858718872, + "rewards_train/margins": 3.844659924507141, + "rewards_train/rejected": -3.181138038635254, + "step": 864 + }, + { + "epoch": 1.15, + "logps_train/chosen": -62.37693786621094, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -112.25022888183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9805680513381958, + "rewards_train/margins": 3.9509037733078003, + "rewards_train/rejected": -2.9703357219696045, + "step": 865 + }, + { + "epoch": 1.15, + "learning_rate": 4.2663084581035727e-07, + "loss": 0.0839, + "step": 866 + }, + { + "epoch": 1.15, + "logps_train/chosen": -66.78048706054688, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -85.95433044433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6172642707824707, + "rewards_train/margins": 3.6236352920532227, + "rewards_train/rejected": -3.006371021270752, + "step": 866 + }, + { + "epoch": 1.15, + "logps_train/chosen": -46.817909240722656, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -64.48454284667969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20570902526378632, + "rewards_train/margins": 2.3432260006666183, + "rewards_train/rejected": -2.137516975402832, + "step": 867 + }, + { + "epoch": 1.15, + "learning_rate": 4.262413267916396e-07, + "loss": 0.1671, + "step": 868 + }, + { + "epoch": 1.15, + "logps_train/chosen": -108.32286071777344, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -156.0655975341797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9739636182785034, + "rewards_train/margins": 5.496148467063904, + "rewards_train/rejected": -4.5221848487854, + "step": 868 + }, + { + "epoch": 1.15, + "logps_train/chosen": -50.23903274536133, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -90.75271606445312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8834207057952881, + "rewards_train/margins": 3.5180675983428955, + "rewards_train/rejected": -2.6346468925476074, + "step": 869 + }, + { + "epoch": 1.16, + "learning_rate": 4.2585095536626143e-07, + "loss": 0.0436, + "step": 870 + }, + { + "epoch": 1.16, + "logps_train/chosen": -69.28868103027344, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -135.10012817382812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.285194456577301, + "rewards_train/margins": 4.782706797122955, + "rewards_train/rejected": -4.497512340545654, + "step": 870 + }, + { + "epoch": 1.16, + "logps_train/chosen": -49.640625, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -94.07460021972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.17890629172325134, + "rewards_train/margins": 3.473866790533066, + "rewards_train/rejected": -3.2949604988098145, + "step": 871 + }, + { + "epoch": 1.16, + "learning_rate": 4.2545973342228837e-07, + "loss": 0.0504, + "step": 872 + }, + { + "epoch": 1.16, + "logps_train/chosen": -49.62659454345703, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -82.99362182617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4873405396938324, + "rewards_train/margins": 3.574202388525009, + "rewards_train/rejected": -3.0868618488311768, + "step": 872 + }, + { + "epoch": 1.16, + "logps_train/chosen": -87.91971588134766, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -133.903076171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.890059769153595, + "rewards_train/margins": 4.710054457187653, + "rewards_train/rejected": -3.8199946880340576, + "step": 873 + }, + { + "epoch": 1.16, + "learning_rate": 4.2506766285189976e-07, + "loss": 0.0391, + "step": 874 + }, + { + "epoch": 1.16, + "logps_train/chosen": -62.36461639404297, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -108.12089538574219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2940070927143097, + "rewards_train/margins": 3.9748471081256866, + "rewards_train/rejected": -3.680840015411377, + "step": 874 + }, + { + "epoch": 1.16, + "logps_train/chosen": -59.10348892211914, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -78.975830078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7521509528160095, + "rewards_train/margins": 3.5458282828330994, + "rewards_train/rejected": -2.79367733001709, + "step": 875 + }, + { + "epoch": 1.16, + "learning_rate": 4.246747455513794e-07, + "loss": 0.0529, + "step": 876 + }, + { + "epoch": 1.16, + "logps_train/chosen": -45.67261505126953, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -102.71471405029297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8608638048171997, + "rewards_train/margins": 4.508898138999939, + "rewards_train/rejected": -3.6480343341827393, + "step": 876 + }, + { + "epoch": 1.16, + "logps_train/chosen": -49.87294387817383, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -89.14911651611328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.079892873764038, + "rewards_train/margins": 4.429179430007935, + "rewards_train/rejected": -3.3492865562438965, + "step": 877 + }, + { + "epoch": 1.17, + "learning_rate": 4.242809834211063e-07, + "loss": 0.0352, + "step": 878 + }, + { + "epoch": 1.17, + "logps_train/chosen": -42.957130432128906, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -73.38704681396484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2386818528175354, + "rewards_train/margins": 2.2734598517417908, + "rewards_train/rejected": -2.512141704559326, + "step": 878 + }, + { + "epoch": 1.17, + "logps_train/chosen": -56.34748840332031, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -92.78182983398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.16681388020515442, + "rewards_train/margins": 3.2121849358081818, + "rewards_train/rejected": -3.0453710556030273, + "step": 879 + }, + { + "epoch": 1.17, + "learning_rate": 4.238863783655456e-07, + "loss": 0.0929, + "step": 880 + }, + { + "epoch": 1.17, + "logps_train/chosen": -61.931610107421875, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -97.60562133789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1459019184112549, + "rewards_train/margins": 4.078338623046875, + "rewards_train/rejected": -2.93243670463562, + "step": 880 + }, + { + "epoch": 1.17, + "logps_train/chosen": -76.86953735351562, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -112.93692779541016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2130463868379593, + "rewards_train/margins": 3.3524423390626907, + "rewards_train/rejected": -3.1393959522247314, + "step": 881 + }, + { + "epoch": 1.17, + "learning_rate": 4.234909322932393e-07, + "loss": 0.0861, + "step": 882 + }, + { + "epoch": 1.17, + "logps_train/chosen": -53.91347122192383, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -85.40789031982422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.466465562582016, + "rewards_train/margins": 3.154129832983017, + "rewards_train/rejected": -2.687664270401001, + "step": 882 + }, + { + "epoch": 1.17, + "logps_train/chosen": -62.2425537109375, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -92.77987670898438, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.34761956334114075, + "rewards_train/margins": 3.3787329494953156, + "rewards_train/rejected": -3.031113386154175, + "step": 883 + }, + { + "epoch": 1.17, + "learning_rate": 4.230946471167971e-07, + "loss": 0.0892, + "step": 884 + }, + { + "epoch": 1.17, + "logps_train/chosen": -65.4237060546875, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -117.238037109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31231626868247986, + "rewards_train/margins": 4.46736940741539, + "rewards_train/rejected": -4.15505313873291, + "step": 884 + }, + { + "epoch": 1.18, + "logps_train/chosen": -81.63449096679688, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -134.44198608398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.694363534450531, + "rewards_train/margins": 4.832311570644379, + "rewards_train/rejected": -4.137948036193848, + "step": 885 + }, + { + "epoch": 1.18, + "learning_rate": 4.22697524752887e-07, + "loss": 0.0369, + "step": 886 + }, + { + "epoch": 1.18, + "logps_train/chosen": -72.50392150878906, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -92.62897491455078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0265616178512573, + "rewards_train/margins": 3.4859436750411987, + "rewards_train/rejected": -2.4593820571899414, + "step": 886 + }, + { + "epoch": 1.18, + "logps_train/chosen": -68.845947265625, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -133.44363403320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6997804641723633, + "rewards_train/margins": 4.884769439697266, + "rewards_train/rejected": -4.184988975524902, + "step": 887 + }, + { + "epoch": 1.18, + "learning_rate": 4.2229956712222625e-07, + "loss": 0.0456, + "step": 888 + }, + { + "epoch": 1.18, + "logps_train/chosen": -49.1992301940918, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -86.04940032958984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6058585047721863, + "rewards_train/margins": 3.7475168108940125, + "rewards_train/rejected": -3.141658306121826, + "step": 888 + }, + { + "epoch": 1.18, + "logps_train/chosen": -60.015098571777344, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -95.0704345703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2156777381896973, + "rewards_train/margins": 4.1063148975372314, + "rewards_train/rejected": -2.890637159347534, + "step": 889 + }, + { + "epoch": 1.18, + "learning_rate": 4.21900776149572e-07, + "loss": 0.0681, + "step": 890 + }, + { + "epoch": 1.18, + "logps_train/chosen": -85.4272232055664, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -120.84565734863281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.5658717155456543, + "rewards_train/margins": 5.718797206878662, + "rewards_train/rejected": -4.152925491333008, + "step": 890 + }, + { + "epoch": 1.18, + "logps_train/chosen": -77.84867858886719, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -139.948974609375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1088826656341553, + "rewards_train/margins": 6.110029458999634, + "rewards_train/rejected": -5.0011467933654785, + "step": 891 + }, + { + "epoch": 1.18, + "learning_rate": 4.2150115376371165e-07, + "loss": 0.043, + "step": 892 + }, + { + "epoch": 1.18, + "logps_train/chosen": -51.44811248779297, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -93.90492248535156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4833136796951294, + "rewards_train/margins": 4.401930928230286, + "rewards_train/rejected": -3.9186172485351562, + "step": 892 + }, + { + "epoch": 1.19, + "logps_train/chosen": -90.07806396484375, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -124.75885772705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6998106241226196, + "rewards_train/margins": 4.914758801460266, + "rewards_train/rejected": -4.2149481773376465, + "step": 893 + }, + { + "epoch": 1.19, + "learning_rate": 4.2110070189745405e-07, + "loss": 0.0382, + "step": 894 + }, + { + "epoch": 1.19, + "logps_train/chosen": -67.50318145751953, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -96.06961059570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.5653066635131836, + "rewards_train/margins": 4.381642818450928, + "rewards_train/rejected": -2.816336154937744, + "step": 894 + }, + { + "epoch": 1.19, + "logps_train/chosen": -49.06819152832031, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -97.88206481933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0728683471679688, + "rewards_train/margins": 4.471231460571289, + "rewards_train/rejected": -3.3983631134033203, + "step": 895 + }, + { + "epoch": 1.19, + "learning_rate": 4.2069942248761984e-07, + "loss": 0.0444, + "step": 896 + }, + { + "epoch": 1.19, + "logps_train/chosen": -39.21394348144531, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -86.92652893066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42899614572525024, + "rewards_train/margins": 3.643524706363678, + "rewards_train/rejected": -3.2145285606384277, + "step": 896 + }, + { + "epoch": 1.19, + "logps_train/chosen": -49.27470397949219, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -97.69639587402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.4055376052856445, + "rewards_train/margins": 4.906427621841431, + "rewards_train/rejected": -3.500890016555786, + "step": 897 + }, + { + "epoch": 1.19, + "learning_rate": 4.2029731747503215e-07, + "loss": 0.0921, + "step": 898 + }, + { + "epoch": 1.19, + "logps_train/chosen": -42.635555267333984, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -68.8574447631836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7536320686340332, + "rewards_train/margins": 3.321408271789551, + "rewards_train/rejected": -2.5677762031555176, + "step": 898 + }, + { + "epoch": 1.19, + "logps_train/chosen": -65.1983642578125, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -117.09519958496094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0629760026931763, + "rewards_train/margins": 5.020152449607849, + "rewards_train/rejected": -3.957176446914673, + "step": 899 + }, + { + "epoch": 1.2, + "learning_rate": 4.198943888045072e-07, + "loss": 0.0711, + "step": 900 + }, + { + "epoch": 1.2, + "logps_train/chosen": -72.8696060180664, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -122.85391998291016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9849143624305725, + "rewards_train/margins": 5.079681932926178, + "rewards_train/rejected": -4.0947675704956055, + "step": 900 + }, + { + "epoch": 1.2, + "logps_train/chosen": -67.7914047241211, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -104.09950256347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0138282775878906, + "rewards_train/margins": 4.64018440246582, + "rewards_train/rejected": -3.6263561248779297, + "step": 901 + }, + { + "epoch": 1.2, + "learning_rate": 4.194906384248449e-07, + "loss": 0.0427, + "step": 902 + }, + { + "epoch": 1.2, + "logps_train/chosen": -69.12123107910156, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -129.08428955078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.9535012245178223, + "rewards_train/margins": 6.556071758270264, + "rewards_train/rejected": -4.602570533752441, + "step": 902 + }, + { + "epoch": 1.2, + "logps_train/chosen": -48.77479934692383, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -63.59720993041992, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.425645112991333, + "rewards_train/margins": 3.7392725944519043, + "rewards_train/rejected": -2.3136274814605713, + "step": 903 + }, + { + "epoch": 1.2, + "learning_rate": 4.190860682888194e-07, + "loss": 0.0277, + "step": 904 + }, + { + "epoch": 1.2, + "logps_train/chosen": -42.04167556762695, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -76.12769317626953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.35286375880241394, + "rewards_train/margins": 3.1812581717967987, + "rewards_train/rejected": -2.8283944129943848, + "step": 904 + }, + { + "epoch": 1.2, + "logps_train/chosen": -57.059051513671875, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -128.88531494140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1472200155258179, + "rewards_train/margins": 5.173252463340759, + "rewards_train/rejected": -4.026032447814941, + "step": 905 + }, + { + "epoch": 1.2, + "learning_rate": 4.186806803531697e-07, + "loss": 0.0597, + "step": 906 + }, + { + "epoch": 1.2, + "logps_train/chosen": -30.11660385131836, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -66.88246154785156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45025354623794556, + "rewards_train/margins": 2.873753845691681, + "rewards_train/rejected": -2.4235002994537354, + "step": 906 + }, + { + "epoch": 1.2, + "logps_train/chosen": -51.4620361328125, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -101.95337677001953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.014953434467315674, + "rewards_train/margins": 4.089759528636932, + "rewards_train/rejected": -4.104712963104248, + "step": 907 + }, + { + "epoch": 1.21, + "learning_rate": 4.1827447657859024e-07, + "loss": 0.0841, + "step": 908 + }, + { + "epoch": 1.21, + "logps_train/chosen": -52.18760681152344, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -100.23721313476562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42342692613601685, + "rewards_train/margins": 4.464335262775421, + "rewards_train/rejected": -4.040908336639404, + "step": 908 + }, + { + "epoch": 1.21, + "logps_train/chosen": -35.78356170654297, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -82.17020416259766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6942998766899109, + "rewards_train/margins": 3.747257649898529, + "rewards_train/rejected": -3.052957773208618, + "step": 909 + }, + { + "epoch": 1.21, + "learning_rate": 4.178674589297212e-07, + "loss": 0.0391, + "step": 910 + }, + { + "epoch": 1.21, + "logps_train/chosen": -72.89419555664062, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -136.70672607421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3840184807777405, + "rewards_train/margins": 5.401565611362457, + "rewards_train/rejected": -5.017547130584717, + "step": 910 + }, + { + "epoch": 1.21, + "logps_train/chosen": -39.258819580078125, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -86.83905029296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6694308519363403, + "rewards_train/margins": 3.982242226600647, + "rewards_train/rejected": -3.3128113746643066, + "step": 911 + }, + { + "epoch": 1.21, + "learning_rate": 4.174596293751391e-07, + "loss": 0.0796, + "step": 912 + }, + { + "epoch": 1.21, + "logps_train/chosen": -81.449462890625, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -141.7239990234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1539310216903687, + "rewards_train/margins": 4.428674101829529, + "rewards_train/rejected": -3.27474308013916, + "step": 912 + }, + { + "epoch": 1.21, + "logps_train/chosen": -55.227561950683594, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -97.64739990234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.3116190433502197, + "rewards_train/margins": 4.6638593673706055, + "rewards_train/rejected": -3.3522403240203857, + "step": 913 + }, + { + "epoch": 1.21, + "learning_rate": 4.1705098988734767e-07, + "loss": 0.0277, + "step": 914 + }, + { + "epoch": 1.21, + "logps_train/chosen": -44.630889892578125, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -88.82794189453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4822237193584442, + "rewards_train/margins": 3.8165800273418427, + "rewards_train/rejected": -3.3343563079833984, + "step": 914 + }, + { + "epoch": 1.22, + "logps_train/chosen": -38.097747802734375, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -82.90342712402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1652250587940216, + "rewards_train/margins": 3.004395514726639, + "rewards_train/rejected": -2.839170455932617, + "step": 915 + }, + { + "epoch": 1.22, + "learning_rate": 4.166415424427675e-07, + "loss": 0.0809, + "step": 916 + }, + { + "epoch": 1.22, + "logps_train/chosen": -53.195701599121094, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -86.19358825683594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1261327862739563, + "rewards_train/margins": 2.937678635120392, + "rewards_train/rejected": -2.8115458488464355, + "step": 916 + }, + { + "epoch": 1.22, + "logps_train/chosen": -54.32292175292969, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -92.09500122070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.02447935938835144, + "rewards_train/margins": 3.611582785844803, + "rewards_train/rejected": -3.6360621452331543, + "step": 917 + }, + { + "epoch": 1.22, + "learning_rate": 4.162312890217272e-07, + "loss": 0.102, + "step": 918 + }, + { + "epoch": 1.22, + "logps_train/chosen": -61.09229278564453, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -76.89395141601562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08847537636756897, + "rewards_train/margins": 2.3045794069767, + "rewards_train/rejected": -2.216104030609131, + "step": 918 + }, + { + "epoch": 1.22, + "logps_train/chosen": -50.561588287353516, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -92.11100769042969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5789974331855774, + "rewards_train/margins": 4.27916032075882, + "rewards_train/rejected": -3.700162887573242, + "step": 919 + }, + { + "epoch": 1.22, + "learning_rate": 4.1582023160845343e-07, + "loss": 0.1111, + "step": 920 + }, + { + "epoch": 1.22, + "logps_train/chosen": -55.1916389465332, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -109.59123992919922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9136486649513245, + "rewards_train/margins": 4.693476021289825, + "rewards_train/rejected": -3.779827356338501, + "step": 920 + }, + { + "epoch": 1.22, + "logps_train/chosen": -40.989498138427734, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -77.27867126464844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8307376503944397, + "rewards_train/margins": 3.8484485745429993, + "rewards_train/rejected": -3.0177109241485596, + "step": 921 + }, + { + "epoch": 1.22, + "learning_rate": 4.154083721910615e-07, + "loss": 0.0287, + "step": 922 + }, + { + "epoch": 1.22, + "logps_train/chosen": -63.161231994628906, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -85.92396545410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.07237371802330017, + "rewards_train/margins": 2.9204141199588776, + "rewards_train/rejected": -2.9927878379821777, + "step": 922 + }, + { + "epoch": 1.23, + "logps_train/chosen": -70.62162780761719, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -142.40940856933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9472123384475708, + "rewards_train/margins": 4.892840504646301, + "rewards_train/rejected": -3.9456281661987305, + "step": 923 + }, + { + "epoch": 1.23, + "learning_rate": 4.149957127615457e-07, + "loss": 0.0843, + "step": 924 + }, + { + "epoch": 1.23, + "logps_train/chosen": -63.873023986816406, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -96.33056640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15332257747650146, + "rewards_train/margins": 3.598683714866638, + "rewards_train/rejected": -3.4453611373901367, + "step": 924 + }, + { + "epoch": 1.23, + "logps_train/chosen": -35.75093078613281, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -94.66659545898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7197617888450623, + "rewards_train/margins": 4.382905662059784, + "rewards_train/rejected": -3.6631438732147217, + "step": 925 + }, + { + "epoch": 1.23, + "learning_rate": 4.145822553157695e-07, + "loss": 0.0711, + "step": 926 + }, + { + "epoch": 1.23, + "logps_train/chosen": -44.981101989746094, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -88.52735900878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7448585033416748, + "rewards_train/margins": 4.309313058853149, + "rewards_train/rejected": -3.5644545555114746, + "step": 926 + }, + { + "epoch": 1.23, + "logps_train/chosen": -48.46117401123047, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -82.884765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6343510150909424, + "rewards_train/margins": 3.315014600753784, + "rewards_train/rejected": -2.680663585662842, + "step": 927 + }, + { + "epoch": 1.23, + "learning_rate": 4.141680018534563e-07, + "loss": 0.0584, + "step": 928 + }, + { + "epoch": 1.23, + "logps_train/chosen": -62.465728759765625, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -94.887939453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9940519332885742, + "rewards_train/margins": 4.067220687866211, + "rewards_train/rejected": -3.0731687545776367, + "step": 928 + }, + { + "epoch": 1.23, + "logps_train/chosen": -53.429527282714844, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -99.75729370117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1289224624633789, + "rewards_train/margins": 4.048401355743408, + "rewards_train/rejected": -3.9194788932800293, + "step": 929 + }, + { + "epoch": 1.24, + "learning_rate": 4.137529543781794e-07, + "loss": 0.0342, + "step": 930 + }, + { + "epoch": 1.24, + "logps_train/chosen": -38.94437026977539, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -82.13406372070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0071253180503845215, + "rewards_train/margins": 3.622875154018402, + "rewards_train/rejected": -3.6157498359680176, + "step": 930 + }, + { + "epoch": 1.24, + "logps_train/chosen": -70.44154357910156, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -117.73814392089844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1511577367782593, + "rewards_train/margins": 4.334346890449524, + "rewards_train/rejected": -3.1831891536712646, + "step": 931 + }, + { + "epoch": 1.24, + "learning_rate": 4.1333711489735224e-07, + "loss": 0.0601, + "step": 932 + }, + { + "epoch": 1.24, + "logps_train/chosen": -34.402015686035156, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -73.76387023925781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5648766756057739, + "rewards_train/margins": 3.2592321634292603, + "rewards_train/rejected": -2.6943554878234863, + "step": 932 + }, + { + "epoch": 1.24, + "logps_train/chosen": -46.12580490112305, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -67.19065856933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.932732105255127, + "rewards_train/margins": 3.4642982482910156, + "rewards_train/rejected": -2.5315661430358887, + "step": 933 + }, + { + "epoch": 1.24, + "learning_rate": 4.12920485422219e-07, + "loss": 0.0891, + "step": 934 + }, + { + "epoch": 1.24, + "logps_train/chosen": -40.333343505859375, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -74.30396270751953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2391263246536255, + "rewards_train/margins": 3.0122958421707153, + "rewards_train/rejected": -2.77316951751709, + "step": 934 + }, + { + "epoch": 1.24, + "logps_train/chosen": -66.84693908691406, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -158.54598999023438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.4332746267318726, + "rewards_train/margins": 5.669124960899353, + "rewards_train/rejected": -4.2358503341674805, + "step": 935 + }, + { + "epoch": 1.24, + "learning_rate": 4.1250306796784486e-07, + "loss": 0.0859, + "step": 936 + }, + { + "epoch": 1.24, + "logps_train/chosen": -59.74602508544922, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -78.83283233642578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20821015536785126, + "rewards_train/margins": 3.857118770480156, + "rewards_train/rejected": -3.6489086151123047, + "step": 936 + }, + { + "epoch": 1.24, + "logps_train/chosen": -59.689247131347656, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -115.943115234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0295121669769287, + "rewards_train/margins": 4.8488242626190186, + "rewards_train/rejected": -3.81931209564209, + "step": 937 + }, + { + "epoch": 1.25, + "learning_rate": 4.120848645531059e-07, + "loss": 0.0394, + "step": 938 + }, + { + "epoch": 1.25, + "logps_train/chosen": -62.38712692260742, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -110.88436889648438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2691000699996948, + "rewards_train/margins": 4.173161625862122, + "rewards_train/rejected": -3.9040615558624268, + "step": 938 + }, + { + "epoch": 1.25, + "logps_train/chosen": -46.03522491455078, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -94.3001937866211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.723039984703064, + "rewards_train/margins": 4.403059124946594, + "rewards_train/rejected": -3.6800191402435303, + "step": 939 + }, + { + "epoch": 1.25, + "learning_rate": 4.116658772006797e-07, + "loss": 0.0427, + "step": 940 + }, + { + "epoch": 1.25, + "logps_train/chosen": -47.388282775878906, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -82.37748718261719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7486715316772461, + "rewards_train/margins": 3.9637644290924072, + "rewards_train/rejected": -3.215092897415161, + "step": 940 + }, + { + "epoch": 1.25, + "logps_train/chosen": -79.65668487548828, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -135.0927734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.882768988609314, + "rewards_train/margins": 5.68110978603363, + "rewards_train/rejected": -4.798340797424316, + "step": 941 + }, + { + "epoch": 1.25, + "learning_rate": 4.1124610793703554e-07, + "loss": 0.0444, + "step": 942 + }, + { + "epoch": 1.25, + "logps_train/chosen": -58.79033660888672, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -99.20828247070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0990912914276123, + "rewards_train/margins": 3.752732515335083, + "rewards_train/rejected": -3.6536412239074707, + "step": 942 + }, + { + "epoch": 1.25, + "logps_train/chosen": -70.18021392822266, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -97.68711853027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8538536429405212, + "rewards_train/margins": 3.2772530913352966, + "rewards_train/rejected": -2.4233994483947754, + "step": 943 + }, + { + "epoch": 1.25, + "learning_rate": 4.108255587924241e-07, + "loss": 0.0913, + "step": 944 + }, + { + "epoch": 1.25, + "logps_train/chosen": -29.704769134521484, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -63.272972106933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9445928335189819, + "rewards_train/margins": 3.0562652349472046, + "rewards_train/rejected": -2.1116724014282227, + "step": 944 + }, + { + "epoch": 1.25, + "logps_train/chosen": -59.856346130371094, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -145.3099822998047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6221776008605957, + "rewards_train/margins": 5.682863712310791, + "rewards_train/rejected": -5.060686111450195, + "step": 945 + }, + { + "epoch": 1.26, + "learning_rate": 4.1040423180086835e-07, + "loss": 0.0693, + "step": 946 + }, + { + "epoch": 1.26, + "logps_train/chosen": -30.28253173828125, + "logps_train/ref_chosen": -36.25, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -71.4852294921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.605145275592804, + "rewards_train/margins": 3.775543510913849, + "rewards_train/rejected": -3.170398235321045, + "step": 946 + }, + { + "epoch": 1.26, + "logps_train/chosen": -65.02572631835938, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -73.66020202636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2982089519500732, + "rewards_train/margins": 3.4173545837402344, + "rewards_train/rejected": -2.119145631790161, + "step": 947 + }, + { + "epoch": 1.26, + "learning_rate": 4.0998212900015343e-07, + "loss": 0.0795, + "step": 948 + }, + { + "epoch": 1.26, + "logps_train/chosen": -50.15205383300781, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -115.87500762939453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9674117565155029, + "rewards_train/margins": 4.726006984710693, + "rewards_train/rejected": -3.7585952281951904, + "step": 948 + }, + { + "epoch": 1.26, + "logps_train/chosen": -74.43000793457031, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -143.4873046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.192936658859253, + "rewards_train/margins": 5.982293367385864, + "rewards_train/rejected": -4.789356708526611, + "step": 949 + }, + { + "epoch": 1.26, + "learning_rate": 4.095592524318165e-07, + "loss": 0.0247, + "step": 950 + }, + { + "epoch": 1.26, + "logps_train/chosen": -52.825592041015625, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -111.33204650878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04271543025970459, + "rewards_train/margins": 3.9686142206192017, + "rewards_train/rejected": -4.011329650878906, + "step": 950 + }, + { + "epoch": 1.26, + "logps_train/chosen": -62.18234634399414, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -97.35087585449219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5028594136238098, + "rewards_train/margins": 3.570759356021881, + "rewards_train/rejected": -3.0678999423980713, + "step": 951 + }, + { + "epoch": 1.26, + "learning_rate": 4.0913560414113725e-07, + "loss": 0.0887, + "step": 952 + }, + { + "epoch": 1.26, + "logps_train/chosen": -76.3884506225586, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -121.98712158203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7404517531394958, + "rewards_train/margins": 4.998538911342621, + "rewards_train/rejected": -4.258087158203125, + "step": 952 + }, + { + "epoch": 1.27, + "logps_train/chosen": -34.61338806152344, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -62.511173248291016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3394424617290497, + "rewards_train/margins": 2.681575506925583, + "rewards_train/rejected": -2.342133045196533, + "step": 953 + }, + { + "epoch": 1.27, + "learning_rate": 4.0871118617712785e-07, + "loss": 0.1339, + "step": 954 + }, + { + "epoch": 1.27, + "logps_train/chosen": -68.00106811523438, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -101.47566223144531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.44364291429519653, + "rewards_train/margins": 3.934958755970001, + "rewards_train/rejected": -3.4913158416748047, + "step": 954 + }, + { + "epoch": 1.27, + "logps_train/chosen": -41.08306121826172, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -77.29725646972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.49677225947380066, + "rewards_train/margins": 3.9405607879161835, + "rewards_train/rejected": -3.443788528442383, + "step": 955 + }, + { + "epoch": 1.27, + "learning_rate": 4.082860005925231e-07, + "loss": 0.0729, + "step": 956 + }, + { + "epoch": 1.27, + "logps_train/chosen": -56.42616653442383, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -122.04458618164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5136333703994751, + "rewards_train/margins": 5.108717560768127, + "rewards_train/rejected": -4.595084190368652, + "step": 956 + }, + { + "epoch": 1.27, + "logps_train/chosen": -62.269317626953125, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -107.33318328857422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6949433088302612, + "rewards_train/margins": 4.119667887687683, + "rewards_train/rejected": -3.424724578857422, + "step": 957 + }, + { + "epoch": 1.27, + "learning_rate": 4.0786004944377043e-07, + "loss": 0.0289, + "step": 958 + }, + { + "epoch": 1.27, + "logps_train/chosen": -42.874794006347656, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -39.75, + "logps_train/rejected": -63.91020584106445, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9953328371047974, + "rewards_train/margins": 3.4183846712112427, + "rewards_train/rejected": -2.4230518341064453, + "step": 958 + }, + { + "epoch": 1.27, + "logps_train/chosen": -42.67525100708008, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -79.3193359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3605998158454895, + "rewards_train/margins": 3.5761268734931946, + "rewards_train/rejected": -3.215527057647705, + "step": 959 + }, + { + "epoch": 1.27, + "learning_rate": 4.0743333479102e-07, + "loss": 0.0704, + "step": 960 + }, + { + "epoch": 1.27, + "logps_train/chosen": -66.95230102539062, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -112.90501403808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5758639574050903, + "rewards_train/margins": 4.456990361213684, + "rewards_train/rejected": -3.8811264038085938, + "step": 960 + }, + { + "epoch": 1.28, + "logps_train/chosen": -37.265350341796875, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -38.75, + "logps_train/rejected": -59.17976379394531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.27112123370170593, + "rewards_train/margins": 2.303160399198532, + "rewards_train/rejected": -2.032039165496826, + "step": 961 + }, + { + "epoch": 1.28, + "learning_rate": 4.0700585869811465e-07, + "loss": 0.108, + "step": 962 + }, + { + "epoch": 1.28, + "logps_train/chosen": -40.67923355102539, + "logps_train/ref_chosen": -42.5, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -75.93785095214844, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.19348713755607605, + "rewards_train/margins": 2.821647137403488, + "rewards_train/rejected": -2.628159999847412, + "step": 962 + }, + { + "epoch": 1.28, + "logps_train/chosen": -54.76226043701172, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -131.7702178955078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2643990516662598, + "rewards_train/margins": 5.339857578277588, + "rewards_train/rejected": -4.075458526611328, + "step": 963 + }, + { + "epoch": 1.28, + "learning_rate": 4.0657762323258014e-07, + "loss": 0.0975, + "step": 964 + }, + { + "epoch": 1.28, + "logps_train/chosen": -83.19999694824219, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -124.19242858886719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.978438138961792, + "rewards_train/margins": 4.899243593215942, + "rewards_train/rejected": -3.9208054542541504, + "step": 964 + }, + { + "epoch": 1.28, + "logps_train/chosen": -73.39173126220703, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -113.95267486572266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08270162343978882, + "rewards_train/margins": 3.8014068007469177, + "rewards_train/rejected": -3.718705177307129, + "step": 965 + }, + { + "epoch": 1.28, + "learning_rate": 4.061486304656149e-07, + "loss": 0.0427, + "step": 966 + }, + { + "epoch": 1.28, + "logps_train/chosen": -49.0689582824707, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -98.26251983642578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8099011182785034, + "rewards_train/margins": 4.797090411186218, + "rewards_train/rejected": -3.987189292907715, + "step": 966 + }, + { + "epoch": 1.28, + "logps_train/chosen": -66.22107696533203, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -121.72024536132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8544548153877258, + "rewards_train/margins": 5.161636173725128, + "rewards_train/rejected": -4.307181358337402, + "step": 967 + }, + { + "epoch": 1.29, + "learning_rate": 4.057188824720801e-07, + "loss": 0.0643, + "step": 968 + }, + { + "epoch": 1.29, + "logps_train/chosen": -80.0987319946289, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -117.8000717163086, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.4901275038719177, + "rewards_train/margins": 3.670134961605072, + "rewards_train/rejected": -3.1800074577331543, + "step": 968 + }, + { + "epoch": 1.29, + "logps_train/chosen": -51.99189758300781, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -73.40035247802734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2125288248062134, + "rewards_train/margins": 3.9181896448135376, + "rewards_train/rejected": -2.705660820007324, + "step": 969 + }, + { + "epoch": 1.29, + "learning_rate": 4.052883813304897e-07, + "loss": 0.1029, + "step": 970 + }, + { + "epoch": 1.29, + "logps_train/chosen": -41.81707000732422, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -108.32887268066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8354808688163757, + "rewards_train/margins": 4.230868756771088, + "rewards_train/rejected": -3.395387887954712, + "step": 970 + }, + { + "epoch": 1.29, + "logps_train/chosen": -70.6596908569336, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -91.08563995361328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5168430805206299, + "rewards_train/margins": 3.537907123565674, + "rewards_train/rejected": -3.021064043045044, + "step": 971 + }, + { + "epoch": 1.29, + "learning_rate": 4.048571291230003e-07, + "loss": 0.071, + "step": 972 + }, + { + "epoch": 1.29, + "logps_train/chosen": -36.96552276611328, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -35.25, + "logps_train/rejected": -63.379859924316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6413381099700928, + "rewards_train/margins": 3.4558866024017334, + "rewards_train/rejected": -2.8145484924316406, + "step": 972 + }, + { + "epoch": 1.29, + "logps_train/chosen": -62.48179626464844, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -95.2090835571289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1712360382080078, + "rewards_train/margins": 4.900933265686035, + "rewards_train/rejected": -3.7296972274780273, + "step": 973 + }, + { + "epoch": 1.29, + "learning_rate": 4.0442512793540107e-07, + "loss": 0.1032, + "step": 974 + }, + { + "epoch": 1.29, + "logps_train/chosen": -42.25353240966797, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -52.122840881347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2648809850215912, + "rewards_train/margins": 1.7584150731563568, + "rewards_train/rejected": -1.4935340881347656, + "step": 974 + }, + { + "epoch": 1.29, + "logps_train/chosen": -80.43029022216797, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -153.0605010986328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08822091668844223, + "rewards_train/margins": 5.933333240449429, + "rewards_train/rejected": -5.845112323760986, + "step": 975 + }, + { + "epoch": 1.3, + "learning_rate": 4.0399237985710365e-07, + "loss": 0.1042, + "step": 976 + }, + { + "epoch": 1.3, + "logps_train/chosen": -42.721588134765625, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -95.30130767822266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.16502982378005981, + "rewards_train/margins": 3.906507194042206, + "rewards_train/rejected": -4.071537017822266, + "step": 976 + }, + { + "epoch": 1.3, + "logps_train/chosen": -50.351463317871094, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -101.41901397705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9757912158966064, + "rewards_train/margins": 4.138005495071411, + "rewards_train/rejected": -3.1622142791748047, + "step": 977 + }, + { + "epoch": 1.3, + "learning_rate": 4.0355888698113227e-07, + "loss": 0.0927, + "step": 978 + }, + { + "epoch": 1.3, + "logps_train/chosen": -73.92927551269531, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -94.5, + "logps_train/rejected": -141.10940551757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23988446593284607, + "rewards_train/margins": 4.886761277914047, + "rewards_train/rejected": -4.646876811981201, + "step": 978 + }, + { + "epoch": 1.3, + "logps_train/chosen": -33.883792877197266, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -44.75, + "logps_train/rejected": -76.19090270996094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6881831884384155, + "rewards_train/margins": 3.8439923524856567, + "rewards_train/rejected": -3.155809164047241, + "step": 979 + }, + { + "epoch": 1.3, + "learning_rate": 4.0312465140411323e-07, + "loss": 0.0837, + "step": 980 + }, + { + "epoch": 1.3, + "logps_train/chosen": -74.60830688476562, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -101.61077880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.5969820022583008, + "rewards_train/margins": 4.809622049331665, + "rewards_train/rejected": -3.2126400470733643, + "step": 980 + }, + { + "epoch": 1.3, + "logps_train/chosen": -45.12177276611328, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -107.99531555175781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3663383722305298, + "rewards_train/margins": 4.236182332038879, + "rewards_train/rejected": -3.8698439598083496, + "step": 981 + }, + { + "epoch": 1.3, + "learning_rate": 4.02689675226265e-07, + "loss": 0.0304, + "step": 982 + }, + { + "epoch": 1.3, + "logps_train/chosen": -71.71450805664062, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -108.8749771118164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8519866466522217, + "rewards_train/margins": 3.9590156078338623, + "rewards_train/rejected": -3.1070289611816406, + "step": 982 + }, + { + "epoch": 1.31, + "logps_train/chosen": -44.77873992919922, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -90.83544921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6157783269882202, + "rewards_train/margins": 3.8610416650772095, + "rewards_train/rejected": -3.2452633380889893, + "step": 983 + }, + { + "epoch": 1.31, + "learning_rate": 4.022539605513882e-07, + "loss": 0.0651, + "step": 984 + }, + { + "epoch": 1.31, + "logps_train/chosen": -69.95819854736328, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -108.64181518554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.46886688470840454, + "rewards_train/margins": 3.9062523245811462, + "rewards_train/rejected": -4.375119209289551, + "step": 984 + }, + { + "epoch": 1.31, + "logps_train/chosen": -53.61906814575195, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -96.06985473632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6142650246620178, + "rewards_train/margins": 3.8251567482948303, + "rewards_train/rejected": -3.2108917236328125, + "step": 985 + }, + { + "epoch": 1.31, + "learning_rate": 4.01817509486855e-07, + "loss": 0.1021, + "step": 986 + }, + { + "epoch": 1.31, + "logps_train/chosen": -70.08271026611328, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -116.93687438964844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9011039137840271, + "rewards_train/margins": 5.472917020320892, + "rewards_train/rejected": -4.571813106536865, + "step": 986 + }, + { + "epoch": 1.31, + "logps_train/chosen": -106.01170349121094, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -136.1728515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5050796866416931, + "rewards_train/margins": 4.554005563259125, + "rewards_train/rejected": -4.048925876617432, + "step": 987 + }, + { + "epoch": 1.31, + "learning_rate": 4.013803241435995e-07, + "loss": 0.0495, + "step": 988 + }, + { + "epoch": 1.31, + "logps_train/chosen": -39.71161651611328, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -34.75, + "logps_train/rejected": -60.27724075317383, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2882133722305298, + "rewards_train/margins": 3.8319531679153442, + "rewards_train/rejected": -2.5437397956848145, + "step": 988 + }, + { + "epoch": 1.31, + "logps_train/chosen": -69.28598022460938, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -120.88604736328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12687040865421295, + "rewards_train/margins": 3.9264133125543594, + "rewards_train/rejected": -3.7995429039001465, + "step": 989 + }, + { + "epoch": 1.31, + "learning_rate": 4.009424066361068e-07, + "loss": 0.0698, + "step": 990 + }, + { + "epoch": 1.31, + "logps_train/chosen": -49.20613479614258, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -100.90812683105469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2520427405834198, + "rewards_train/margins": 4.060043066740036, + "rewards_train/rejected": -3.808000326156616, + "step": 990 + }, + { + "epoch": 1.32, + "logps_train/chosen": -59.874046325683594, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -83.2501220703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6211891174316406, + "rewards_train/margins": 3.183701992034912, + "rewards_train/rejected": -2.5625128746032715, + "step": 991 + }, + { + "epoch": 1.32, + "learning_rate": 4.0050375908240354e-07, + "loss": 0.0626, + "step": 992 + }, + { + "epoch": 1.32, + "logps_train/chosen": -37.99253463745117, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -54.0, + "logps_train/rejected": -87.21085357666016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5360980033874512, + "rewards_train/margins": 3.845561981201172, + "rewards_train/rejected": -3.3094639778137207, + "step": 992 + }, + { + "epoch": 1.32, + "logps_train/chosen": -41.14564514160156, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -87.1806640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6236007213592529, + "rewards_train/margins": 4.1629555225372314, + "rewards_train/rejected": -3.5393548011779785, + "step": 993 + }, + { + "epoch": 1.32, + "learning_rate": 4.0006438360404706e-07, + "loss": 0.0665, + "step": 994 + }, + { + "epoch": 1.32, + "logps_train/chosen": -46.028621673583984, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -87.7632827758789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9674502611160278, + "rewards_train/margins": 4.587528586387634, + "rewards_train/rejected": -3.6200783252716064, + "step": 994 + }, + { + "epoch": 1.32, + "logps_train/chosen": -56.307796478271484, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -104.26795959472656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.17390775680542, + "rewards_train/margins": 4.7124223709106445, + "rewards_train/rejected": -3.5385146141052246, + "step": 995 + }, + { + "epoch": 1.32, + "learning_rate": 3.9962428232611557e-07, + "loss": 0.0223, + "step": 996 + }, + { + "epoch": 1.32, + "logps_train/chosen": -53.28760528564453, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -90.47726440429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23881781101226807, + "rewards_train/margins": 3.460908532142639, + "rewards_train/rejected": -3.222090721130371, + "step": 996 + }, + { + "epoch": 1.32, + "logps_train/chosen": -53.569358825683594, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -82.82012176513672, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.3407207727432251, + "rewards_train/margins": 2.681327223777771, + "rewards_train/rejected": -2.340606451034546, + "step": 997 + }, + { + "epoch": 1.33, + "learning_rate": 3.991834573771975e-07, + "loss": 0.152, + "step": 998 + }, + { + "epoch": 1.33, + "logps_train/chosen": -53.26763153076172, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -101.80069732666016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5872995853424072, + "rewards_train/margins": 4.925963640213013, + "rewards_train/rejected": -4.3386640548706055, + "step": 998 + }, + { + "epoch": 1.33, + "logps_train/chosen": -51.77572250366211, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -97.53910827636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1880526542663574, + "rewards_train/margins": 4.655244827270508, + "rewards_train/rejected": -3.4671921730041504, + "step": 999 + }, + { + "epoch": 1.33, + "learning_rate": 3.9874191088938145e-07, + "loss": 0.0473, + "step": 1000 + }, + { + "epoch": 1.33, + "logps_train/chosen": -67.94255828857422, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -109.68108367919922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42918145656585693, + "rewards_train/margins": 3.4801026582717896, + "rewards_train/rejected": -3.0509212017059326, + "step": 1000 + }, + { + "epoch": 1.33, + "logps_train/chosen": -53.12202453613281, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -89.68846130371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.049516201019287, + "rewards_train/margins": 4.687112092971802, + "rewards_train/rejected": -3.6375958919525146, + "step": 1001 + }, + { + "epoch": 1.33, + "learning_rate": 3.9829964499824584e-07, + "loss": 0.0681, + "step": 1002 + }, + { + "epoch": 1.33, + "logps_train/chosen": -45.417903900146484, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -136.85548400878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7019596099853516, + "rewards_train/margins": 5.3281331062316895, + "rewards_train/rejected": -4.626173496246338, + "step": 1002 + }, + { + "epoch": 1.33, + "logps_train/chosen": -80.79769897460938, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -133.72149658203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1256986856460571, + "rewards_train/margins": 4.87206757068634, + "rewards_train/rejected": -3.746368885040283, + "step": 1003 + }, + { + "epoch": 1.33, + "learning_rate": 3.9785666184284845e-07, + "loss": 0.0213, + "step": 1004 + }, + { + "epoch": 1.33, + "logps_train/chosen": -56.036407470703125, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -94.72097778320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.4760464429855347, + "rewards_train/margins": 5.1372071504592896, + "rewards_train/rejected": -3.661160707473755, + "step": 1004 + }, + { + "epoch": 1.33, + "logps_train/chosen": -83.94972229003906, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -139.4557647705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15659064054489136, + "rewards_train/margins": 5.177167117595673, + "rewards_train/rejected": -5.020576477050781, + "step": 1005 + }, + { + "epoch": 1.34, + "learning_rate": 3.974129635657162e-07, + "loss": 0.0134, + "step": 1006 + }, + { + "epoch": 1.34, + "logps_train/chosen": -70.18133544921875, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -111.02115631103516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0607725381851196, + "rewards_train/margins": 4.70077908039093, + "rewards_train/rejected": -3.6400065422058105, + "step": 1006 + }, + { + "epoch": 1.34, + "logps_train/chosen": -50.34516525268555, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -115.85397338867188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9792284965515137, + "rewards_train/margins": 4.222438097000122, + "rewards_train/rejected": -3.2432096004486084, + "step": 1007 + }, + { + "epoch": 1.34, + "learning_rate": 3.969685523128349e-07, + "loss": 0.0694, + "step": 1008 + }, + { + "epoch": 1.34, + "logps_train/chosen": -52.69221496582031, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -98.31047058105469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8526535034179688, + "rewards_train/margins": 4.3962016105651855, + "rewards_train/rejected": -3.543548107147217, + "step": 1008 + }, + { + "epoch": 1.34, + "logps_train/chosen": -45.293094635009766, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -101.41578674316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3120967149734497, + "rewards_train/margins": 4.282581686973572, + "rewards_train/rejected": -3.970484972000122, + "step": 1009 + }, + { + "epoch": 1.34, + "learning_rate": 3.9652343023363854e-07, + "loss": 0.0461, + "step": 1010 + }, + { + "epoch": 1.34, + "logps_train/chosen": -65.95108795166016, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -109.56602478027344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4259849488735199, + "rewards_train/margins": 4.029852956533432, + "rewards_train/rejected": -3.603868007659912, + "step": 1010 + }, + { + "epoch": 1.34, + "logps_train/chosen": -54.834251403808594, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -94.55369567871094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6900125741958618, + "rewards_train/margins": 4.3539756536483765, + "rewards_train/rejected": -3.6639630794525146, + "step": 1011 + }, + { + "epoch": 1.34, + "learning_rate": 3.960775994809992e-07, + "loss": 0.0807, + "step": 1012 + }, + { + "epoch": 1.34, + "logps_train/chosen": -36.85002517700195, + "logps_train/ref_chosen": -34.75, + "logps_train/ref_rejected": -37.5, + "logps_train/rejected": -71.35670471191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2029712200164795, + "rewards_train/margins": 3.1733243465423584, + "rewards_train/rejected": -3.376295566558838, + "step": 1012 + }, + { + "epoch": 1.35, + "logps_train/chosen": -60.6200065612793, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -89.31283569335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.059874415397644, + "rewards_train/margins": 3.7856889963150024, + "rewards_train/rejected": -2.7258145809173584, + "step": 1013 + }, + { + "epoch": 1.35, + "learning_rate": 3.956310622112165e-07, + "loss": 0.072, + "step": 1014 + }, + { + "epoch": 1.35, + "logps_train/chosen": -67.36927795410156, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -101.2596664428711, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9169785976409912, + "rewards_train/margins": 4.32224178314209, + "rewards_train/rejected": -3.4052631855010986, + "step": 1014 + }, + { + "epoch": 1.35, + "logps_train/chosen": -41.60442352294922, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -96.79115295410156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1582099199295044, + "rewards_train/margins": 3.336934447288513, + "rewards_train/rejected": -3.178724527359009, + "step": 1015 + }, + { + "epoch": 1.35, + "learning_rate": 3.9518382058400704e-07, + "loss": 0.1142, + "step": 1016 + }, + { + "epoch": 1.35, + "logps_train/chosen": -52.19397735595703, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -83.73917388916016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6188836097717285, + "rewards_train/margins": 3.8568637371063232, + "rewards_train/rejected": -3.2379801273345947, + "step": 1016 + }, + { + "epoch": 1.35, + "logps_train/chosen": -65.00798034667969, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -91.54377746582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6523267030715942, + "rewards_train/margins": 3.672329068183899, + "rewards_train/rejected": -3.0200023651123047, + "step": 1017 + }, + { + "epoch": 1.35, + "learning_rate": 3.947358767624944e-07, + "loss": 0.0424, + "step": 1018 + }, + { + "epoch": 1.35, + "logps_train/chosen": -59.24694061279297, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -107.57969665527344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2692509591579437, + "rewards_train/margins": 4.517064183950424, + "rewards_train/rejected": -4.2478132247924805, + "step": 1018 + }, + { + "epoch": 1.35, + "logps_train/chosen": -89.07691955566406, + "logps_train/ref_chosen": -92.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -104.98500061035156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3313709497451782, + "rewards_train/margins": 3.690808892250061, + "rewards_train/rejected": -3.359437942504883, + "step": 1019 + }, + { + "epoch": 1.35, + "learning_rate": 3.942872329131982e-07, + "loss": 0.0533, + "step": 1020 + }, + { + "epoch": 1.35, + "logps_train/chosen": -48.38117218017578, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -68.29963684082031, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.07946094125509262, + "rewards_train/margins": 2.702588878571987, + "rewards_train/rejected": -2.6231279373168945, + "step": 1020 + }, + { + "epoch": 1.36, + "logps_train/chosen": -39.482635498046875, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -68.35737609863281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7329864501953125, + "rewards_train/margins": 2.9062247276306152, + "rewards_train/rejected": -2.1732382774353027, + "step": 1021 + }, + { + "epoch": 1.36, + "learning_rate": 3.9383789120602373e-07, + "loss": 0.1518, + "step": 1022 + }, + { + "epoch": 1.36, + "logps_train/chosen": -67.55949401855469, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -110.52125549316406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4628005921840668, + "rewards_train/margins": 4.716488927602768, + "rewards_train/rejected": -4.253688335418701, + "step": 1022 + }, + { + "epoch": 1.36, + "logps_train/chosen": -59.67716979980469, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -122.22808837890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18892377614974976, + "rewards_train/margins": 5.23321670293808, + "rewards_train/rejected": -5.04429292678833, + "step": 1023 + }, + { + "epoch": 1.36, + "learning_rate": 3.9338785381425176e-07, + "loss": 0.0433, + "step": 1024 + }, + { + "epoch": 1.36, + "logps_train/chosen": -90.24518585205078, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -139.3506622314453, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.10360658168792725, + "rewards_train/margins": 3.993359684944153, + "rewards_train/rejected": -3.8897531032562256, + "step": 1024 + }, + { + "epoch": 1.36, + "logps_train/chosen": -35.725196838378906, + "logps_train/ref_chosen": -44.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -82.62857818603516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9130270481109619, + "rewards_train/margins": 4.032134771347046, + "rewards_train/rejected": -3.119107723236084, + "step": 1025 + }, + { + "epoch": 1.36, + "learning_rate": 3.929371229145275e-07, + "loss": 0.2019, + "step": 1026 + }, + { + "epoch": 1.36, + "logps_train/chosen": -74.83785247802734, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -119.43509674072266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4130899906158447, + "rewards_train/margins": 4.7034752368927, + "rewards_train/rejected": -4.2903852462768555, + "step": 1026 + }, + { + "epoch": 1.36, + "logps_train/chosen": -42.357696533203125, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -84.29924774169922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38591018319129944, + "rewards_train/margins": 3.8814598619937897, + "rewards_train/rejected": -3.4955496788024902, + "step": 1027 + }, + { + "epoch": 1.37, + "learning_rate": 3.924857006868508e-07, + "loss": 0.031, + "step": 1028 + }, + { + "epoch": 1.37, + "logps_train/chosen": -62.99864196777344, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -113.484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7876359224319458, + "rewards_train/margins": 4.931385159492493, + "rewards_train/rejected": -4.143749237060547, + "step": 1028 + }, + { + "epoch": 1.37, + "logps_train/chosen": -39.19382858276367, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -67.39474487304688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6212420463562012, + "rewards_train/margins": 3.7962634563446045, + "rewards_train/rejected": -3.1750214099884033, + "step": 1029 + }, + { + "epoch": 1.37, + "learning_rate": 3.9203358931456474e-07, + "loss": 0.0315, + "step": 1030 + }, + { + "epoch": 1.37, + "logps_train/chosen": -51.22350311279297, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -103.72412872314453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19639964401721954, + "rewards_train/margins": 3.426625445485115, + "rewards_train/rejected": -3.2302258014678955, + "step": 1030 + }, + { + "epoch": 1.37, + "logps_train/chosen": -55.18017578125, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -104.96635437011719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4007323384284973, + "rewards_train/margins": 4.317681133747101, + "rewards_train/rejected": -3.9169487953186035, + "step": 1031 + }, + { + "epoch": 1.37, + "learning_rate": 3.915807909843457e-07, + "loss": 0.0399, + "step": 1032 + }, + { + "epoch": 1.37, + "logps_train/chosen": -51.672515869140625, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -97.20822143554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.49446725845336914, + "rewards_train/margins": 4.267633438110352, + "rewards_train/rejected": -3.7731661796569824, + "step": 1032 + }, + { + "epoch": 1.37, + "logps_train/chosen": -76.35983276367188, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -106.61408996582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2027553915977478, + "rewards_train/margins": 3.2094350457191467, + "rewards_train/rejected": -3.4121904373168945, + "step": 1033 + }, + { + "epoch": 1.37, + "learning_rate": 3.911273078861926e-07, + "loss": 0.1049, + "step": 1034 + }, + { + "epoch": 1.37, + "logps_train/chosen": -59.87396240234375, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -102.09280395507812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.17705705761909485, + "rewards_train/margins": 3.7832129299640656, + "rewards_train/rejected": -3.6061558723449707, + "step": 1034 + }, + { + "epoch": 1.37, + "logps_train/chosen": -36.76939392089844, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -69.31784057617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0308732986450195, + "rewards_train/margins": 3.701328754425049, + "rewards_train/rejected": -2.6704554557800293, + "step": 1035 + }, + { + "epoch": 1.38, + "learning_rate": 3.906731422134164e-07, + "loss": 0.0946, + "step": 1036 + }, + { + "epoch": 1.38, + "logps_train/chosen": -54.1653938293457, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -84.251708984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6131480932235718, + "rewards_train/margins": 3.5605849027633667, + "rewards_train/rejected": -2.947436809539795, + "step": 1036 + }, + { + "epoch": 1.38, + "logps_train/chosen": -67.73870086669922, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -106.1065444946289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4706614911556244, + "rewards_train/margins": 5.057487815618515, + "rewards_train/rejected": -4.586826324462891, + "step": 1037 + }, + { + "epoch": 1.38, + "learning_rate": 3.9021829616262913e-07, + "loss": 0.0614, + "step": 1038 + }, + { + "epoch": 1.38, + "logps_train/chosen": -63.47399139404297, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -127.60704040527344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3811319172382355, + "rewards_train/margins": 4.873085767030716, + "rewards_train/rejected": -4.4919538497924805, + "step": 1038 + }, + { + "epoch": 1.38, + "logps_train/chosen": -59.90660858154297, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -136.5363006591797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.25465142726898193, + "rewards_train/margins": 5.894218802452087, + "rewards_train/rejected": -5.6395673751831055, + "step": 1039 + }, + { + "epoch": 1.38, + "learning_rate": 3.8976277193373377e-07, + "loss": 0.0246, + "step": 1040 + }, + { + "epoch": 1.38, + "logps_train/chosen": -52.551116943359375, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -115.63165283203125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7495759725570679, + "rewards_train/margins": 4.58617889881134, + "rewards_train/rejected": -3.8366029262542725, + "step": 1040 + }, + { + "epoch": 1.38, + "logps_train/chosen": -22.925827026367188, + "logps_train/ref_chosen": -27.5, + "logps_train/ref_rejected": -35.5, + "logps_train/rejected": -62.66178894042969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4554642140865326, + "rewards_train/margins": 3.1657839715480804, + "rewards_train/rejected": -2.710319757461548, + "step": 1041 + }, + { + "epoch": 1.38, + "learning_rate": 3.8930657172991316e-07, + "loss": 0.0512, + "step": 1042 + }, + { + "epoch": 1.38, + "logps_train/chosen": -53.44230651855469, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -93.5882568359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6480851173400879, + "rewards_train/margins": 4.873707294464111, + "rewards_train/rejected": -4.225622177124023, + "step": 1042 + }, + { + "epoch": 1.39, + "logps_train/chosen": -59.77875900268555, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -100.12228393554688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.25073757767677307, + "rewards_train/margins": 4.154762893915176, + "rewards_train/rejected": -3.9040253162384033, + "step": 1043 + }, + { + "epoch": 1.39, + "learning_rate": 3.888496977576198e-07, + "loss": 0.0637, + "step": 1044 + }, + { + "epoch": 1.39, + "logps_train/chosen": -55.947975158691406, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -112.9357681274414, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4817647635936737, + "rewards_train/margins": 5.142529457807541, + "rewards_train/rejected": -4.660764694213867, + "step": 1044 + }, + { + "epoch": 1.39, + "logps_train/chosen": -69.31583404541016, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -63.5, + "logps_train/rejected": -102.5622787475586, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5824788808822632, + "rewards_train/margins": 4.487144112586975, + "rewards_train/rejected": -3.904665231704712, + "step": 1045 + }, + { + "epoch": 1.39, + "learning_rate": 3.883921522265646e-07, + "loss": 0.0261, + "step": 1046 + }, + { + "epoch": 1.39, + "logps_train/chosen": -70.0384521484375, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -113.40721130371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18209242820739746, + "rewards_train/margins": 4.174376487731934, + "rewards_train/rejected": -3.992284059524536, + "step": 1046 + }, + { + "epoch": 1.39, + "logps_train/chosen": -58.87603759765625, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -87.86286926269531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7436462640762329, + "rewards_train/margins": 3.533059000968933, + "rewards_train/rejected": -2.7894127368927, + "step": 1047 + }, + { + "epoch": 1.39, + "learning_rate": 3.87933937349707e-07, + "loss": 0.0678, + "step": 1048 + }, + { + "epoch": 1.39, + "logps_train/chosen": -58.437591552734375, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -90.81820678710938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.29061585664749146, + "rewards_train/margins": 3.9052491784095764, + "rewards_train/rejected": -3.614633321762085, + "step": 1048 + }, + { + "epoch": 1.39, + "logps_train/chosen": -43.98640441894531, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -81.9910888671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.27401575446128845, + "rewards_train/margins": 3.1918743550777435, + "rewards_train/rejected": -2.917858600616455, + "step": 1049 + }, + { + "epoch": 1.39, + "learning_rate": 3.874750553432433e-07, + "loss": 0.0729, + "step": 1050 + }, + { + "epoch": 1.39, + "logps_train/chosen": -62.12126922607422, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -138.9076385498047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6417795419692993, + "rewards_train/margins": 5.654418587684631, + "rewards_train/rejected": -5.012639045715332, + "step": 1050 + }, + { + "epoch": 1.4, + "logps_train/chosen": -69.0708236694336, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -97.9734878540039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04023642838001251, + "rewards_train/margins": 2.8766434639692307, + "rewards_train/rejected": -2.916879892349243, + "step": 1051 + }, + { + "epoch": 1.4, + "learning_rate": 3.870155084265967e-07, + "loss": 0.069, + "step": 1052 + }, + { + "epoch": 1.4, + "logps_train/chosen": -61.396236419677734, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -106.94522094726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5814701318740845, + "rewards_train/margins": 4.19044554233551, + "rewards_train/rejected": -3.608975410461426, + "step": 1052 + }, + { + "epoch": 1.4, + "logps_train/chosen": -65.21392059326172, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -127.26935577392578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.390326589345932, + "rewards_train/margins": 4.41960534453392, + "rewards_train/rejected": -4.029278755187988, + "step": 1053 + }, + { + "epoch": 1.4, + "learning_rate": 3.865552988224062e-07, + "loss": 0.0455, + "step": 1054 + }, + { + "epoch": 1.4, + "logps_train/chosen": -39.886680603027344, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -82.67132568359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8449255228042603, + "rewards_train/margins": 3.7524880170822144, + "rewards_train/rejected": -2.907562494277954, + "step": 1054 + }, + { + "epoch": 1.4, + "logps_train/chosen": -72.55946350097656, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -135.72982788085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6815534830093384, + "rewards_train/margins": 5.444380402565002, + "rewards_train/rejected": -4.762826919555664, + "step": 1055 + }, + { + "epoch": 1.4, + "learning_rate": 3.86094428756516e-07, + "loss": 0.079, + "step": 1056 + }, + { + "epoch": 1.4, + "logps_train/chosen": -69.78994750976562, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -99.00350952148438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.328036904335022, + "rewards_train/margins": 3.648309111595154, + "rewards_train/rejected": -3.320272207260132, + "step": 1056 + }, + { + "epoch": 1.4, + "logps_train/chosen": -73.80904388427734, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -91.90657043457031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5300331115722656, + "rewards_train/margins": 3.8800649642944336, + "rewards_train/rejected": -3.350031852722168, + "step": 1057 + }, + { + "epoch": 1.41, + "learning_rate": 3.856329004579647e-07, + "loss": 0.0279, + "step": 1058 + }, + { + "epoch": 1.41, + "logps_train/chosen": -67.9787368774414, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -91.948486328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9634546637535095, + "rewards_train/margins": 3.448147237300873, + "rewards_train/rejected": -2.4846925735473633, + "step": 1058 + }, + { + "epoch": 1.41, + "logps_train/chosen": -56.517417907714844, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -78.15455627441406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2271642684936523, + "rewards_train/margins": 3.354339122772217, + "rewards_train/rejected": -2.1271748542785645, + "step": 1059 + }, + { + "epoch": 1.41, + "learning_rate": 3.8517071615897443e-07, + "loss": 0.1296, + "step": 1060 + }, + { + "epoch": 1.41, + "logps_train/chosen": -84.07479858398438, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -139.5961151123047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1758396327495575, + "rewards_train/margins": 4.849397450685501, + "rewards_train/rejected": -5.025237083435059, + "step": 1060 + }, + { + "epoch": 1.41, + "logps_train/chosen": -57.02593994140625, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -105.87884521484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5033937096595764, + "rewards_train/margins": 4.244012415409088, + "rewards_train/rejected": -3.7406187057495117, + "step": 1061 + }, + { + "epoch": 1.41, + "learning_rate": 3.847078780949401e-07, + "loss": 0.0518, + "step": 1062 + }, + { + "epoch": 1.41, + "logps_train/chosen": -71.30848693847656, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -104.53205871582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4847757816314697, + "rewards_train/margins": 4.8950135707855225, + "rewards_train/rejected": -4.410237789154053, + "step": 1062 + }, + { + "epoch": 1.41, + "logps_train/chosen": -47.513587951660156, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -69.87454223632812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.03418812155723572, + "rewards_train/margins": 2.6255482137203217, + "rewards_train/rejected": -2.591360092163086, + "step": 1063 + }, + { + "epoch": 1.41, + "learning_rate": 3.8424438850441885e-07, + "loss": 0.0852, + "step": 1064 + }, + { + "epoch": 1.41, + "logps_train/chosen": -73.03597259521484, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -123.27498626708984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.572184145450592, + "rewards_train/margins": 4.182495176792145, + "rewards_train/rejected": -3.6103110313415527, + "step": 1064 + }, + { + "epoch": 1.41, + "logps_train/chosen": -58.52642059326172, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -111.962646484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09561094641685486, + "rewards_train/margins": 4.01393523812294, + "rewards_train/rejected": -4.109546184539795, + "step": 1065 + }, + { + "epoch": 1.42, + "learning_rate": 3.837802496291186e-07, + "loss": 0.0634, + "step": 1066 + }, + { + "epoch": 1.42, + "logps_train/chosen": -87.14884948730469, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -140.74317932128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19448992609977722, + "rewards_train/margins": 4.715682476758957, + "rewards_train/rejected": -4.52119255065918, + "step": 1066 + }, + { + "epoch": 1.42, + "logps_train/chosen": -55.47509765625, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -103.63142395019531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1962401121854782, + "rewards_train/margins": 4.593757793307304, + "rewards_train/rejected": -4.397517681121826, + "step": 1067 + }, + { + "epoch": 1.42, + "learning_rate": 3.8331546371388797e-07, + "loss": 0.026, + "step": 1068 + }, + { + "epoch": 1.42, + "logps_train/chosen": -46.59416198730469, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -73.27717590332031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2671464681625366, + "rewards_train/margins": 3.237052083015442, + "rewards_train/rejected": -2.9699056148529053, + "step": 1068 + }, + { + "epoch": 1.42, + "logps_train/chosen": -81.00228881835938, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -108.65495300292969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7060210704803467, + "rewards_train/margins": 4.946516752243042, + "rewards_train/rejected": -4.240495681762695, + "step": 1069 + }, + { + "epoch": 1.42, + "learning_rate": 3.828500330067047e-07, + "loss": 0.053, + "step": 1070 + }, + { + "epoch": 1.42, + "logps_train/chosen": -60.29121017456055, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -86.41423034667969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5646289587020874, + "rewards_train/margins": 3.957614779472351, + "rewards_train/rejected": -3.3929858207702637, + "step": 1070 + }, + { + "epoch": 1.42, + "logps_train/chosen": -36.730682373046875, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -70.3037109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4820095896720886, + "rewards_train/margins": 2.9131621718406677, + "rewards_train/rejected": -2.431152582168579, + "step": 1071 + }, + { + "epoch": 1.42, + "learning_rate": 3.823839597586654e-07, + "loss": 0.0692, + "step": 1072 + }, + { + "epoch": 1.42, + "logps_train/chosen": -33.606929779052734, + "logps_train/ref_chosen": -39.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -90.87196350097656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5670413374900818, + "rewards_train/margins": 4.056581437587738, + "rewards_train/rejected": -3.4895401000976562, + "step": 1072 + }, + { + "epoch": 1.42, + "logps_train/chosen": -36.388885498046875, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -87.69721221923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8017363548278809, + "rewards_train/margins": 3.945675849914551, + "rewards_train/rejected": -3.14393949508667, + "step": 1073 + }, + { + "epoch": 1.43, + "learning_rate": 3.8191724622397424e-07, + "loss": 0.0336, + "step": 1074 + }, + { + "epoch": 1.43, + "logps_train/chosen": -71.81846618652344, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -126.84252166748047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4345601499080658, + "rewards_train/margins": 5.360218375921249, + "rewards_train/rejected": -4.925658226013184, + "step": 1074 + }, + { + "epoch": 1.43, + "logps_train/chosen": -33.8405876159668, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -61.79624938964844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4596914052963257, + "rewards_train/margins": 2.975253462791443, + "rewards_train/rejected": -2.515562057495117, + "step": 1075 + }, + { + "epoch": 1.43, + "learning_rate": 3.8144989465993237e-07, + "loss": 0.0405, + "step": 1076 + }, + { + "epoch": 1.43, + "logps_train/chosen": -69.2583236694336, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -123.81806945800781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09135481715202332, + "rewards_train/margins": 4.496599167585373, + "rewards_train/rejected": -4.40524435043335, + "step": 1076 + }, + { + "epoch": 1.43, + "logps_train/chosen": -68.01119995117188, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -100.36489868164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.06294223666191101, + "rewards_train/margins": 4.075995177030563, + "rewards_train/rejected": -4.013052940368652, + "step": 1077 + }, + { + "epoch": 1.43, + "learning_rate": 3.809819073269265e-07, + "loss": 0.0309, + "step": 1078 + }, + { + "epoch": 1.43, + "logps_train/chosen": -109.69962310791016, + "logps_train/ref_chosen": -115.0, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -137.47056579589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5191001892089844, + "rewards_train/margins": 4.344281196594238, + "rewards_train/rejected": -3.825181007385254, + "step": 1078 + }, + { + "epoch": 1.43, + "logps_train/chosen": -37.2322998046875, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -68.06069946289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2470826506614685, + "rewards_train/margins": 3.1429962515830994, + "rewards_train/rejected": -2.895913600921631, + "step": 1079 + }, + { + "epoch": 1.43, + "learning_rate": 3.8051328648841854e-07, + "loss": 0.0729, + "step": 1080 + }, + { + "epoch": 1.43, + "logps_train/chosen": -45.251182556152344, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -88.29017639160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6326940655708313, + "rewards_train/margins": 3.7273367047309875, + "rewards_train/rejected": -3.0946426391601562, + "step": 1080 + }, + { + "epoch": 1.44, + "logps_train/chosen": -73.56950378417969, + "logps_train/ref_chosen": -75.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -111.57798767089844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1430499106645584, + "rewards_train/margins": 4.233661368489265, + "rewards_train/rejected": -4.090611457824707, + "step": 1081 + }, + { + "epoch": 1.44, + "learning_rate": 3.8004403441093436e-07, + "loss": 0.0348, + "step": 1082 + }, + { + "epoch": 1.44, + "logps_train/chosen": -77.24303436279297, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -126.18246459960938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3858528733253479, + "rewards_train/margins": 4.23847371339798, + "rewards_train/rejected": -3.852620840072632, + "step": 1082 + }, + { + "epoch": 1.44, + "logps_train/chosen": -47.20745849609375, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -102.12366485595703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5440980195999146, + "rewards_train/margins": 4.372089743614197, + "rewards_train/rejected": -3.8279917240142822, + "step": 1083 + }, + { + "epoch": 1.44, + "learning_rate": 3.7957415336405284e-07, + "loss": 0.0587, + "step": 1084 + }, + { + "epoch": 1.44, + "logps_train/chosen": -28.81665802001953, + "logps_train/ref_chosen": -28.375, + "logps_train/ref_rejected": -32.0, + "logps_train/rejected": -56.57765197753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04182225465774536, + "rewards_train/margins": 2.4061774611473083, + "rewards_train/rejected": -2.4479997158050537, + "step": 1084 + }, + { + "epoch": 1.44, + "logps_train/chosen": -66.69900512695312, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -93.84065246582031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.33009931445121765, + "rewards_train/margins": 3.148539274930954, + "rewards_train/rejected": -2.8184399604797363, + "step": 1085 + }, + { + "epoch": 1.44, + "learning_rate": 3.79103645620395e-07, + "loss": 0.1178, + "step": 1086 + }, + { + "epoch": 1.44, + "logps_train/chosen": -66.89918518066406, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -119.32633972167969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2725811302661896, + "rewards_train/margins": 4.717714816331863, + "rewards_train/rejected": -4.445133686065674, + "step": 1086 + }, + { + "epoch": 1.44, + "logps_train/chosen": -73.22869110107422, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -119.14118194580078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.40568163990974426, + "rewards_train/margins": 4.545936554670334, + "rewards_train/rejected": -4.951618194580078, + "step": 1087 + }, + { + "epoch": 1.44, + "learning_rate": 3.786325134556128e-07, + "loss": 0.0214, + "step": 1088 + }, + { + "epoch": 1.44, + "logps_train/chosen": -41.32347106933594, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -62.88675308227539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5395280122756958, + "rewards_train/margins": 3.261797070503235, + "rewards_train/rejected": -2.722269058227539, + "step": 1088 + }, + { + "epoch": 1.45, + "logps_train/chosen": -56.772151947021484, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -109.71208953857422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9141911864280701, + "rewards_train/margins": 4.727587401866913, + "rewards_train/rejected": -3.8133962154388428, + "step": 1089 + }, + { + "epoch": 1.45, + "learning_rate": 3.781607591483784e-07, + "loss": 0.036, + "step": 1090 + }, + { + "epoch": 1.45, + "logps_train/chosen": -52.89753723144531, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -39.0, + "logps_train/rejected": -72.97647094726562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.33212143182754517, + "rewards_train/margins": 3.725471556186676, + "rewards_train/rejected": -3.393350124359131, + "step": 1090 + }, + { + "epoch": 1.45, + "logps_train/chosen": -53.46153259277344, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -85.15753173828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4772842526435852, + "rewards_train/margins": 4.3735063672065735, + "rewards_train/rejected": -3.8962221145629883, + "step": 1091 + }, + { + "epoch": 1.45, + "learning_rate": 3.7768838498037293e-07, + "loss": 0.084, + "step": 1092 + }, + { + "epoch": 1.45, + "logps_train/chosen": -68.80516815185547, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -101.49811553955078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5296397805213928, + "rewards_train/margins": 4.754451811313629, + "rewards_train/rejected": -4.224812030792236, + "step": 1092 + }, + { + "epoch": 1.45, + "logps_train/chosen": -108.41425323486328, + "logps_train/ref_chosen": -109.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -132.552001953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07419991493225098, + "rewards_train/margins": 4.013775110244751, + "rewards_train/rejected": -3.9395751953125, + "step": 1093 + }, + { + "epoch": 1.45, + "learning_rate": 3.772153932362756e-07, + "loss": 0.0411, + "step": 1094 + }, + { + "epoch": 1.45, + "logps_train/chosen": -60.97172546386719, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -105.09901428222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7200145125389099, + "rewards_train/margins": 4.739291131496429, + "rewards_train/rejected": -4.0192766189575195, + "step": 1094 + }, + { + "epoch": 1.45, + "logps_train/chosen": -73.78340911865234, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -105.59697723388672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8161907196044922, + "rewards_train/margins": 4.154013633728027, + "rewards_train/rejected": -3.337822914123535, + "step": 1095 + }, + { + "epoch": 1.46, + "learning_rate": 3.767417862037525e-07, + "loss": 0.0321, + "step": 1096 + }, + { + "epoch": 1.46, + "logps_train/chosen": -89.45512390136719, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -131.07769775390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4271436929702759, + "rewards_train/margins": 4.430226445198059, + "rewards_train/rejected": -4.003082752227783, + "step": 1096 + }, + { + "epoch": 1.46, + "logps_train/chosen": -83.60784912109375, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -113.8121566772461, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 1.13608980178833, + "rewards_train/margins": 5.437617778778076, + "rewards_train/rejected": -4.301527976989746, + "step": 1097 + }, + { + "epoch": 1.46, + "learning_rate": 3.762675661734457e-07, + "loss": 0.0868, + "step": 1098 + }, + { + "epoch": 1.46, + "logps_train/chosen": -43.76305389404297, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -76.89112854003906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4572885036468506, + "rewards_train/margins": 3.4643702507019043, + "rewards_train/rejected": -3.0070817470550537, + "step": 1098 + }, + { + "epoch": 1.46, + "logps_train/chosen": -58.350563049316406, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -91.43194580078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04258058965206146, + "rewards_train/margins": 3.3553061336278915, + "rewards_train/rejected": -3.31272554397583, + "step": 1099 + }, + { + "epoch": 1.46, + "learning_rate": 3.7579273543896183e-07, + "loss": 0.0738, + "step": 1100 + }, + { + "epoch": 1.46, + "logps_train/chosen": -47.024139404296875, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -108.52749633789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6407501101493835, + "rewards_train/margins": 5.199066340923309, + "rewards_train/rejected": -4.558316230773926, + "step": 1100 + }, + { + "epoch": 1.46, + "logps_train/chosen": -57.39291000366211, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -111.84832000732422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3864902853965759, + "rewards_train/margins": 4.5314783453941345, + "rewards_train/rejected": -4.144988059997559, + "step": 1101 + }, + { + "epoch": 1.46, + "learning_rate": 3.753172962968617e-07, + "loss": 0.0155, + "step": 1102 + }, + { + "epoch": 1.46, + "logps_train/chosen": -72.623779296875, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -121.871826171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.720434308052063, + "rewards_train/margins": 4.813085675239563, + "rewards_train/rejected": -4.0926513671875, + "step": 1102 + }, + { + "epoch": 1.46, + "logps_train/chosen": -40.725425720214844, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -87.36402893066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.34191077947616577, + "rewards_train/margins": 3.592376172542572, + "rewards_train/rejected": -3.2504653930664062, + "step": 1103 + }, + { + "epoch": 1.47, + "learning_rate": 3.7484125104664835e-07, + "loss": 0.0459, + "step": 1104 + }, + { + "epoch": 1.47, + "logps_train/chosen": -58.49449157714844, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -119.99946594238281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 2.1857075691223145, + "rewards_train/margins": 6.957529067993164, + "rewards_train/rejected": -4.77182149887085, + "step": 1104 + }, + { + "epoch": 1.47, + "logps_train/chosen": -58.1461296081543, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -60.75, + "logps_train/rejected": -96.14643096923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6252306699752808, + "rewards_train/margins": 4.167608141899109, + "rewards_train/rejected": -3.542377471923828, + "step": 1105 + }, + { + "epoch": 1.47, + "learning_rate": 3.743646019907566e-07, + "loss": 0.0428, + "step": 1106 + }, + { + "epoch": 1.47, + "logps_train/chosen": -59.70039367675781, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -83.75991821289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.398710697889328, + "rewards_train/margins": 3.0403279960155487, + "rewards_train/rejected": -2.6416172981262207, + "step": 1106 + }, + { + "epoch": 1.47, + "logps_train/chosen": -46.317649841308594, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -106.39736938476562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4276101887226105, + "rewards_train/margins": 5.47359636425972, + "rewards_train/rejected": -5.045986175537109, + "step": 1107 + }, + { + "epoch": 1.47, + "learning_rate": 3.738873514345413e-07, + "loss": 0.0597, + "step": 1108 + }, + { + "epoch": 1.47, + "logps_train/chosen": -54.878684997558594, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -105.11205291748047, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.08415296673774719, + "rewards_train/margins": 4.0453589260578156, + "rewards_train/rejected": -3.9612059593200684, + "step": 1108 + }, + { + "epoch": 1.47, + "logps_train/chosen": -55.508506774902344, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -116.87051391601562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2678989768028259, + "rewards_train/margins": 4.824481904506683, + "rewards_train/rejected": -4.556582927703857, + "step": 1109 + }, + { + "epoch": 1.47, + "learning_rate": 3.7340950168626683e-07, + "loss": 0.0841, + "step": 1110 + }, + { + "epoch": 1.47, + "logps_train/chosen": -67.36357879638672, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -130.27218627929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4730176329612732, + "rewards_train/margins": 5.434611260890961, + "rewards_train/rejected": -4.9615936279296875, + "step": 1110 + }, + { + "epoch": 1.48, + "logps_train/chosen": -64.18852233886719, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -119.49390411376953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5034622550010681, + "rewards_train/margins": 4.341134488582611, + "rewards_train/rejected": -3.837672233581543, + "step": 1111 + }, + { + "epoch": 1.48, + "learning_rate": 3.7293105505709543e-07, + "loss": 0.03, + "step": 1112 + }, + { + "epoch": 1.48, + "logps_train/chosen": -43.375762939453125, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -77.70501708984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.49640795588493347, + "rewards_train/margins": 3.145034521818161, + "rewards_train/rejected": -2.6486265659332275, + "step": 1112 + }, + { + "epoch": 1.48, + "logps_train/chosen": -74.84762573242188, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -104.9190673828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9949249029159546, + "rewards_train/margins": 4.611831545829773, + "rewards_train/rejected": -3.6169066429138184, + "step": 1113 + }, + { + "epoch": 1.48, + "learning_rate": 3.724520138610762e-07, + "loss": 0.0731, + "step": 1114 + }, + { + "epoch": 1.48, + "logps_train/chosen": -108.8508529663086, + "logps_train/ref_chosen": -116.5, + "logps_train/ref_rejected": -101.0, + "logps_train/rejected": -134.4057159423828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7641333937644958, + "rewards_train/margins": 4.127361238002777, + "rewards_train/rejected": -3.3632278442382812, + "step": 1114 + }, + { + "epoch": 1.48, + "logps_train/chosen": -24.966794967651367, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -29.75, + "logps_train/rejected": -56.13177490234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0323245525360107, + "rewards_train/margins": 3.6642518043518066, + "rewards_train/rejected": -2.631927251815796, + "step": 1115 + }, + { + "epoch": 1.48, + "learning_rate": 3.7197238041513415e-07, + "loss": 0.0875, + "step": 1116 + }, + { + "epoch": 1.48, + "logps_train/chosen": -46.70785903930664, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -82.52395629882812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.24031728506088257, + "rewards_train/margins": 2.754266321659088, + "rewards_train/rejected": -2.9945836067199707, + "step": 1116 + }, + { + "epoch": 1.48, + "logps_train/chosen": -70.84528350830078, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -87.5, + "logps_train/rejected": -124.11961364746094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1279715597629547, + "rewards_train/margins": 3.7868084013462067, + "rewards_train/rejected": -3.658836841583252, + "step": 1117 + }, + { + "epoch": 1.48, + "learning_rate": 3.714921570390583e-07, + "loss": 0.0845, + "step": 1118 + }, + { + "epoch": 1.48, + "logps_train/chosen": -49.310096740722656, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -121.67115783691406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.601021945476532, + "rewards_train/margins": 5.2337629199028015, + "rewards_train/rejected": -4.6327409744262695, + "step": 1118 + }, + { + "epoch": 1.49, + "logps_train/chosen": -57.2186279296875, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -93.30982971191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.685949444770813, + "rewards_train/margins": 4.887245297431946, + "rewards_train/rejected": -4.201295852661133, + "step": 1119 + }, + { + "epoch": 1.49, + "learning_rate": 3.710113460554915e-07, + "loss": 0.0156, + "step": 1120 + }, + { + "epoch": 1.49, + "logps_train/chosen": -41.76527404785156, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -114.47797393798828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1312849223613739, + "rewards_train/margins": 4.600957125425339, + "rewards_train/rejected": -4.469672203063965, + "step": 1120 + }, + { + "epoch": 1.49, + "logps_train/chosen": -48.3980712890625, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -89.03121948242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7461304068565369, + "rewards_train/margins": 3.494564712047577, + "rewards_train/rejected": -2.74843430519104, + "step": 1121 + }, + { + "epoch": 1.49, + "learning_rate": 3.705299497899181e-07, + "loss": 0.0305, + "step": 1122 + }, + { + "epoch": 1.49, + "logps_train/chosen": -30.208553314208984, + "logps_train/ref_chosen": -30.375, + "logps_train/ref_rejected": -31.0, + "logps_train/rejected": -56.80779266357422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.014691617339849472, + "rewards_train/margins": 2.5907833836972713, + "rewards_train/rejected": -2.576091766357422, + "step": 1122 + }, + { + "epoch": 1.49, + "logps_train/chosen": -86.70021057128906, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -148.09185791015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5143535137176514, + "rewards_train/margins": 5.274319410324097, + "rewards_train/rejected": -4.759965896606445, + "step": 1123 + }, + { + "epoch": 1.49, + "learning_rate": 3.700479705706535e-07, + "loss": 0.0876, + "step": 1124 + }, + { + "epoch": 1.49, + "logps_train/chosen": -61.686363220214844, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -92.98202514648438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2673010230064392, + "rewards_train/margins": 3.260034739971161, + "rewards_train/rejected": -2.9927337169647217, + "step": 1124 + }, + { + "epoch": 1.49, + "logps_train/chosen": -48.21138000488281, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -95.42237854003906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0976119339466095, + "rewards_train/margins": 3.884381800889969, + "rewards_train/rejected": -3.7867698669433594, + "step": 1125 + }, + { + "epoch": 1.5, + "learning_rate": 3.6956541072883254e-07, + "loss": 0.0842, + "step": 1126 + }, + { + "epoch": 1.5, + "logps_train/chosen": -71.74105834960938, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -109.72960662841797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5962070822715759, + "rewards_train/margins": 4.600418031215668, + "rewards_train/rejected": -4.004210948944092, + "step": 1126 + }, + { + "epoch": 1.5, + "logps_train/chosen": -71.23037719726562, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -103.26434326171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6082123517990112, + "rewards_train/margins": 4.369021773338318, + "rewards_train/rejected": -3.7608094215393066, + "step": 1127 + }, + { + "epoch": 1.5, + "learning_rate": 3.6908227259839823e-07, + "loss": 0.037, + "step": 1128 + }, + { + "epoch": 1.5, + "logps_train/chosen": -53.96746826171875, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -105.89904022216797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6032530069351196, + "rewards_train/margins": 4.166594386100769, + "rewards_train/rejected": -3.5633413791656494, + "step": 1128 + }, + { + "epoch": 1.5, + "logps_train/chosen": -50.995262145996094, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -42.5, + "logps_train/rejected": -86.5063705444336, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.36124521493911743, + "rewards_train/margins": 4.03626674413681, + "rewards_train/rejected": -4.397511959075928, + "step": 1129 + }, + { + "epoch": 1.5, + "learning_rate": 3.685985585160907e-07, + "loss": 0.0614, + "step": 1130 + }, + { + "epoch": 1.5, + "logps_train/chosen": -68.00580596923828, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -132.22116088867188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6009820699691772, + "rewards_train/margins": 5.766848921775818, + "rewards_train/rejected": -5.165866851806641, + "step": 1130 + }, + { + "epoch": 1.5, + "logps_train/chosen": -55.612831115722656, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -96.86337280273438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8918418884277344, + "rewards_train/margins": 3.401616096496582, + "rewards_train/rejected": -2.5097742080688477, + "step": 1131 + }, + { + "epoch": 1.5, + "learning_rate": 3.681142708214355e-07, + "loss": 0.0621, + "step": 1132 + }, + { + "epoch": 1.5, + "logps_train/chosen": -46.75406265258789, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -49.75, + "logps_train/rejected": -90.41736602783203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8402189016342163, + "rewards_train/margins": 4.903830885887146, + "rewards_train/rejected": -4.06361198425293, + "step": 1132 + }, + { + "epoch": 1.5, + "logps_train/chosen": -51.25959014892578, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -111.66104888916016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3088918924331665, + "rewards_train/margins": 4.733101963996887, + "rewards_train/rejected": -4.424210071563721, + "step": 1133 + }, + { + "epoch": 1.51, + "learning_rate": 3.6762941185673274e-07, + "loss": 0.0321, + "step": 1134 + }, + { + "epoch": 1.51, + "logps_train/chosen": -102.30294799804688, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -160.617919921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.05216968059539795, + "rewards_train/margins": 6.041653752326965, + "rewards_train/rejected": -6.093823432922363, + "step": 1134 + }, + { + "epoch": 1.51, + "logps_train/chosen": -60.56263732910156, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -123.65089416503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.014048665761947632, + "rewards_train/margins": 3.513512462377548, + "rewards_train/rejected": -3.4994637966156006, + "step": 1135 + }, + { + "epoch": 1.51, + "learning_rate": 3.6714398396704527e-07, + "loss": 0.082, + "step": 1136 + }, + { + "epoch": 1.51, + "logps_train/chosen": -49.99536895751953, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -96.3818359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.36296308040618896, + "rewards_train/margins": 4.123021483421326, + "rewards_train/rejected": -3.7600584030151367, + "step": 1136 + }, + { + "epoch": 1.51, + "logps_train/chosen": -67.23214721679688, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -108.01008605957031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4090116620063782, + "rewards_train/margins": 3.5334580540657043, + "rewards_train/rejected": -3.124446392059326, + "step": 1137 + }, + { + "epoch": 1.51, + "learning_rate": 3.666579895001877e-07, + "loss": 0.0504, + "step": 1138 + }, + { + "epoch": 1.51, + "logps_train/chosen": -52.253990173339844, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -106.87899780273438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7339762449264526, + "rewards_train/margins": 4.970313906669617, + "rewards_train/rejected": -4.236337661743164, + "step": 1138 + }, + { + "epoch": 1.51, + "logps_train/chosen": -59.06680679321289, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -129.88473510742188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3948820233345032, + "rewards_train/margins": 4.944293797016144, + "rewards_train/rejected": -4.549411773681641, + "step": 1139 + }, + { + "epoch": 1.51, + "learning_rate": 3.6617143080671513e-07, + "loss": 0.061, + "step": 1140 + }, + { + "epoch": 1.51, + "logps_train/chosen": -41.75344467163086, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -92.43074035644531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6394991278648376, + "rewards_train/margins": 4.052494704723358, + "rewards_train/rejected": -3.4129955768585205, + "step": 1140 + }, + { + "epoch": 1.52, + "logps_train/chosen": -53.30633544921875, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -40.75, + "logps_train/rejected": -72.99752807617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5053037405014038, + "rewards_train/margins": 3.7284945249557495, + "rewards_train/rejected": -3.2231907844543457, + "step": 1141 + }, + { + "epoch": 1.52, + "learning_rate": 3.6568431023991133e-07, + "loss": 0.0753, + "step": 1142 + }, + { + "epoch": 1.52, + "logps_train/chosen": -98.69474029541016, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -112.0, + "logps_train/rejected": -166.5941925048828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2647863030433655, + "rewards_train/margins": 5.191898763179779, + "rewards_train/rejected": -5.4566850662231445, + "step": 1142 + }, + { + "epoch": 1.52, + "logps_train/chosen": -67.47549438476562, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -104.95869445800781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8594820499420166, + "rewards_train/margins": 4.888163805007935, + "rewards_train/rejected": -4.028681755065918, + "step": 1143 + }, + { + "epoch": 1.52, + "learning_rate": 3.651966301557777e-07, + "loss": 0.0255, + "step": 1144 + }, + { + "epoch": 1.52, + "logps_train/chosen": -78.78056335449219, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -133.0907440185547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4133502244949341, + "rewards_train/margins": 4.402111649513245, + "rewards_train/rejected": -3.9887614250183105, + "step": 1144 + }, + { + "epoch": 1.52, + "logps_train/chosen": -50.216392517089844, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -99.7728271484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4322670102119446, + "rewards_train/margins": 4.115800201892853, + "rewards_train/rejected": -3.683533191680908, + "step": 1145 + }, + { + "epoch": 1.52, + "learning_rate": 3.647083929130218e-07, + "loss": 0.0633, + "step": 1146 + }, + { + "epoch": 1.52, + "logps_train/chosen": -29.023555755615234, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -88.9283218383789, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.48670676350593567, + "rewards_train/margins": 4.163914233446121, + "rewards_train/rejected": -3.6772074699401855, + "step": 1146 + }, + { + "epoch": 1.52, + "logps_train/chosen": -65.61752319335938, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -105.22210693359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4679350256919861, + "rewards_train/margins": 4.622957646846771, + "rewards_train/rejected": -4.155022621154785, + "step": 1147 + }, + { + "epoch": 1.52, + "learning_rate": 3.6421960087304606e-07, + "loss": 0.0265, + "step": 1148 + }, + { + "epoch": 1.52, + "logps_train/chosen": -60.67645263671875, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -111.85208892822266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.34641727805137634, + "rewards_train/margins": 5.425376266241074, + "rewards_train/rejected": -5.078958988189697, + "step": 1148 + }, + { + "epoch": 1.53, + "logps_train/chosen": -70.41081237792969, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -109.43710327148438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.45201873779296875, + "rewards_train/margins": 3.971378803253174, + "rewards_train/rejected": -4.423397541046143, + "step": 1149 + }, + { + "epoch": 1.53, + "learning_rate": 3.63730256399936e-07, + "loss": 0.0185, + "step": 1150 + }, + { + "epoch": 1.53, + "logps_train/chosen": -63.833038330078125, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -95.2242660522461, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22763383388519287, + "rewards_train/margins": 4.226622939109802, + "rewards_train/rejected": -3.9989891052246094, + "step": 1150 + }, + { + "epoch": 1.53, + "logps_train/chosen": -79.74911499023438, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -139.5867462158203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18446320295333862, + "rewards_train/margins": 4.833762347698212, + "rewards_train/rejected": -4.649299144744873, + "step": 1151 + }, + { + "epoch": 1.53, + "learning_rate": 3.6324036186044916e-07, + "loss": 0.0251, + "step": 1152 + }, + { + "epoch": 1.53, + "logps_train/chosen": -54.11359405517578, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -117.70050811767578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6667654514312744, + "rewards_train/margins": 5.0352537631988525, + "rewards_train/rejected": -4.368488311767578, + "step": 1152 + }, + { + "epoch": 1.53, + "logps_train/chosen": -40.71803283691406, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -84.96842956542969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7000718712806702, + "rewards_train/margins": 4.435976564884186, + "rewards_train/rejected": -3.7359046936035156, + "step": 1153 + }, + { + "epoch": 1.53, + "learning_rate": 3.627499196240036e-07, + "loss": 0.023, + "step": 1154 + }, + { + "epoch": 1.53, + "logps_train/chosen": -55.13288116455078, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -62.07521057128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.689836859703064, + "rewards_train/margins": 2.4723581075668335, + "rewards_train/rejected": -1.7825212478637695, + "step": 1154 + }, + { + "epoch": 1.53, + "logps_train/chosen": -66.95547485351562, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -140.13360595703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6317959427833557, + "rewards_train/margins": 5.068594038486481, + "rewards_train/rejected": -4.436798095703125, + "step": 1155 + }, + { + "epoch": 1.54, + "learning_rate": 3.622589320626662e-07, + "loss": 0.0748, + "step": 1156 + }, + { + "epoch": 1.54, + "logps_train/chosen": -85.71044921875, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -110.75030517578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.37739309668540955, + "rewards_train/margins": 3.2602365911006927, + "rewards_train/rejected": -2.882843494415283, + "step": 1156 + }, + { + "epoch": 1.54, + "logps_train/chosen": -67.10986328125, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -96.31272888183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4218258559703827, + "rewards_train/margins": 3.9140367209911346, + "rewards_train/rejected": -3.492210865020752, + "step": 1157 + }, + { + "epoch": 1.54, + "learning_rate": 3.6176740155114156e-07, + "loss": 0.0752, + "step": 1158 + }, + { + "epoch": 1.54, + "logps_train/chosen": -70.64180755615234, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -92.64602661132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7436319589614868, + "rewards_train/margins": 3.8695627450942993, + "rewards_train/rejected": -3.1259307861328125, + "step": 1158 + }, + { + "epoch": 1.54, + "logps_train/chosen": -91.02542877197266, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -143.17457580566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45839452743530273, + "rewards_train/margins": 5.1750712394714355, + "rewards_train/rejected": -4.716676712036133, + "step": 1159 + }, + { + "epoch": 1.54, + "learning_rate": 3.6127533046676e-07, + "loss": 0.0629, + "step": 1160 + }, + { + "epoch": 1.54, + "logps_train/chosen": -63.011863708496094, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -87.22319793701172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9050636291503906, + "rewards_train/margins": 4.499258756637573, + "rewards_train/rejected": -3.5941951274871826, + "step": 1160 + }, + { + "epoch": 1.54, + "logps_train/chosen": -43.46173095703125, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -91.20061492919922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1538269966840744, + "rewards_train/margins": 3.6908805817365646, + "rewards_train/rejected": -3.5370535850524902, + "step": 1161 + }, + { + "epoch": 1.54, + "learning_rate": 3.607827211894667e-07, + "loss": 0.032, + "step": 1162 + }, + { + "epoch": 1.54, + "logps_train/chosen": -53.33774185180664, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -88.04535675048828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.142788365483284, + "rewards_train/margins": 4.003574326634407, + "rewards_train/rejected": -3.860785961151123, + "step": 1162 + }, + { + "epoch": 1.54, + "logps_train/chosen": -76.51295471191406, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -130.69015502929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07292267680168152, + "rewards_train/margins": 4.807563751935959, + "rewards_train/rejected": -4.734641075134277, + "step": 1163 + }, + { + "epoch": 1.55, + "learning_rate": 3.6028957610180966e-07, + "loss": 0.0356, + "step": 1164 + }, + { + "epoch": 1.55, + "logps_train/chosen": -60.12653350830078, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -125.51658630371094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1357839107513428, + "rewards_train/margins": 5.7608802318573, + "rewards_train/rejected": -4.625096321105957, + "step": 1164 + }, + { + "epoch": 1.55, + "logps_train/chosen": -38.338462829589844, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -37.0, + "logps_train/rejected": -66.69853973388672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1171303987503052, + "rewards_train/margins": 4.082296967506409, + "rewards_train/rejected": -2.9651665687561035, + "step": 1165 + }, + { + "epoch": 1.55, + "learning_rate": 3.597958975889285e-07, + "loss": 0.0402, + "step": 1166 + }, + { + "epoch": 1.55, + "logps_train/chosen": -88.50666809082031, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -130.41317749023438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.46629127860069275, + "rewards_train/margins": 4.565651088953018, + "rewards_train/rejected": -5.031942367553711, + "step": 1166 + }, + { + "epoch": 1.55, + "logps_train/chosen": -47.40610885620117, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -87.6757583618164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5031390190124512, + "rewards_train/margins": 4.253527402877808, + "rewards_train/rejected": -3.7503883838653564, + "step": 1167 + }, + { + "epoch": 1.55, + "learning_rate": 3.593016880385425e-07, + "loss": 0.0817, + "step": 1168 + }, + { + "epoch": 1.55, + "logps_train/chosen": -57.20851516723633, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -113.05278015136719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6791485548019409, + "rewards_train/margins": 5.009426951408386, + "rewards_train/rejected": -4.330278396606445, + "step": 1168 + }, + { + "epoch": 1.55, + "logps_train/chosen": -81.41835021972656, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -122.6648941040039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7284771203994751, + "rewards_train/margins": 4.707466244697571, + "rewards_train/rejected": -3.9789891242980957, + "step": 1169 + }, + { + "epoch": 1.55, + "learning_rate": 3.588069498409398e-07, + "loss": 0.0236, + "step": 1170 + }, + { + "epoch": 1.55, + "logps_train/chosen": -45.03921127319336, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -119.73301696777344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45545387268066406, + "rewards_train/margins": 5.338130950927734, + "rewards_train/rejected": -4.88267707824707, + "step": 1170 + }, + { + "epoch": 1.56, + "logps_train/chosen": -63.31031799316406, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -145.516357421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7498277425765991, + "rewards_train/margins": 6.492089629173279, + "rewards_train/rejected": -5.74226188659668, + "step": 1171 + }, + { + "epoch": 1.56, + "learning_rate": 3.58311685388965e-07, + "loss": 0.0123, + "step": 1172 + }, + { + "epoch": 1.56, + "logps_train/chosen": -62.23689270019531, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -59.25, + "logps_train/rejected": -101.80392456054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.14076361060142517, + "rewards_train/margins": 4.406313270330429, + "rewards_train/rejected": -4.265549659729004, + "step": 1172 + }, + { + "epoch": 1.56, + "logps_train/chosen": -71.29048156738281, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -110.56542205810547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18188969790935516, + "rewards_train/margins": 4.810307189822197, + "rewards_train/rejected": -4.628417491912842, + "step": 1173 + }, + { + "epoch": 1.56, + "learning_rate": 3.578158970780082e-07, + "loss": 0.0245, + "step": 1174 + }, + { + "epoch": 1.56, + "logps_train/chosen": -62.290924072265625, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -101.00851440429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.06971733272075653, + "rewards_train/margins": 3.6217594891786575, + "rewards_train/rejected": -3.691476821899414, + "step": 1174 + }, + { + "epoch": 1.56, + "logps_train/chosen": -48.831886291503906, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -59.5, + "logps_train/rejected": -100.26868438720703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42149895429611206, + "rewards_train/margins": 4.4999300837516785, + "rewards_train/rejected": -4.078431129455566, + "step": 1175 + }, + { + "epoch": 1.56, + "learning_rate": 3.573195873059932e-07, + "loss": 0.0447, + "step": 1176 + }, + { + "epoch": 1.56, + "logps_train/chosen": -40.008216857910156, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -76.95565795898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22886571288108826, + "rewards_train/margins": 2.8271656930446625, + "rewards_train/rejected": -2.598299980163574, + "step": 1176 + }, + { + "epoch": 1.56, + "logps_train/chosen": -42.883583068847656, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -85.88471221923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9749228954315186, + "rewards_train/margins": 4.632534742355347, + "rewards_train/rejected": -3.657611846923828, + "step": 1177 + }, + { + "epoch": 1.56, + "learning_rate": 3.568227584733656e-07, + "loss": 0.1012, + "step": 1178 + }, + { + "epoch": 1.56, + "logps_train/chosen": -37.951316833496094, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -73.7178955078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1142435073852539, + "rewards_train/margins": 3.128220558166504, + "rewards_train/rejected": -3.01397705078125, + "step": 1178 + }, + { + "epoch": 1.57, + "logps_train/chosen": -50.12800598144531, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -100.78193664550781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.07608184218406677, + "rewards_train/margins": 4.1896118223667145, + "rewards_train/rejected": -4.265693664550781, + "step": 1179 + }, + { + "epoch": 1.57, + "learning_rate": 3.5632541298308194e-07, + "loss": 0.0523, + "step": 1180 + }, + { + "epoch": 1.57, + "logps_train/chosen": -69.09870147705078, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -117.29118347167969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1557547152042389, + "rewards_train/margins": 4.663778930902481, + "rewards_train/rejected": -4.508024215698242, + "step": 1180 + }, + { + "epoch": 1.57, + "logps_train/chosen": -63.69761657714844, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -126.79061889648438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8177383542060852, + "rewards_train/margins": 5.1624258160591125, + "rewards_train/rejected": -4.344687461853027, + "step": 1181 + }, + { + "epoch": 1.57, + "learning_rate": 3.5582755324059727e-07, + "loss": 0.0183, + "step": 1182 + }, + { + "epoch": 1.57, + "logps_train/chosen": -87.4903564453125, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -97.0, + "logps_train/rejected": -144.6702880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8415901064872742, + "rewards_train/margins": 5.608618915081024, + "rewards_train/rejected": -4.76702880859375, + "step": 1182 + }, + { + "epoch": 1.57, + "logps_train/chosen": -60.89975357055664, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -107.13526916503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5334621667861938, + "rewards_train/margins": 4.8805824518203735, + "rewards_train/rejected": -4.34712028503418, + "step": 1183 + }, + { + "epoch": 1.57, + "learning_rate": 3.5532918165385394e-07, + "loss": 0.0181, + "step": 1184 + }, + { + "epoch": 1.57, + "logps_train/chosen": -78.36991119384766, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -137.6177215576172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7301962375640869, + "rewards_train/margins": 5.438843011856079, + "rewards_train/rejected": -4.708646774291992, + "step": 1184 + }, + { + "epoch": 1.57, + "logps_train/chosen": -61.22022247314453, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -112.34793853759766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31469622254371643, + "rewards_train/margins": 4.821365267038345, + "rewards_train/rejected": -4.506669044494629, + "step": 1185 + }, + { + "epoch": 1.58, + "learning_rate": 3.5483030063327e-07, + "loss": 0.0148, + "step": 1186 + }, + { + "epoch": 1.58, + "logps_train/chosen": -60.09109115600586, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -133.95828247070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2237035036087036, + "rewards_train/margins": 6.324219822883606, + "rewards_train/rejected": -5.100516319274902, + "step": 1186 + }, + { + "epoch": 1.58, + "logps_train/chosen": -95.78916931152344, + "logps_train/ref_chosen": -98.0, + "logps_train/ref_rejected": -96.0, + "logps_train/rejected": -143.90614318847656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22108271718025208, + "rewards_train/margins": 5.030447036027908, + "rewards_train/rejected": -4.809364318847656, + "step": 1187 + }, + { + "epoch": 1.58, + "learning_rate": 3.543309125917272e-07, + "loss": 0.0498, + "step": 1188 + }, + { + "epoch": 1.58, + "logps_train/chosen": -50.32183074951172, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -46.25, + "logps_train/rejected": -74.55839538574219, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.12094205617904663, + "rewards_train/margins": 2.9533440470695496, + "rewards_train/rejected": -2.832401990890503, + "step": 1188 + }, + { + "epoch": 1.58, + "logps_train/chosen": -46.14740753173828, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -105.67840576171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6493219137191772, + "rewards_train/margins": 4.544506669044495, + "rewards_train/rejected": -3.8951847553253174, + "step": 1189 + }, + { + "epoch": 1.58, + "learning_rate": 3.5383101994455977e-07, + "loss": 0.1152, + "step": 1190 + }, + { + "epoch": 1.58, + "logps_train/chosen": -68.40486145019531, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -82.69146728515625, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.23060722649097443, + "rewards_train/margins": 3.468504622578621, + "rewards_train/rejected": -3.2378973960876465, + "step": 1190 + }, + { + "epoch": 1.58, + "logps_train/chosen": -30.870201110839844, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -30.625, + "logps_train/rejected": -56.294700622558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.732511043548584, + "rewards_train/margins": 3.30143404006958, + "rewards_train/rejected": -2.568922996520996, + "step": 1191 + }, + { + "epoch": 1.58, + "learning_rate": 3.533306251095425e-07, + "loss": 0.1014, + "step": 1192 + }, + { + "epoch": 1.58, + "logps_train/chosen": -45.39210510253906, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -84.66703796386719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.00864112377166748, + "rewards_train/margins": 3.762454390525818, + "rewards_train/rejected": -3.7538132667541504, + "step": 1192 + }, + { + "epoch": 1.58, + "logps_train/chosen": -88.29261779785156, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -102.97598266601562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2426128387451172, + "rewards_train/margins": 4.42927360534668, + "rewards_train/rejected": -3.1866607666015625, + "step": 1193 + }, + { + "epoch": 1.59, + "learning_rate": 3.5282973050687875e-07, + "loss": 0.0568, + "step": 1194 + }, + { + "epoch": 1.59, + "logps_train/chosen": -63.27000427246094, + "logps_train/ref_chosen": -60.0, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -108.3096923828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.33676597476005554, + "rewards_train/margins": 3.7551415264606476, + "rewards_train/rejected": -4.091907501220703, + "step": 1194 + }, + { + "epoch": 1.59, + "logps_train/chosen": -58.997398376464844, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -90.92861938476562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4861975610256195, + "rewards_train/margins": 4.091559022665024, + "rewards_train/rejected": -3.6053614616394043, + "step": 1195 + }, + { + "epoch": 1.59, + "learning_rate": 3.523283385591895e-07, + "loss": 0.0535, + "step": 1196 + }, + { + "epoch": 1.59, + "logps_train/chosen": -56.97357177734375, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -109.07212829589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3094664216041565, + "rewards_train/margins": 4.800871789455414, + "rewards_train/rejected": -5.11033821105957, + "step": 1196 + }, + { + "epoch": 1.59, + "logps_train/chosen": -46.01885223388672, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -82.29767608642578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.17610406875610352, + "rewards_train/margins": 3.611085891723633, + "rewards_train/rejected": -3.7871899604797363, + "step": 1197 + }, + { + "epoch": 1.59, + "learning_rate": 3.518264516915008e-07, + "loss": 0.0366, + "step": 1198 + }, + { + "epoch": 1.59, + "logps_train/chosen": -49.95056915283203, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -88.54023742675781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.44166162610054016, + "rewards_train/margins": 3.819123536348343, + "rewards_train/rejected": -3.3774619102478027, + "step": 1198 + }, + { + "epoch": 1.59, + "logps_train/chosen": -44.609657287597656, + "logps_train/ref_chosen": -42.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -76.62826538085938, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.26877841353416443, + "rewards_train/margins": 2.64873543381691, + "rewards_train/rejected": -2.917513847351074, + "step": 1199 + }, + { + "epoch": 1.59, + "learning_rate": 3.513240723312326e-07, + "loss": 0.2121, + "step": 1200 + }, + { + "epoch": 1.59, + "logps_train/chosen": -62.72941589355469, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -98.19572448730469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5458085536956787, + "rewards_train/margins": 3.8396003246307373, + "rewards_train/rejected": -3.2937917709350586, + "step": 1200 + }, + { + "epoch": 1.59, + "logps_train/chosen": -53.761287689208984, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -58.75, + "logps_train/rejected": -98.62472534179688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26215219497680664, + "rewards_train/margins": 4.246500015258789, + "rewards_train/rejected": -3.9843478202819824, + "step": 1201 + }, + { + "epoch": 1.6, + "learning_rate": 3.5082120290818685e-07, + "loss": 0.0451, + "step": 1202 + }, + { + "epoch": 1.6, + "logps_train/chosen": -56.65771484375, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -97.57115173339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6967286467552185, + "rewards_train/margins": 4.6272823214530945, + "rewards_train/rejected": -3.930553674697876, + "step": 1202 + }, + { + "epoch": 1.6, + "logps_train/chosen": -76.63764190673828, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -146.3726348876953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2862357199192047, + "rewards_train/margins": 5.442249208688736, + "rewards_train/rejected": -5.156013488769531, + "step": 1203 + }, + { + "epoch": 1.6, + "learning_rate": 3.5031784585453564e-07, + "loss": 0.0303, + "step": 1204 + }, + { + "epoch": 1.6, + "logps_train/chosen": -62.111236572265625, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -106.24639129638672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.18924865126609802, + "rewards_train/margins": 4.051015287637711, + "rewards_train/rejected": -4.240263938903809, + "step": 1204 + }, + { + "epoch": 1.6, + "logps_train/chosen": -86.25396728515625, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -127.8245849609375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.35430294275283813, + "rewards_train/margins": 4.792999684810638, + "rewards_train/rejected": -5.147302627563477, + "step": 1205 + }, + { + "epoch": 1.6, + "learning_rate": 3.498140036048098e-07, + "loss": 0.0505, + "step": 1206 + }, + { + "epoch": 1.6, + "logps_train/chosen": -78.15673828125, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -131.0347137451172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28432610630989075, + "rewards_train/margins": 4.1042037308216095, + "rewards_train/rejected": -3.8198776245117188, + "step": 1206 + }, + { + "epoch": 1.6, + "logps_train/chosen": -58.03449249267578, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -97.57174682617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5809259414672852, + "rewards_train/margins": 4.359975576400757, + "rewards_train/rejected": -3.7790496349334717, + "step": 1207 + }, + { + "epoch": 1.6, + "learning_rate": 3.493096785958863e-07, + "loss": 0.039, + "step": 1208 + }, + { + "epoch": 1.6, + "logps_train/chosen": -75.44340515136719, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -129.421630859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.616596519947052, + "rewards_train/margins": 5.293134033679962, + "rewards_train/rejected": -4.67653751373291, + "step": 1208 + }, + { + "epoch": 1.61, + "logps_train/chosen": -71.10272979736328, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -120.60340881347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07566455006599426, + "rewards_train/margins": 4.165692359209061, + "rewards_train/rejected": -4.090027809143066, + "step": 1209 + }, + { + "epoch": 1.61, + "learning_rate": 3.488048732669776e-07, + "loss": 0.0165, + "step": 1210 + }, + { + "epoch": 1.61, + "logps_train/chosen": -59.339744567871094, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -127.5440902709961, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2285252809524536, + "rewards_train/margins": 6.329809308052063, + "rewards_train/rejected": -5.101284027099609, + "step": 1210 + }, + { + "epoch": 1.61, + "logps_train/chosen": -59.625404357910156, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -99.11957550048828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4937094449996948, + "rewards_train/margins": 4.430667281150818, + "rewards_train/rejected": -3.936957836151123, + "step": 1211 + }, + { + "epoch": 1.61, + "learning_rate": 3.4829959005961885e-07, + "loss": 0.0523, + "step": 1212 + }, + { + "epoch": 1.61, + "logps_train/chosen": -92.18133544921875, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -131.87074279785156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15999111533164978, + "rewards_train/margins": 4.354878276586533, + "rewards_train/rejected": -4.194887161254883, + "step": 1212 + }, + { + "epoch": 1.61, + "logps_train/chosen": -66.0252685546875, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -125.59088134765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.25721460580825806, + "rewards_train/margins": 4.74874871969223, + "rewards_train/rejected": -5.005963325500488, + "step": 1213 + }, + { + "epoch": 1.61, + "learning_rate": 3.4779383141765685e-07, + "loss": 0.0283, + "step": 1214 + }, + { + "epoch": 1.61, + "logps_train/chosen": -49.56360626220703, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -125.35612487792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4983266592025757, + "rewards_train/margins": 5.301126837730408, + "rewards_train/rejected": -4.802800178527832, + "step": 1214 + }, + { + "epoch": 1.61, + "logps_train/chosen": -51.14799499511719, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -61.25, + "logps_train/rejected": -98.65750122070312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3371535837650299, + "rewards_train/margins": 4.070091158151627, + "rewards_train/rejected": -3.7329375743865967, + "step": 1215 + }, + { + "epoch": 1.61, + "learning_rate": 3.4728759978723756e-07, + "loss": 0.0633, + "step": 1216 + }, + { + "epoch": 1.61, + "logps_train/chosen": -41.30083084106445, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -93.78266906738281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.956244945526123, + "rewards_train/margins": 4.874355792999268, + "rewards_train/rejected": -3.9181108474731445, + "step": 1216 + }, + { + "epoch": 1.62, + "logps_train/chosen": -75.86930847167969, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -97.98799133300781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9615062475204468, + "rewards_train/margins": 4.86889922618866, + "rewards_train/rejected": -3.907392978668213, + "step": 1217 + }, + { + "epoch": 1.62, + "learning_rate": 3.4678089761679484e-07, + "loss": 0.0284, + "step": 1218 + }, + { + "epoch": 1.62, + "logps_train/chosen": -45.80632400512695, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -77.6366958618164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6521801352500916, + "rewards_train/margins": 3.8248338103294373, + "rewards_train/rejected": -3.1726536750793457, + "step": 1218 + }, + { + "epoch": 1.62, + "logps_train/chosen": -101.06536865234375, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -152.53598022460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.012212730944156647, + "rewards_train/margins": 5.637685276567936, + "rewards_train/rejected": -5.625472545623779, + "step": 1219 + }, + { + "epoch": 1.62, + "learning_rate": 3.4627372735703816e-07, + "loss": 0.0252, + "step": 1220 + }, + { + "epoch": 1.62, + "logps_train/chosen": -39.80760955810547, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -88.94658660888672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7004889249801636, + "rewards_train/margins": 5.056084990501404, + "rewards_train/rejected": -4.35559606552124, + "step": 1220 + }, + { + "epoch": 1.62, + "logps_train/chosen": -56.40376281738281, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -101.62013244628906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7744676470756531, + "rewards_train/margins": 3.9200751185417175, + "rewards_train/rejected": -3.1456074714660645, + "step": 1221 + }, + { + "epoch": 1.62, + "learning_rate": 3.457660914609411e-07, + "loss": 0.0717, + "step": 1222 + }, + { + "epoch": 1.62, + "logps_train/chosen": -75.43949890136719, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -111.02293395996094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6974563598632812, + "rewards_train/margins": 3.5106873512268066, + "rewards_train/rejected": -2.8132309913635254, + "step": 1222 + }, + { + "epoch": 1.62, + "logps_train/chosen": -69.11103820800781, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -105.63573455810547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1693650186061859, + "rewards_train/margins": 4.284501165151596, + "rewards_train/rejected": -4.11513614654541, + "step": 1223 + }, + { + "epoch": 1.63, + "learning_rate": 3.452579923837292e-07, + "loss": 0.0426, + "step": 1224 + }, + { + "epoch": 1.63, + "logps_train/chosen": -75.84112548828125, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -105.89423370361328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1471378803253174, + "rewards_train/margins": 4.413904905319214, + "rewards_train/rejected": -3.2667670249938965, + "step": 1224 + }, + { + "epoch": 1.63, + "logps_train/chosen": -37.039520263671875, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -95.42317962646484, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8288604021072388, + "rewards_train/margins": 4.6032103300094604, + "rewards_train/rejected": -3.7743499279022217, + "step": 1225 + }, + { + "epoch": 1.63, + "learning_rate": 3.447494325828685e-07, + "loss": 0.0413, + "step": 1226 + }, + { + "epoch": 1.63, + "logps_train/chosen": -48.159095764160156, + "logps_train/ref_chosen": -47.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -89.51729583740234, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08934725821018219, + "rewards_train/margins": 4.0967575162649155, + "rewards_train/rejected": -4.186104774475098, + "step": 1226 + }, + { + "epoch": 1.63, + "logps_train/chosen": -84.19065856933594, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -134.36306762695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2425037920475006, + "rewards_train/margins": 5.409428924322128, + "rewards_train/rejected": -5.651932716369629, + "step": 1227 + }, + { + "epoch": 1.63, + "learning_rate": 3.442404145180528e-07, + "loss": 0.0432, + "step": 1228 + }, + { + "epoch": 1.63, + "logps_train/chosen": -38.32874298095703, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -38.5, + "logps_train/rejected": -68.63412475585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6839224696159363, + "rewards_train/margins": 3.6903037428855896, + "rewards_train/rejected": -3.0063812732696533, + "step": 1228 + }, + { + "epoch": 1.63, + "logps_train/chosen": -73.37425231933594, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -108.77545166015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.29382458329200745, + "rewards_train/margins": 3.9494947493076324, + "rewards_train/rejected": -3.655670166015625, + "step": 1229 + }, + { + "epoch": 1.63, + "learning_rate": 3.43730940651193e-07, + "loss": 0.0368, + "step": 1230 + }, + { + "epoch": 1.63, + "logps_train/chosen": -70.14106750488281, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -109.42921447753906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.18637317419052124, + "rewards_train/margins": 4.055767118930817, + "rewards_train/rejected": -4.242140293121338, + "step": 1230 + }, + { + "epoch": 1.63, + "logps_train/chosen": -62.73509979248047, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -93.09678649902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0218026638031006, + "rewards_train/margins": 3.65804386138916, + "rewards_train/rejected": -2.6362411975860596, + "step": 1231 + }, + { + "epoch": 1.64, + "learning_rate": 3.4322101344640404e-07, + "loss": 0.0791, + "step": 1232 + }, + { + "epoch": 1.64, + "logps_train/chosen": -89.06459045410156, + "logps_train/ref_chosen": -94.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -145.55136108398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5497916340827942, + "rewards_train/margins": 4.658833801746368, + "rewards_train/rejected": -4.109042167663574, + "step": 1232 + }, + { + "epoch": 1.64, + "logps_train/chosen": -69.12921142578125, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -145.77700805664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9370793700218201, + "rewards_train/margins": 5.205406129360199, + "rewards_train/rejected": -4.268326759338379, + "step": 1233 + }, + { + "epoch": 1.64, + "learning_rate": 3.427106353699937e-07, + "loss": 0.0509, + "step": 1234 + }, + { + "epoch": 1.64, + "logps_train/chosen": -77.95858764648438, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -108.58003997802734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4329679608345032, + "rewards_train/margins": 3.6328489184379578, + "rewards_train/rejected": -4.065816879272461, + "step": 1234 + }, + { + "epoch": 1.64, + "logps_train/chosen": -76.71270751953125, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -132.99984741210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.0556458979845047, + "rewards_train/margins": 4.6990261524915695, + "rewards_train/rejected": -4.754672050476074, + "step": 1235 + }, + { + "epoch": 1.64, + "learning_rate": 3.421998088904504e-07, + "loss": 0.0467, + "step": 1236 + }, + { + "epoch": 1.64, + "logps_train/chosen": -43.599815368652344, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -39.25, + "logps_train/rejected": -70.81554412841797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09880734980106354, + "rewards_train/margins": 3.2440338283777237, + "rewards_train/rejected": -3.14522647857666, + "step": 1236 + }, + { + "epoch": 1.64, + "logps_train/chosen": -99.39512634277344, + "logps_train/ref_chosen": -102.0, + "logps_train/ref_rejected": -115.5, + "logps_train/rejected": -180.20960998535156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.27220600843429565, + "rewards_train/margins": 6.7302767634391785, + "rewards_train/rejected": -6.458070755004883, + "step": 1237 + }, + { + "epoch": 1.64, + "learning_rate": 3.416885364784313e-07, + "loss": 0.0708, + "step": 1238 + }, + { + "epoch": 1.64, + "logps_train/chosen": -79.3169174194336, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -120.51239013671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2026832103729248, + "rewards_train/margins": 5.737516641616821, + "rewards_train/rejected": -4.5348334312438965, + "step": 1238 + }, + { + "epoch": 1.65, + "logps_train/chosen": -64.22528839111328, + "logps_train/ref_chosen": -64.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -125.53378295898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.03463807702064514, + "rewards_train/margins": 4.275771468877792, + "rewards_train/rejected": -4.3104095458984375, + "step": 1239 + }, + { + "epoch": 1.65, + "learning_rate": 3.411768206067503e-07, + "loss": 0.0428, + "step": 1240 + }, + { + "epoch": 1.65, + "logps_train/chosen": -49.51910400390625, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -90.89720153808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7508238554000854, + "rewards_train/margins": 4.360856413841248, + "rewards_train/rejected": -3.610032558441162, + "step": 1240 + }, + { + "epoch": 1.65, + "logps_train/chosen": -53.82635498046875, + "logps_train/ref_chosen": -60.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -91.87954711914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6658023595809937, + "rewards_train/margins": 4.072490811347961, + "rewards_train/rejected": -3.4066884517669678, + "step": 1241 + }, + { + "epoch": 1.65, + "learning_rate": 3.4066466375036617e-07, + "loss": 0.0831, + "step": 1242 + }, + { + "epoch": 1.65, + "logps_train/chosen": -80.1483154296875, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -132.58181762695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5070439577102661, + "rewards_train/margins": 4.799601197242737, + "rewards_train/rejected": -4.292557239532471, + "step": 1242 + }, + { + "epoch": 1.65, + "logps_train/chosen": -46.889129638671875, + "logps_train/ref_chosen": -46.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -82.32633209228516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.07172523438930511, + "rewards_train/margins": 3.551533356308937, + "rewards_train/rejected": -3.623258590698242, + "step": 1243 + }, + { + "epoch": 1.65, + "learning_rate": 3.401520683863706e-07, + "loss": 0.0464, + "step": 1244 + }, + { + "epoch": 1.65, + "logps_train/chosen": -33.233970642089844, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -82.40876770019531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4750401973724365, + "rewards_train/margins": 4.036229848861694, + "rewards_train/rejected": -3.561189651489258, + "step": 1244 + }, + { + "epoch": 1.65, + "logps_train/chosen": -40.76352310180664, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -93.35983276367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3650539517402649, + "rewards_train/margins": 3.613537609577179, + "rewards_train/rejected": -3.248483657836914, + "step": 1245 + }, + { + "epoch": 1.65, + "learning_rate": 3.39639036993976e-07, + "loss": 0.0773, + "step": 1246 + }, + { + "epoch": 1.65, + "logps_train/chosen": -89.75995635986328, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -99.5, + "logps_train/rejected": -142.31585693359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3599419593811035, + "rewards_train/margins": 4.6329345703125, + "rewards_train/rejected": -4.2729926109313965, + "step": 1246 + }, + { + "epoch": 1.66, + "logps_train/chosen": -49.58206558227539, + "logps_train/ref_chosen": -47.75, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -121.36693572998047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1794956773519516, + "rewards_train/margins": 5.027509823441505, + "rewards_train/rejected": -5.207005500793457, + "step": 1247 + }, + { + "epoch": 1.66, + "learning_rate": 3.391255720545039e-07, + "loss": 0.0413, + "step": 1248 + }, + { + "epoch": 1.66, + "logps_train/chosen": -46.93727111816406, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -100.52733612060547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1140857934951782, + "rewards_train/margins": 3.9293192625045776, + "rewards_train/rejected": -2.8152334690093994, + "step": 1248 + }, + { + "epoch": 1.66, + "logps_train/chosen": -71.68449401855469, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -143.488525390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08311371505260468, + "rewards_train/margins": 5.366341635584831, + "rewards_train/rejected": -5.283227920532227, + "step": 1249 + }, + { + "epoch": 1.66, + "learning_rate": 3.386116760513724e-07, + "loss": 0.0267, + "step": 1250 + }, + { + "epoch": 1.66, + "logps_train/chosen": -43.33943176269531, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -59.0, + "logps_train/rejected": -97.82997131347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1644940972328186, + "rewards_train/margins": 4.059990704059601, + "rewards_train/rejected": -3.8954966068267822, + "step": 1250 + }, + { + "epoch": 1.66, + "logps_train/chosen": -69.80758666992188, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -87.5067367553711, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5895535945892334, + "rewards_train/margins": 3.7371022701263428, + "rewards_train/rejected": -3.1475486755371094, + "step": 1251 + }, + { + "epoch": 1.66, + "learning_rate": 3.380973514700849e-07, + "loss": 0.0384, + "step": 1252 + }, + { + "epoch": 1.66, + "logps_train/chosen": -61.50464630126953, + "logps_train/ref_chosen": -59.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -116.66973876953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.21140184998512268, + "rewards_train/margins": 4.689947217702866, + "rewards_train/rejected": -4.901349067687988, + "step": 1252 + }, + { + "epoch": 1.66, + "logps_train/chosen": -65.73272705078125, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -114.96029663085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.06350772082805634, + "rewards_train/margins": 4.2254908829927444, + "rewards_train/rejected": -4.288998603820801, + "step": 1253 + }, + { + "epoch": 1.67, + "learning_rate": 3.375826007982172e-07, + "loss": 0.0365, + "step": 1254 + }, + { + "epoch": 1.67, + "logps_train/chosen": -82.12040710449219, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -165.12677001953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4738973379135132, + "rewards_train/margins": 6.805324912071228, + "rewards_train/rejected": -6.331427574157715, + "step": 1254 + }, + { + "epoch": 1.67, + "logps_train/chosen": -41.567134857177734, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -47.25, + "logps_train/rejected": -81.79862213134766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3729740381240845, + "rewards_train/margins": 3.8278361558914185, + "rewards_train/rejected": -3.454862117767334, + "step": 1255 + }, + { + "epoch": 1.67, + "learning_rate": 3.3706742652540635e-07, + "loss": 0.0312, + "step": 1256 + }, + { + "epoch": 1.67, + "logps_train/chosen": -68.29951477050781, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -137.4251708984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04973659664392471, + "rewards_train/margins": 4.779754258692265, + "rewards_train/rejected": -4.73001766204834, + "step": 1256 + }, + { + "epoch": 1.67, + "logps_train/chosen": -34.21421813964844, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -72.66361999511719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6504534482955933, + "rewards_train/margins": 3.2168158292770386, + "rewards_train/rejected": -2.5663623809814453, + "step": 1257 + }, + { + "epoch": 1.67, + "learning_rate": 3.3655183114333783e-07, + "loss": 0.0653, + "step": 1258 + }, + { + "epoch": 1.67, + "logps_train/chosen": -65.99408721923828, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -140.14695739746094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5646533966064453, + "rewards_train/margins": 4.665286064147949, + "rewards_train/rejected": -4.100632667541504, + "step": 1258 + }, + { + "epoch": 1.67, + "logps_train/chosen": -58.06837844848633, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -124.74282836914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31347498297691345, + "rewards_train/margins": 5.089125961065292, + "rewards_train/rejected": -4.775650978088379, + "step": 1259 + }, + { + "epoch": 1.67, + "learning_rate": 3.3603581714573414e-07, + "loss": 0.0321, + "step": 1260 + }, + { + "epoch": 1.67, + "logps_train/chosen": -55.80063247680664, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -117.1109619140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1379057168960571, + "rewards_train/margins": 5.629470467567444, + "rewards_train/rejected": -4.491564750671387, + "step": 1260 + }, + { + "epoch": 1.67, + "logps_train/chosen": -78.77391815185547, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -123.04335021972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3460455536842346, + "rewards_train/margins": 4.267567694187164, + "rewards_train/rejected": -3.9215221405029297, + "step": 1261 + }, + { + "epoch": 1.68, + "learning_rate": 3.355193870283422e-07, + "loss": 0.0735, + "step": 1262 + }, + { + "epoch": 1.68, + "logps_train/chosen": -78.79950714111328, + "logps_train/ref_chosen": -84.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -103.21495056152344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5762991905212402, + "rewards_train/margins": 4.366544008255005, + "rewards_train/rejected": -3.7902448177337646, + "step": 1262 + }, + { + "epoch": 1.68, + "logps_train/chosen": -109.18474578857422, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -130.0, + "logps_train/rejected": -193.97239685058594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4028494954109192, + "rewards_train/margins": 6.012358367443085, + "rewards_train/rejected": -6.415207862854004, + "step": 1263 + }, + { + "epoch": 1.68, + "learning_rate": 3.3500254328892154e-07, + "loss": 0.0396, + "step": 1264 + }, + { + "epoch": 1.68, + "logps_train/chosen": -94.49217224121094, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -162.7421112060547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04453215003013611, + "rewards_train/margins": 6.174992889165878, + "rewards_train/rejected": -6.130460739135742, + "step": 1264 + }, + { + "epoch": 1.68, + "logps_train/chosen": -101.43020629882812, + "logps_train/ref_chosen": -100.5, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -166.46823120117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09614566713571548, + "rewards_train/margins": 5.439740143716335, + "rewards_train/rejected": -5.535885810852051, + "step": 1265 + }, + { + "epoch": 1.68, + "learning_rate": 3.3448528842723255e-07, + "loss": 0.0297, + "step": 1266 + }, + { + "epoch": 1.68, + "logps_train/chosen": -60.23590087890625, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -61.5, + "logps_train/rejected": -103.900146484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.34203481674194336, + "rewards_train/margins": 4.57267427444458, + "rewards_train/rejected": -4.230639457702637, + "step": 1266 + }, + { + "epoch": 1.68, + "logps_train/chosen": -53.927001953125, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -87.64691162109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3165586292743683, + "rewards_train/margins": 3.380663186311722, + "rewards_train/rejected": -3.0641045570373535, + "step": 1267 + }, + { + "epoch": 1.68, + "learning_rate": 3.3396762494502373e-07, + "loss": 0.0633, + "step": 1268 + }, + { + "epoch": 1.68, + "logps_train/chosen": -41.89490509033203, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -69.8431396484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2326774597167969, + "rewards_train/margins": 3.6228513717651367, + "rewards_train/rejected": -2.39017391204834, + "step": 1268 + }, + { + "epoch": 1.69, + "logps_train/chosen": -69.00054168701172, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -96.66023254394531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.40033674240112305, + "rewards_train/margins": 3.6028835773468018, + "rewards_train/rejected": -3.2025468349456787, + "step": 1269 + }, + { + "epoch": 1.69, + "learning_rate": 3.3344955534601993e-07, + "loss": 0.1047, + "step": 1270 + }, + { + "epoch": 1.69, + "logps_train/chosen": -45.46272659301758, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.25, + "logps_train/rejected": -70.67433166503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.14705386757850647, + "rewards_train/margins": 2.8000667989254, + "rewards_train/rejected": -2.9471206665039062, + "step": 1270 + }, + { + "epoch": 1.69, + "logps_train/chosen": -43.96276092529297, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -92.99159240722656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.630286693572998, + "rewards_train/margins": 4.413820743560791, + "rewards_train/rejected": -3.783534049987793, + "step": 1271 + }, + { + "epoch": 1.69, + "learning_rate": 3.329310821359103e-07, + "loss": 0.0707, + "step": 1272 + }, + { + "epoch": 1.69, + "logps_train/chosen": -43.77528381347656, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -60.25, + "logps_train/rejected": -98.27178955078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26544010639190674, + "rewards_train/margins": 4.076994061470032, + "rewards_train/rejected": -3.811553955078125, + "step": 1272 + }, + { + "epoch": 1.69, + "logps_train/chosen": -41.8399658203125, + "logps_train/ref_chosen": -44.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -87.28164672851562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2628786265850067, + "rewards_train/margins": 4.003543347120285, + "rewards_train/rejected": -3.7406647205352783, + "step": 1273 + }, + { + "epoch": 1.69, + "learning_rate": 3.324122078223361e-07, + "loss": 0.0237, + "step": 1274 + }, + { + "epoch": 1.69, + "logps_train/chosen": -32.85054397583008, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -80.73304748535156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4895550310611725, + "rewards_train/margins": 4.506219059228897, + "rewards_train/rejected": -4.016664028167725, + "step": 1274 + }, + { + "epoch": 1.69, + "logps_train/chosen": -89.41511535644531, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -94.0, + "logps_train/rejected": -143.1314697265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4475514590740204, + "rewards_train/margins": 5.384525507688522, + "rewards_train/rejected": -4.936974048614502, + "step": 1275 + }, + { + "epoch": 1.69, + "learning_rate": 3.318929349148786e-07, + "loss": 0.034, + "step": 1276 + }, + { + "epoch": 1.69, + "logps_train/chosen": -62.960792541503906, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -118.03072357177734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.4320461750030518, + "rewards_train/margins": 5.686681509017944, + "rewards_train/rejected": -4.254635334014893, + "step": 1276 + }, + { + "epoch": 1.7, + "logps_train/chosen": -84.21917724609375, + "logps_train/ref_chosen": -95.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -154.69448852539062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0749566555023193, + "rewards_train/margins": 6.483468294143677, + "rewards_train/rejected": -5.408511638641357, + "step": 1277 + }, + { + "epoch": 1.7, + "learning_rate": 3.313732659250467e-07, + "loss": 0.0187, + "step": 1278 + }, + { + "epoch": 1.7, + "logps_train/chosen": -45.43123245239258, + "logps_train/ref_chosen": -46.5, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -87.5999526977539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.11361478269100189, + "rewards_train/margins": 2.879859670996666, + "rewards_train/rejected": -2.766244888305664, + "step": 1278 + }, + { + "epoch": 1.7, + "logps_train/chosen": -61.76457977294922, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -85.24491119384766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09003237634897232, + "rewards_train/margins": 3.2684433087706566, + "rewards_train/rejected": -3.358475685119629, + "step": 1279 + }, + { + "epoch": 1.7, + "learning_rate": 3.3085320336626515e-07, + "loss": 0.1438, + "step": 1280 + }, + { + "epoch": 1.7, + "logps_train/chosen": -61.346763610839844, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -119.7281494140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42157384753227234, + "rewards_train/margins": 4.979935377836227, + "rewards_train/rejected": -4.558361530303955, + "step": 1280 + }, + { + "epoch": 1.7, + "logps_train/chosen": -48.748931884765625, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -110.11878204345703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26416897773742676, + "rewards_train/margins": 4.830734968185425, + "rewards_train/rejected": -4.566565990447998, + "step": 1281 + }, + { + "epoch": 1.7, + "learning_rate": 3.3033274975386233e-07, + "loss": 0.0336, + "step": 1282 + }, + { + "epoch": 1.7, + "logps_train/chosen": -46.555450439453125, + "logps_train/ref_chosen": -53.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -81.43418884277344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7061734199523926, + "rewards_train/margins": 3.6132636070251465, + "rewards_train/rejected": -2.907090187072754, + "step": 1282 + }, + { + "epoch": 1.7, + "logps_train/chosen": -61.14817810058594, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -112.43848419189453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.36721375584602356, + "rewards_train/margins": 4.464187175035477, + "rewards_train/rejected": -4.096973419189453, + "step": 1283 + }, + { + "epoch": 1.71, + "learning_rate": 3.2981190760505765e-07, + "loss": 0.0464, + "step": 1284 + }, + { + "epoch": 1.71, + "logps_train/chosen": -60.14622497558594, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -114.14445495605469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20568996667861938, + "rewards_train/margins": 4.806073725223541, + "rewards_train/rejected": -4.600383758544922, + "step": 1284 + }, + { + "epoch": 1.71, + "logps_train/chosen": -54.61853790283203, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -55.25, + "logps_train/rejected": -101.54710388183594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10298976302146912, + "rewards_train/margins": 4.732309907674789, + "rewards_train/rejected": -4.62932014465332, + "step": 1285 + }, + { + "epoch": 1.71, + "learning_rate": 3.292906794389502e-07, + "loss": 0.0315, + "step": 1286 + }, + { + "epoch": 1.71, + "logps_train/chosen": -56.822174072265625, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -84.07611846923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0806734561920166, + "rewards_train/margins": 3.540238618850708, + "rewards_train/rejected": -3.4595651626586914, + "step": 1286 + }, + { + "epoch": 1.71, + "logps_train/chosen": -77.15886688232422, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -139.53407287597656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.37786340713500977, + "rewards_train/margins": 5.300020694732666, + "rewards_train/rejected": -4.922157287597656, + "step": 1287 + }, + { + "epoch": 1.71, + "learning_rate": 3.287690677765055e-07, + "loss": 0.0776, + "step": 1288 + }, + { + "epoch": 1.71, + "logps_train/chosen": -62.614532470703125, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -90.66036987304688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.001828014850616455, + "rewards_train/margins": 4.078802406787872, + "rewards_train/rejected": -4.076974391937256, + "step": 1288 + }, + { + "epoch": 1.71, + "logps_train/chosen": -60.615699768066406, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -118.24978637695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04936763644218445, + "rewards_train/margins": 4.41497203707695, + "rewards_train/rejected": -4.365604400634766, + "step": 1289 + }, + { + "epoch": 1.71, + "learning_rate": 3.2824707514054433e-07, + "loss": 0.0342, + "step": 1290 + }, + { + "epoch": 1.71, + "logps_train/chosen": -74.69686889648438, + "logps_train/ref_chosen": -85.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -108.50785827636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.013126015663147, + "rewards_train/margins": 4.848286747932434, + "rewards_train/rejected": -3.835160732269287, + "step": 1290 + }, + { + "epoch": 1.71, + "logps_train/chosen": -64.16402435302734, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -78.5, + "logps_train/rejected": -129.58990478515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7031289935112, + "rewards_train/margins": 5.807431876659393, + "rewards_train/rejected": -5.104302883148193, + "step": 1291 + }, + { + "epoch": 1.72, + "learning_rate": 3.2772470405572994e-07, + "loss": 0.0125, + "step": 1292 + }, + { + "epoch": 1.72, + "logps_train/chosen": -50.25048065185547, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -92.09669494628906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.21401438117027283, + "rewards_train/margins": 4.4080584943294525, + "rewards_train/rejected": -4.19404411315918, + "step": 1292 + }, + { + "epoch": 1.72, + "logps_train/chosen": -81.95832824707031, + "logps_train/ref_chosen": -91.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -131.42608642578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9760424494743347, + "rewards_train/margins": 5.707713305950165, + "rewards_train/rejected": -4.73167085647583, + "step": 1293 + }, + { + "epoch": 1.72, + "learning_rate": 3.272019570485559e-07, + "loss": 0.0133, + "step": 1294 + }, + { + "epoch": 1.72, + "logps_train/chosen": -88.58802795410156, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -152.03662109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.036926984786987305, + "rewards_train/margins": 4.297986268997192, + "rewards_train/rejected": -4.33491325378418, + "step": 1294 + }, + { + "epoch": 1.72, + "logps_train/chosen": -67.58811950683594, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -114.74232482910156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6646257638931274, + "rewards_train/margins": 4.795107960700989, + "rewards_train/rejected": -4.130482196807861, + "step": 1295 + }, + { + "epoch": 1.72, + "learning_rate": 3.266788366473342e-07, + "loss": 0.0345, + "step": 1296 + }, + { + "epoch": 1.72, + "logps_train/chosen": -89.37545776367188, + "logps_train/ref_chosen": -99.0, + "logps_train/ref_rejected": -103.5, + "logps_train/rejected": -152.94842529296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9780793190002441, + "rewards_train/margins": 5.938547611236572, + "rewards_train/rejected": -4.960468292236328, + "step": 1296 + }, + { + "epoch": 1.72, + "logps_train/chosen": -79.32011413574219, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -129.9368896484375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.021113455295562744, + "rewards_train/margins": 4.710116446018219, + "rewards_train/rejected": -4.689002990722656, + "step": 1297 + }, + { + "epoch": 1.72, + "learning_rate": 3.261553453821825e-07, + "loss": 0.0172, + "step": 1298 + }, + { + "epoch": 1.72, + "logps_train/chosen": -68.74508666992188, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -99.09082794189453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4676787257194519, + "rewards_train/margins": 4.200198829174042, + "rewards_train/rejected": -3.73252010345459, + "step": 1298 + }, + { + "epoch": 1.73, + "logps_train/chosen": -83.87431335449219, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -131.5478515625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.30319344997406006, + "rewards_train/margins": 4.748603701591492, + "rewards_train/rejected": -4.445410251617432, + "step": 1299 + }, + { + "epoch": 1.73, + "learning_rate": 3.2563148578501227e-07, + "loss": 0.0485, + "step": 1300 + }, + { + "epoch": 1.73, + "logps_train/chosen": -31.09895133972168, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -44.5, + "logps_train/rejected": -75.96563720703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5858080387115479, + "rewards_train/margins": 3.7446768283843994, + "rewards_train/rejected": -3.1588687896728516, + "step": 1300 + }, + { + "epoch": 1.73, + "logps_train/chosen": -51.630897521972656, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -83.62006378173828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5837852954864502, + "rewards_train/margins": 3.0903234481811523, + "rewards_train/rejected": -2.506538152694702, + "step": 1301 + }, + { + "epoch": 1.73, + "learning_rate": 3.2510726038951646e-07, + "loss": 0.0963, + "step": 1302 + }, + { + "epoch": 1.73, + "logps_train/chosen": -87.4047622680664, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -113.35118865966797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08218032121658325, + "rewards_train/margins": 4.093471348285675, + "rewards_train/rejected": -4.011291027069092, + "step": 1302 + }, + { + "epoch": 1.73, + "logps_train/chosen": -74.77301025390625, + "logps_train/ref_chosen": -75.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -100.62956237792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09349971264600754, + "rewards_train/margins": 3.402939848601818, + "rewards_train/rejected": -3.3094401359558105, + "step": 1303 + }, + { + "epoch": 1.73, + "learning_rate": 3.2458267173115737e-07, + "loss": 0.0907, + "step": 1304 + }, + { + "epoch": 1.73, + "logps_train/chosen": -51.64720916748047, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -105.64906311035156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1561269462108612, + "rewards_train/margins": 4.755654841661453, + "rewards_train/rejected": -4.9117817878723145, + "step": 1304 + }, + { + "epoch": 1.73, + "logps_train/chosen": -72.72596740722656, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -123.30240631103516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5460348129272461, + "rewards_train/margins": 4.737330913543701, + "rewards_train/rejected": -5.283365726470947, + "step": 1305 + }, + { + "epoch": 1.73, + "learning_rate": 3.24057722347154e-07, + "loss": 0.0497, + "step": 1306 + }, + { + "epoch": 1.73, + "logps_train/chosen": -63.780242919921875, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -103.94377136230469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1569300889968872, + "rewards_train/margins": 4.264010310173035, + "rewards_train/rejected": -4.420940399169922, + "step": 1306 + }, + { + "epoch": 1.74, + "logps_train/chosen": -77.91183471679688, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -114.68812561035156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.10993298143148422, + "rewards_train/margins": 4.554192461073399, + "rewards_train/rejected": -4.664125442504883, + "step": 1307 + }, + { + "epoch": 1.74, + "learning_rate": 3.235324147764703e-07, + "loss": 0.0471, + "step": 1308 + }, + { + "epoch": 1.74, + "logps_train/chosen": -56.85551452636719, + "logps_train/ref_chosen": -63.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -109.82720947265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.664448618888855, + "rewards_train/margins": 4.522169470787048, + "rewards_train/rejected": -3.8577208518981934, + "step": 1308 + }, + { + "epoch": 1.74, + "logps_train/chosen": -42.04686737060547, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -58.00616455078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8031256794929504, + "rewards_train/margins": 3.019074499607086, + "rewards_train/rejected": -2.2159488201141357, + "step": 1309 + }, + { + "epoch": 1.74, + "learning_rate": 3.230067515598024e-07, + "loss": 0.0603, + "step": 1310 + }, + { + "epoch": 1.74, + "logps_train/chosen": -40.37076950073242, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -88.0704345703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8988605737686157, + "rewards_train/margins": 3.9996544122695923, + "rewards_train/rejected": -3.1007938385009766, + "step": 1310 + }, + { + "epoch": 1.74, + "logps_train/chosen": -25.2216796875, + "logps_train/ref_chosen": -30.5, + "logps_train/ref_rejected": -33.0, + "logps_train/rejected": -59.214603424072266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5270507335662842, + "rewards_train/margins": 3.1594483852386475, + "rewards_train/rejected": -2.6323976516723633, + "step": 1311 + }, + { + "epoch": 1.74, + "learning_rate": 3.224807352395666e-07, + "loss": 0.0905, + "step": 1312 + }, + { + "epoch": 1.74, + "logps_train/chosen": -34.91630172729492, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -76.10845947265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3021198511123657, + "rewards_train/margins": 3.6141377687454224, + "rewards_train/rejected": -3.3120179176330566, + "step": 1312 + }, + { + "epoch": 1.74, + "logps_train/chosen": -46.99993133544922, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -81.25511932373047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.14686810970306396, + "rewards_train/margins": 3.9263006448745728, + "rewards_train/rejected": -4.073168754577637, + "step": 1313 + }, + { + "epoch": 1.75, + "learning_rate": 3.219543683598871e-07, + "loss": 0.0724, + "step": 1314 + }, + { + "epoch": 1.75, + "logps_train/chosen": -45.52054977416992, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -92.04312896728516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3526324927806854, + "rewards_train/margins": 3.9659297168254852, + "rewards_train/rejected": -3.6132972240448, + "step": 1314 + }, + { + "epoch": 1.75, + "logps_train/chosen": -72.036376953125, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -137.9612274169922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22448745369911194, + "rewards_train/margins": 5.926860481500626, + "rewards_train/rejected": -5.702373027801514, + "step": 1315 + }, + { + "epoch": 1.75, + "learning_rate": 3.2142765346658365e-07, + "loss": 0.054, + "step": 1316 + }, + { + "epoch": 1.75, + "logps_train/chosen": -77.81909942626953, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -118.39262390136719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6170035004615784, + "rewards_train/margins": 4.7422032952308655, + "rewards_train/rejected": -4.125199794769287, + "step": 1316 + }, + { + "epoch": 1.75, + "logps_train/chosen": -83.16867065429688, + "logps_train/ref_chosen": -97.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -116.9627685546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.4487583637237549, + "rewards_train/margins": 5.229410171508789, + "rewards_train/rejected": -3.780651807785034, + "step": 1317 + }, + { + "epoch": 1.75, + "learning_rate": 3.2090059310715883e-07, + "loss": 0.0192, + "step": 1318 + }, + { + "epoch": 1.75, + "logps_train/chosen": -61.10783004760742, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -96.2449722290039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4907795488834381, + "rewards_train/margins": 3.826214224100113, + "rewards_train/rejected": -3.335434675216675, + "step": 1318 + }, + { + "epoch": 1.75, + "logps_train/chosen": -36.37995910644531, + "logps_train/ref_chosen": -38.0, + "logps_train/ref_rejected": -52.75, + "logps_train/rejected": -91.6414794921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1729416847229004, + "rewards_train/margins": 4.052714824676514, + "rewards_train/rejected": -3.8797731399536133, + "step": 1319 + }, + { + "epoch": 1.75, + "learning_rate": 3.203731898307867e-07, + "loss": 0.0541, + "step": 1320 + }, + { + "epoch": 1.75, + "logps_train/chosen": -54.88543701171875, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -102.17388916015625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4903629422187805, + "rewards_train/margins": 4.4983771443367, + "rewards_train/rejected": -4.00801420211792, + "step": 1320 + }, + { + "epoch": 1.75, + "logps_train/chosen": -75.27879333496094, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -126.5898666381836, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5471201539039612, + "rewards_train/margins": 5.068606436252594, + "rewards_train/rejected": -4.521486282348633, + "step": 1321 + }, + { + "epoch": 1.76, + "learning_rate": 3.1984544618829923e-07, + "loss": 0.0172, + "step": 1322 + }, + { + "epoch": 1.76, + "logps_train/chosen": -81.05743408203125, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -130.18179321289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09636864066123962, + "rewards_train/margins": 4.442123562097549, + "rewards_train/rejected": -4.538492202758789, + "step": 1322 + }, + { + "epoch": 1.76, + "logps_train/chosen": -45.12523651123047, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -89.20333862304688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9437263011932373, + "rewards_train/margins": 4.018747806549072, + "rewards_train/rejected": -3.075021505355835, + "step": 1323 + }, + { + "epoch": 1.76, + "learning_rate": 3.1931736473217517e-07, + "loss": 0.0527, + "step": 1324 + }, + { + "epoch": 1.76, + "logps_train/chosen": -53.421531677246094, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -116.16659545898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08121564239263535, + "rewards_train/margins": 5.327631212770939, + "rewards_train/rejected": -5.408846855163574, + "step": 1324 + }, + { + "epoch": 1.76, + "logps_train/chosen": -54.0560188293457, + "logps_train/ref_chosen": -56.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -91.33196258544922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26002299785614014, + "rewards_train/margins": 3.7357975244522095, + "rewards_train/rejected": -3.4757745265960693, + "step": 1325 + }, + { + "epoch": 1.76, + "learning_rate": 3.1878894801652673e-07, + "loss": 0.0372, + "step": 1326 + }, + { + "epoch": 1.76, + "logps_train/chosen": -34.62523651123047, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -37.25, + "logps_train/rejected": -66.30241394042969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8921639323234558, + "rewards_train/margins": 3.801311194896698, + "rewards_train/rejected": -2.909147262573242, + "step": 1326 + }, + { + "epoch": 1.76, + "logps_train/chosen": -78.89883422851562, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -117.92451477050781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6773035526275635, + "rewards_train/margins": 4.893192529678345, + "rewards_train/rejected": -4.215888977050781, + "step": 1327 + }, + { + "epoch": 1.76, + "learning_rate": 3.182601985970878e-07, + "loss": 0.0394, + "step": 1328 + }, + { + "epoch": 1.76, + "logps_train/chosen": -70.3931655883789, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -94.61600494384766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.41700220108032227, + "rewards_train/margins": 3.1875672340393066, + "rewards_train/rejected": -3.604569435119629, + "step": 1328 + }, + { + "epoch": 1.76, + "logps_train/chosen": -67.95346069335938, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -103.56622314453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0749666839838028, + "rewards_train/margins": 4.701901212334633, + "rewards_train/rejected": -4.62693452835083, + "step": 1329 + }, + { + "epoch": 1.77, + "learning_rate": 3.177311190312015e-07, + "loss": 0.0359, + "step": 1330 + }, + { + "epoch": 1.77, + "logps_train/chosen": -53.2960319519043, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -100.29216766357422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3296031057834625, + "rewards_train/margins": 4.238676160573959, + "rewards_train/rejected": -4.568279266357422, + "step": 1330 + }, + { + "epoch": 1.77, + "logps_train/chosen": -41.40813446044922, + "logps_train/ref_chosen": -38.5, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -75.69696044921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.29237616062164307, + "rewards_train/margins": 2.7179449796676636, + "rewards_train/rejected": -3.0103211402893066, + "step": 1331 + }, + { + "epoch": 1.77, + "learning_rate": 3.172017118778075e-07, + "loss": 0.0622, + "step": 1332 + }, + { + "epoch": 1.77, + "logps_train/chosen": -51.72608184814453, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -94.88812255859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04956158250570297, + "rewards_train/margins": 3.7173761501908302, + "rewards_train/rejected": -3.766937732696533, + "step": 1332 + }, + { + "epoch": 1.77, + "logps_train/chosen": -39.467689514160156, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -94.80979919433594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7989343404769897, + "rewards_train/margins": 5.029133200645447, + "rewards_train/rejected": -4.230198860168457, + "step": 1333 + }, + { + "epoch": 1.77, + "learning_rate": 3.166719796974301e-07, + "loss": 0.0445, + "step": 1334 + }, + { + "epoch": 1.77, + "logps_train/chosen": -58.66218185424805, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -140.8789520263672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4697190523147583, + "rewards_train/margins": 5.642769932746887, + "rewards_train/rejected": -5.173050880432129, + "step": 1334 + }, + { + "epoch": 1.77, + "logps_train/chosen": -61.016143798828125, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -102.59158325195312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07026086747646332, + "rewards_train/margins": 4.2325442880392075, + "rewards_train/rejected": -4.162283420562744, + "step": 1335 + }, + { + "epoch": 1.77, + "learning_rate": 3.161419250521654e-07, + "loss": 0.025, + "step": 1336 + }, + { + "epoch": 1.77, + "logps_train/chosen": -64.99016571044922, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -109.13330078125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.060358256101608276, + "rewards_train/margins": 4.165876120328903, + "rewards_train/rejected": -4.105517864227295, + "step": 1336 + }, + { + "epoch": 1.78, + "logps_train/chosen": -47.07199478149414, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -57.5, + "logps_train/rejected": -94.9906997680664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2529566287994385, + "rewards_train/margins": 3.999682903289795, + "rewards_train/rejected": -3.7467262744903564, + "step": 1337 + }, + { + "epoch": 1.78, + "learning_rate": 3.156115505056695e-07, + "loss": 0.0522, + "step": 1338 + }, + { + "epoch": 1.78, + "logps_train/chosen": -52.91387939453125, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -85.16996002197266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0101745128631592, + "rewards_train/margins": 3.552170515060425, + "rewards_train/rejected": -2.5419960021972656, + "step": 1338 + }, + { + "epoch": 1.78, + "logps_train/chosen": -70.00985717773438, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -128.4418487548828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18338876962661743, + "rewards_train/margins": 5.141636431217194, + "rewards_train/rejected": -4.958247661590576, + "step": 1339 + }, + { + "epoch": 1.78, + "learning_rate": 3.150808586231452e-07, + "loss": 0.065, + "step": 1340 + }, + { + "epoch": 1.78, + "logps_train/chosen": -53.93753433227539, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -104.31352233886719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32187145948410034, + "rewards_train/margins": 4.325098216533661, + "rewards_train/rejected": -4.0032267570495605, + "step": 1340 + }, + { + "epoch": 1.78, + "logps_train/chosen": -69.31382751464844, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -113.26199340820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.22669503092765808, + "rewards_train/margins": 4.296379595994949, + "rewards_train/rejected": -4.523074626922607, + "step": 1341 + }, + { + "epoch": 1.78, + "learning_rate": 3.145498519713306e-07, + "loss": 0.0498, + "step": 1342 + }, + { + "epoch": 1.78, + "logps_train/chosen": -61.64352798461914, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -97.69657135009766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5583034753799438, + "rewards_train/margins": 4.020148158073425, + "rewards_train/rejected": -3.4618446826934814, + "step": 1342 + }, + { + "epoch": 1.78, + "logps_train/chosen": -54.30946350097656, + "logps_train/ref_chosen": -53.75, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -103.72923278808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04813361167907715, + "rewards_train/margins": 4.229477167129517, + "rewards_train/rejected": -4.277610778808594, + "step": 1343 + }, + { + "epoch": 1.78, + "learning_rate": 3.1401853311848596e-07, + "loss": 0.0545, + "step": 1344 + }, + { + "epoch": 1.78, + "logps_train/chosen": -53.456260681152344, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -86.74205780029297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4582805633544922, + "rewards_train/margins": 4.213736534118652, + "rewards_train/rejected": -3.75545597076416, + "step": 1344 + }, + { + "epoch": 1.79, + "logps_train/chosen": -50.258140563964844, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -74.61735534667969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.104720339179039, + "rewards_train/margins": 2.8138508945703506, + "rewards_train/rejected": -2.9185712337493896, + "step": 1345 + }, + { + "epoch": 1.79, + "learning_rate": 3.1348690463438165e-07, + "loss": 0.1014, + "step": 1346 + }, + { + "epoch": 1.79, + "logps_train/chosen": -68.87425231933594, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -118.25076293945312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3375745415687561, + "rewards_train/margins": 4.933354914188385, + "rewards_train/rejected": -4.595780372619629, + "step": 1346 + }, + { + "epoch": 1.79, + "logps_train/chosen": -50.996910095214844, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -82.05500030517578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4098471999168396, + "rewards_train/margins": 2.964402973651886, + "rewards_train/rejected": -3.3742501735687256, + "step": 1347 + }, + { + "epoch": 1.79, + "learning_rate": 3.1295496909028543e-07, + "loss": 0.1072, + "step": 1348 + }, + { + "epoch": 1.79, + "logps_train/chosen": -86.40998840332031, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -153.81192016601562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -1.0511549711227417, + "rewards_train/margins": 5.5425368547439575, + "rewards_train/rejected": -6.593691825866699, + "step": 1348 + }, + { + "epoch": 1.79, + "logps_train/chosen": -30.314226150512695, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -69.81155395507812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5365461111068726, + "rewards_train/margins": 3.830983281135559, + "rewards_train/rejected": -3.2944371700286865, + "step": 1349 + }, + { + "epoch": 1.79, + "learning_rate": 3.1242272905895046e-07, + "loss": 0.1951, + "step": 1350 + }, + { + "epoch": 1.79, + "logps_train/chosen": -68.84539794921875, + "logps_train/ref_chosen": -74.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -114.88736724853516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5037418007850647, + "rewards_train/margins": 4.87685352563858, + "rewards_train/rejected": -4.373111724853516, + "step": 1350 + }, + { + "epoch": 1.79, + "logps_train/chosen": -59.06312561035156, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -139.22659301757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20931226015090942, + "rewards_train/margins": 5.7147844433784485, + "rewards_train/rejected": -5.505472183227539, + "step": 1351 + }, + { + "epoch": 1.8, + "learning_rate": 3.118901871146022e-07, + "loss": 0.0268, + "step": 1352 + }, + { + "epoch": 1.8, + "logps_train/chosen": -46.557125091552734, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -124.31275177001953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4902836084365845, + "rewards_train/margins": 5.107496857643127, + "rewards_train/rejected": -4.617213249206543, + "step": 1352 + }, + { + "epoch": 1.8, + "logps_train/chosen": -40.084022521972656, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -99.239013671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8228479623794556, + "rewards_train/margins": 4.921749711036682, + "rewards_train/rejected": -4.098901748657227, + "step": 1353 + }, + { + "epoch": 1.8, + "learning_rate": 3.1135734583292673e-07, + "loss": 0.0388, + "step": 1354 + }, + { + "epoch": 1.8, + "logps_train/chosen": -51.53060531616211, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -50.5, + "logps_train/rejected": -91.71237182617188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.17662709951400757, + "rewards_train/margins": 4.291614472866058, + "rewards_train/rejected": -4.114987373352051, + "step": 1354 + }, + { + "epoch": 1.8, + "logps_train/chosen": -66.9827651977539, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -119.54243469238281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.21546393632888794, + "rewards_train/margins": 5.341806590557098, + "rewards_train/rejected": -5.557270526885986, + "step": 1355 + }, + { + "epoch": 1.8, + "learning_rate": 3.108242077910576e-07, + "loss": 0.05, + "step": 1356 + }, + { + "epoch": 1.8, + "logps_train/chosen": -49.62389373779297, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -102.19573211669922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5012826919555664, + "rewards_train/margins": 3.2974185943603516, + "rewards_train/rejected": -2.796135902404785, + "step": 1356 + }, + { + "epoch": 1.8, + "logps_train/chosen": -44.61186599731445, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -52.5, + "logps_train/rejected": -85.93205261230469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10600095987319946, + "rewards_train/margins": 3.443933069705963, + "rewards_train/rejected": -3.3379321098327637, + "step": 1357 + }, + { + "epoch": 1.8, + "learning_rate": 3.102907755675638e-07, + "loss": 0.1483, + "step": 1358 + }, + { + "epoch": 1.8, + "logps_train/chosen": -63.868865966796875, + "logps_train/ref_chosen": -74.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -114.36643981933594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0756137371063232, + "rewards_train/margins": 5.237258195877075, + "rewards_train/rejected": -4.161644458770752, + "step": 1358 + }, + { + "epoch": 1.8, + "logps_train/chosen": -86.20753479003906, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -117.86534118652344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10580922663211823, + "rewards_train/margins": 4.81734324991703, + "rewards_train/rejected": -4.711534023284912, + "step": 1359 + }, + { + "epoch": 1.81, + "learning_rate": 3.097570517424373e-07, + "loss": 0.0238, + "step": 1360 + }, + { + "epoch": 1.81, + "logps_train/chosen": -78.66271209716797, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -138.128662109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.44349443912506104, + "rewards_train/margins": 5.445423245429993, + "rewards_train/rejected": -5.001928806304932, + "step": 1360 + }, + { + "epoch": 1.81, + "logps_train/chosen": -87.16780090332031, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -95.0, + "logps_train/rejected": -141.34060668945312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15353183448314667, + "rewards_train/margins": 4.781342312693596, + "rewards_train/rejected": -4.627810478210449, + "step": 1361 + }, + { + "epoch": 1.81, + "learning_rate": 3.0922303889708007e-07, + "loss": 0.0549, + "step": 1362 + }, + { + "epoch": 1.81, + "logps_train/chosen": -74.5985107421875, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -122.53484344482422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12471897900104523, + "rewards_train/margins": 4.932500675320625, + "rewards_train/rejected": -4.80778169631958, + "step": 1362 + }, + { + "epoch": 1.81, + "logps_train/chosen": -65.30064392089844, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -105.40562438964844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4910295903682709, + "rewards_train/margins": 4.389404147863388, + "rewards_train/rejected": -3.898374557495117, + "step": 1363 + }, + { + "epoch": 1.81, + "learning_rate": 3.0868873961429225e-07, + "loss": 0.0566, + "step": 1364 + }, + { + "epoch": 1.81, + "logps_train/chosen": -83.2791748046875, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -113.5, + "logps_train/rejected": -178.60520935058594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4408326745033264, + "rewards_train/margins": 6.935729563236237, + "rewards_train/rejected": -6.49489688873291, + "step": 1364 + }, + { + "epoch": 1.81, + "logps_train/chosen": -56.840614318847656, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -104.32028198242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.02296966314315796, + "rewards_train/margins": 4.161247670650482, + "rewards_train/rejected": -4.138278007507324, + "step": 1365 + }, + { + "epoch": 1.81, + "learning_rate": 3.081541564782592e-07, + "loss": 0.0369, + "step": 1366 + }, + { + "epoch": 1.81, + "logps_train/chosen": -63.11859130859375, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -109.56233215332031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5115785598754883, + "rewards_train/margins": 4.5162482261657715, + "rewards_train/rejected": -4.004669666290283, + "step": 1366 + }, + { + "epoch": 1.82, + "logps_train/chosen": -41.38029479980469, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -69.97128295898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5994703769683838, + "rewards_train/margins": 3.5626144409179688, + "rewards_train/rejected": -2.963144063949585, + "step": 1367 + }, + { + "epoch": 1.82, + "learning_rate": 3.0761929207453935e-07, + "loss": 0.0316, + "step": 1368 + }, + { + "epoch": 1.82, + "logps_train/chosen": -49.12171173095703, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -93.09168243408203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8253287076950073, + "rewards_train/margins": 3.757153868675232, + "rewards_train/rejected": -2.9318251609802246, + "step": 1368 + }, + { + "epoch": 1.82, + "logps_train/chosen": -59.18602752685547, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -104.74286651611328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7985846996307373, + "rewards_train/margins": 4.64474630355835, + "rewards_train/rejected": -3.8461616039276123, + "step": 1369 + }, + { + "epoch": 1.82, + "learning_rate": 3.0708414899005126e-07, + "loss": 0.0386, + "step": 1370 + }, + { + "epoch": 1.82, + "logps_train/chosen": -66.77318572998047, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -100.14771270751953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23869720101356506, + "rewards_train/margins": 3.458155781030655, + "rewards_train/rejected": -3.21945858001709, + "step": 1370 + }, + { + "epoch": 1.82, + "logps_train/chosen": -35.78307342529297, + "logps_train/ref_chosen": -35.5, + "logps_train/ref_rejected": -48.0, + "logps_train/rejected": -86.46635437011719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.027526073157787323, + "rewards_train/margins": 3.8073905184865, + "rewards_train/rejected": -3.834916591644287, + "step": 1371 + }, + { + "epoch": 1.82, + "learning_rate": 3.065487298130615e-07, + "loss": 0.0487, + "step": 1372 + }, + { + "epoch": 1.82, + "logps_train/chosen": -54.715187072753906, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -95.5133056640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04847194254398346, + "rewards_train/margins": 4.31301449239254, + "rewards_train/rejected": -4.361486434936523, + "step": 1372 + }, + { + "epoch": 1.82, + "logps_train/chosen": -81.32508850097656, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -126.1264877319336, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7253035306930542, + "rewards_train/margins": 4.677796721458435, + "rewards_train/rejected": -3.952493190765381, + "step": 1373 + }, + { + "epoch": 1.82, + "learning_rate": 3.0601303713317193e-07, + "loss": 0.0426, + "step": 1374 + }, + { + "epoch": 1.82, + "logps_train/chosen": -65.03947448730469, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -117.6639175415039, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26489973068237305, + "rewards_train/margins": 4.2225022315979, + "rewards_train/rejected": -3.9576025009155273, + "step": 1374 + }, + { + "epoch": 1.83, + "logps_train/chosen": -52.906349182128906, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -98.38458251953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9304587244987488, + "rewards_train/margins": 5.2571980357170105, + "rewards_train/rejected": -4.326739311218262, + "step": 1375 + }, + { + "epoch": 1.83, + "learning_rate": 3.0547707354130734e-07, + "loss": 0.0586, + "step": 1376 + }, + { + "epoch": 1.83, + "logps_train/chosen": -53.370025634765625, + "logps_train/ref_chosen": -57.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -110.36749267578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4239352345466614, + "rewards_train/margins": 4.55443412065506, + "rewards_train/rejected": -4.130498886108398, + "step": 1376 + }, + { + "epoch": 1.83, + "logps_train/chosen": -85.61115264892578, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -98.0, + "logps_train/rejected": -148.0464630126953, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12325990200042725, + "rewards_train/margins": 5.131031394004822, + "rewards_train/rejected": -5.0077714920043945, + "step": 1377 + }, + { + "epoch": 1.83, + "learning_rate": 3.049408416297026e-07, + "loss": 0.0178, + "step": 1378 + }, + { + "epoch": 1.83, + "logps_train/chosen": -52.071075439453125, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -115.71488952636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3452361524105072, + "rewards_train/margins": 5.394850105047226, + "rewards_train/rejected": -5.049613952636719, + "step": 1378 + }, + { + "epoch": 1.83, + "logps_train/chosen": -41.01616287231445, + "logps_train/ref_chosen": -41.5, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -87.26249694824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04103497788310051, + "rewards_train/margins": 3.7391598634421825, + "rewards_train/rejected": -3.698124885559082, + "step": 1379 + }, + { + "epoch": 1.83, + "learning_rate": 3.044043439918907e-07, + "loss": 0.0449, + "step": 1380 + }, + { + "epoch": 1.83, + "logps_train/chosen": -56.04023742675781, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -98.2576904296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38191360235214233, + "rewards_train/margins": 4.482682645320892, + "rewards_train/rejected": -4.10076904296875, + "step": 1380 + }, + { + "epoch": 1.83, + "logps_train/chosen": -42.693115234375, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -97.98914337158203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28459465503692627, + "rewards_train/margins": 3.8905402421951294, + "rewards_train/rejected": -3.605945587158203, + "step": 1381 + }, + { + "epoch": 1.84, + "learning_rate": 3.038675832226893e-07, + "loss": 0.0483, + "step": 1382 + }, + { + "epoch": 1.84, + "logps_train/chosen": -66.4197998046875, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -93.89173889160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.028331995010376, + "rewards_train/margins": 4.651880502700806, + "rewards_train/rejected": -3.6235485076904297, + "step": 1382 + }, + { + "epoch": 1.84, + "logps_train/chosen": -61.731849670410156, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -104.83549499511719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2596275508403778, + "rewards_train/margins": 3.9259893596172333, + "rewards_train/rejected": -3.6663618087768555, + "step": 1383 + }, + { + "epoch": 1.84, + "learning_rate": 3.0333056191818925e-07, + "loss": 0.0232, + "step": 1384 + }, + { + "epoch": 1.84, + "logps_train/chosen": -66.45545196533203, + "logps_train/ref_chosen": -69.0, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -106.81305694580078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.24820439517498016, + "rewards_train/margins": 4.028728649020195, + "rewards_train/rejected": -3.780524253845215, + "step": 1384 + }, + { + "epoch": 1.84, + "logps_train/chosen": -85.05638122558594, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -109.67033386230469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5771746039390564, + "rewards_train/margins": 4.189520061016083, + "rewards_train/rejected": -3.6123454570770264, + "step": 1385 + }, + { + "epoch": 1.84, + "learning_rate": 3.027932826757411e-07, + "loss": 0.0873, + "step": 1386 + }, + { + "epoch": 1.84, + "logps_train/chosen": -43.98091125488281, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -82.68589782714844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.28550249338150024, + "rewards_train/margins": 3.1714263558387756, + "rewards_train/rejected": -2.8859238624572754, + "step": 1386 + }, + { + "epoch": 1.84, + "logps_train/chosen": -60.19977569580078, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -119.281494140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.04966506361961365, + "rewards_train/margins": 4.203484445810318, + "rewards_train/rejected": -4.253149509429932, + "step": 1387 + }, + { + "epoch": 1.84, + "learning_rate": 3.022557480939432e-07, + "loss": 0.0857, + "step": 1388 + }, + { + "epoch": 1.84, + "logps_train/chosen": -59.94446563720703, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -121.92713928222656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.09444694221019745, + "rewards_train/margins": 4.720141604542732, + "rewards_train/rejected": -4.81458854675293, + "step": 1388 + }, + { + "epoch": 1.84, + "logps_train/chosen": -69.99658966064453, + "logps_train/ref_chosen": -79.5, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -134.48577880859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9550284147262573, + "rewards_train/margins": 5.450480818748474, + "rewards_train/rejected": -4.495452404022217, + "step": 1389 + }, + { + "epoch": 1.85, + "learning_rate": 3.017179607726288e-07, + "loss": 0.0496, + "step": 1390 + }, + { + "epoch": 1.85, + "logps_train/chosen": -46.09320068359375, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -48.5, + "logps_train/rejected": -91.91936492919922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31724274158477783, + "rewards_train/margins": 4.6708985567092896, + "rewards_train/rejected": -4.353655815124512, + "step": 1390 + }, + { + "epoch": 1.85, + "logps_train/chosen": -88.20574951171875, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -150.81411743164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.16848719120025635, + "rewards_train/margins": 5.192086815834045, + "rewards_train/rejected": -5.023599624633789, + "step": 1391 + }, + { + "epoch": 1.85, + "learning_rate": 3.0117992331285346e-07, + "loss": 0.0288, + "step": 1392 + }, + { + "epoch": 1.85, + "logps_train/chosen": -76.96282958984375, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -123.3883056640625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4216860234737396, + "rewards_train/margins": 4.7558296620845795, + "rewards_train/rejected": -4.33414363861084, + "step": 1392 + }, + { + "epoch": 1.85, + "logps_train/chosen": -71.67112731933594, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -121.38059997558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4161360263824463, + "rewards_train/margins": 4.078564882278442, + "rewards_train/rejected": -4.494700908660889, + "step": 1393 + }, + { + "epoch": 1.85, + "learning_rate": 3.0064163831688274e-07, + "loss": 0.0609, + "step": 1394 + }, + { + "epoch": 1.85, + "logps_train/chosen": -58.07894515991211, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -111.1194839477539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.26414453983306885, + "rewards_train/margins": 3.816553473472595, + "rewards_train/rejected": -4.080698013305664, + "step": 1394 + }, + { + "epoch": 1.85, + "logps_train/chosen": -47.85907745361328, + "logps_train/ref_chosen": -51.75, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -96.53277587890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.380498468875885, + "rewards_train/margins": 4.1837761998176575, + "rewards_train/rejected": -3.8032777309417725, + "step": 1395 + }, + { + "epoch": 1.85, + "learning_rate": 3.001031083881791e-07, + "loss": 0.0422, + "step": 1396 + }, + { + "epoch": 1.85, + "logps_train/chosen": -49.59112548828125, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -97.49778747558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5502627491950989, + "rewards_train/margins": 4.206291019916534, + "rewards_train/rejected": -3.6560282707214355, + "step": 1396 + }, + { + "epoch": 1.86, + "logps_train/chosen": -52.57244110107422, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -110.08491516113281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.055255651473999, + "rewards_train/margins": 5.044997453689575, + "rewards_train/rejected": -3.989741802215576, + "step": 1397 + }, + { + "epoch": 1.86, + "learning_rate": 2.995643361313901e-07, + "loss": 0.0265, + "step": 1398 + }, + { + "epoch": 1.86, + "logps_train/chosen": -52.32046127319336, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -116.98666381835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.9452974796295166, + "rewards_train/margins": 4.649042367935181, + "rewards_train/rejected": -3.703744888305664, + "step": 1398 + }, + { + "epoch": 1.86, + "logps_train/chosen": -48.648155212402344, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -90.46015930175781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12971559166908264, + "rewards_train/margins": 4.2390128672122955, + "rewards_train/rejected": -4.109297275543213, + "step": 1399 + }, + { + "epoch": 1.86, + "learning_rate": 2.990253241523349e-07, + "loss": 0.034, + "step": 1400 + }, + { + "epoch": 1.86, + "logps_train/chosen": -61.207550048828125, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -99.67433166503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6120576858520508, + "rewards_train/margins": 4.791991233825684, + "rewards_train/rejected": -4.179933547973633, + "step": 1400 + }, + { + "epoch": 1.86, + "logps_train/chosen": -74.7815170288086, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -109.3410873413086, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.3038794994354248, + "rewards_train/margins": 5.159863471984863, + "rewards_train/rejected": -3.8559839725494385, + "step": 1401 + }, + { + "epoch": 1.86, + "learning_rate": 2.9848607505799245e-07, + "loss": 0.0403, + "step": 1402 + }, + { + "epoch": 1.86, + "logps_train/chosen": -35.02925109863281, + "logps_train/ref_chosen": -42.25, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -101.90641021728516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7111372947692871, + "rewards_train/margins": 4.845528602600098, + "rewards_train/rejected": -4.1343913078308105, + "step": 1402 + }, + { + "epoch": 1.86, + "logps_train/chosen": -80.09613037109375, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -110.01890563964844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19741851091384888, + "rewards_train/margins": 3.974309265613556, + "rewards_train/rejected": -3.776890754699707, + "step": 1403 + }, + { + "epoch": 1.86, + "learning_rate": 2.9794659145648814e-07, + "loss": 0.0535, + "step": 1404 + }, + { + "epoch": 1.86, + "logps_train/chosen": -50.37534713745117, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -93.46333312988281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.21715278923511505, + "rewards_train/margins": 3.934580758213997, + "rewards_train/rejected": -3.717427968978882, + "step": 1404 + }, + { + "epoch": 1.87, + "logps_train/chosen": -60.197166442871094, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -121.78974914550781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.19002953171730042, + "rewards_train/margins": 5.2483201920986176, + "rewards_train/rejected": -5.438349723815918, + "step": 1405 + }, + { + "epoch": 1.87, + "learning_rate": 2.97406875957082e-07, + "loss": 0.0632, + "step": 1406 + }, + { + "epoch": 1.87, + "logps_train/chosen": -51.564117431640625, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -102.11001586914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6717129945755005, + "rewards_train/margins": 3.973730683326721, + "rewards_train/rejected": -3.3020176887512207, + "step": 1406 + }, + { + "epoch": 1.87, + "logps_train/chosen": -49.958805084228516, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -96.32981872558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5056820511817932, + "rewards_train/margins": 3.9699142575263977, + "rewards_train/rejected": -3.4642322063446045, + "step": 1407 + }, + { + "epoch": 1.87, + "learning_rate": 2.968669311701555e-07, + "loss": 0.0476, + "step": 1408 + }, + { + "epoch": 1.87, + "logps_train/chosen": -54.87819290161133, + "logps_train/ref_chosen": -63.75, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -117.6095962524414, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.877805769443512, + "rewards_train/margins": 6.093452990055084, + "rewards_train/rejected": -5.215647220611572, + "step": 1408 + }, + { + "epoch": 1.87, + "logps_train/chosen": -79.81818389892578, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -124.37107849121094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15568169951438904, + "rewards_train/margins": 5.692789167165756, + "rewards_train/rejected": -5.537107467651367, + "step": 1409 + }, + { + "epoch": 1.87, + "learning_rate": 2.963267597071988e-07, + "loss": 0.0149, + "step": 1410 + }, + { + "epoch": 1.87, + "logps_train/chosen": -32.90620422363281, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -34.5, + "logps_train/rejected": -55.9189567565918, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.2570356130599976, + "rewards_train/margins": 3.4086965322494507, + "rewards_train/rejected": -2.151660919189453, + "step": 1410 + }, + { + "epoch": 1.87, + "logps_train/chosen": -48.267337799072266, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -87.08444213867188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.44748491048812866, + "rewards_train/margins": 4.073897540569305, + "rewards_train/rejected": -3.6264126300811768, + "step": 1411 + }, + { + "epoch": 1.88, + "learning_rate": 2.9578636418079894e-07, + "loss": 0.0714, + "step": 1412 + }, + { + "epoch": 1.88, + "logps_train/chosen": -92.38131713867188, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -102.0, + "logps_train/rejected": -156.2255859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1725068986415863, + "rewards_train/margins": 5.25630322098732, + "rewards_train/rejected": -5.428810119628906, + "step": 1412 + }, + { + "epoch": 1.88, + "logps_train/chosen": -63.55613708496094, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -116.3621826171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15366077423095703, + "rewards_train/margins": 4.344276428222656, + "rewards_train/rejected": -4.497937202453613, + "step": 1413 + }, + { + "epoch": 1.88, + "learning_rate": 2.952457472046261e-07, + "loss": 0.0124, + "step": 1414 + }, + { + "epoch": 1.88, + "logps_train/chosen": -74.3561782836914, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -100.5, + "logps_train/rejected": -148.5107879638672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32375743985176086, + "rewards_train/margins": 5.1373365223407745, + "rewards_train/rejected": -4.813579082489014, + "step": 1414 + }, + { + "epoch": 1.88, + "logps_train/chosen": -44.059532165527344, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -79.52403259277344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.11748401820659637, + "rewards_train/margins": 3.5456690043210983, + "rewards_train/rejected": -3.428184986114502, + "step": 1415 + }, + { + "epoch": 1.88, + "learning_rate": 2.947049113934219e-07, + "loss": 0.0369, + "step": 1416 + }, + { + "epoch": 1.88, + "logps_train/chosen": -70.86552429199219, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -107.1171875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.025947168469429016, + "rewards_train/margins": 3.9083684235811234, + "rewards_train/rejected": -3.8824212551116943, + "step": 1416 + }, + { + "epoch": 1.88, + "logps_train/chosen": -48.52073669433594, + "logps_train/ref_chosen": -53.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -99.45140075683594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45339542627334595, + "rewards_train/margins": 4.861035883426666, + "rewards_train/rejected": -4.40764045715332, + "step": 1417 + }, + { + "epoch": 1.88, + "learning_rate": 2.941638593629863e-07, + "loss": 0.0284, + "step": 1418 + }, + { + "epoch": 1.88, + "logps_train/chosen": -35.82040023803711, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -98.71434783935547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.37468594312667847, + "rewards_train/margins": 4.558620631694794, + "rewards_train/rejected": -4.183934688568115, + "step": 1418 + }, + { + "epoch": 1.88, + "logps_train/chosen": -42.31189727783203, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -79.3904037475586, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5063104033470154, + "rewards_train/margins": 4.03519481420517, + "rewards_train/rejected": -3.5288844108581543, + "step": 1419 + }, + { + "epoch": 1.89, + "learning_rate": 2.9362259373016483e-07, + "loss": 0.0267, + "step": 1420 + }, + { + "epoch": 1.89, + "logps_train/chosen": -60.00281524658203, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -122.89885711669922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18682795763015747, + "rewards_train/margins": 5.225542366504669, + "rewards_train/rejected": -5.038714408874512, + "step": 1420 + }, + { + "epoch": 1.89, + "logps_train/chosen": -52.390525817871094, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -99.8812484741211, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3828224539756775, + "rewards_train/margins": 4.5131348967552185, + "rewards_train/rejected": -4.130312442779541, + "step": 1421 + }, + { + "epoch": 1.89, + "learning_rate": 2.9308111711283633e-07, + "loss": 0.026, + "step": 1422 + }, + { + "epoch": 1.89, + "logps_train/chosen": -70.5267105102539, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -127.44424438476562, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": 0.2817041873931885, + "rewards_train/margins": 5.202691316604614, + "rewards_train/rejected": -4.920987129211426, + "step": 1422 + }, + { + "epoch": 1.89, + "logps_train/chosen": -55.425819396972656, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -120.81999206542969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.046488210558891296, + "rewards_train/margins": 4.937073782086372, + "rewards_train/rejected": -4.983561992645264, + "step": 1423 + }, + { + "epoch": 1.89, + "learning_rate": 2.925394321299002e-07, + "loss": 0.12, + "step": 1424 + }, + { + "epoch": 1.89, + "logps_train/chosen": -64.47172546386719, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -129.3775634765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.09188999235630035, + "rewards_train/margins": 5.49527134001255, + "rewards_train/rejected": -5.40338134765625, + "step": 1424 + }, + { + "epoch": 1.89, + "logps_train/chosen": -107.11774444580078, + "logps_train/ref_chosen": -104.5, + "logps_train/ref_rejected": -99.0, + "logps_train/rejected": -143.26275634765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.23833663761615753, + "rewards_train/margins": 4.184032365679741, + "rewards_train/rejected": -4.422369003295898, + "step": 1425 + }, + { + "epoch": 1.89, + "learning_rate": 2.919975414012632e-07, + "loss": 0.0337, + "step": 1426 + }, + { + "epoch": 1.89, + "logps_train/chosen": -74.57425689697266, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -138.93374633789062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22460520267486572, + "rewards_train/margins": 5.713682770729065, + "rewards_train/rejected": -5.489077568054199, + "step": 1426 + }, + { + "epoch": 1.9, + "logps_train/chosen": -39.20121383666992, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -93.17411804199219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8892538547515869, + "rewards_train/margins": 5.139869928359985, + "rewards_train/rejected": -4.250616073608398, + "step": 1427 + }, + { + "epoch": 1.9, + "learning_rate": 2.9145544754782766e-07, + "loss": 0.0245, + "step": 1428 + }, + { + "epoch": 1.9, + "logps_train/chosen": -82.14473724365234, + "logps_train/ref_chosen": -87.5, + "logps_train/ref_rejected": -84.0, + "logps_train/rejected": -128.3704833984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.527713418006897, + "rewards_train/margins": 4.969449162483215, + "rewards_train/rejected": -4.441735744476318, + "step": 1428 + }, + { + "epoch": 1.9, + "logps_train/chosen": -35.23230743408203, + "logps_train/ref_chosen": -35.25, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -84.66827392578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.0009880661964416504, + "rewards_train/margins": 3.5846124291419983, + "rewards_train/rejected": -3.5836243629455566, + "step": 1429 + }, + { + "epoch": 1.9, + "learning_rate": 2.909131531914779e-07, + "loss": 0.0461, + "step": 1430 + }, + { + "epoch": 1.9, + "logps_train/chosen": -96.21036529541016, + "logps_train/ref_chosen": -96.0, + "logps_train/ref_rejected": -104.0, + "logps_train/rejected": -153.4139404296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.002792343497276306, + "rewards_train/margins": 4.91918657720089, + "rewards_train/rejected": -4.916394233703613, + "step": 1430 + }, + { + "epoch": 1.9, + "logps_train/chosen": -79.72184753417969, + "logps_train/ref_chosen": -88.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -122.22181701660156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8121906518936157, + "rewards_train/margins": 5.806247353553772, + "rewards_train/rejected": -4.994056701660156, + "step": 1431 + }, + { + "epoch": 1.9, + "learning_rate": 2.9037066095506844e-07, + "loss": 0.0229, + "step": 1432 + }, + { + "epoch": 1.9, + "logps_train/chosen": -69.42741394042969, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -107.76435089111328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.009602285921573639, + "rewards_train/margins": 4.163380362093449, + "rewards_train/rejected": -4.153778076171875, + "step": 1432 + }, + { + "epoch": 1.9, + "logps_train/chosen": -56.00580596923828, + "logps_train/ref_chosen": -59.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -116.01048278808594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3276299834251404, + "rewards_train/margins": 4.127505719661713, + "rewards_train/rejected": -3.7998757362365723, + "step": 1433 + }, + { + "epoch": 1.9, + "learning_rate": 2.898279734624105e-07, + "loss": 0.0571, + "step": 1434 + }, + { + "epoch": 1.9, + "logps_train/chosen": -42.48843002319336, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -98.35917663574219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8495945930480957, + "rewards_train/margins": 5.057387351989746, + "rewards_train/rejected": -4.20779275894165, + "step": 1434 + }, + { + "epoch": 1.91, + "logps_train/chosen": -66.51753234863281, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -129.76156616210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.35605907440185547, + "rewards_train/margins": 5.3915910720825195, + "rewards_train/rejected": -5.035531997680664, + "step": 1435 + }, + { + "epoch": 1.91, + "learning_rate": 2.8928509333825986e-07, + "loss": 0.0115, + "step": 1436 + }, + { + "epoch": 1.91, + "logps_train/chosen": -53.16557312011719, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -116.50257110595703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.12016167491674423, + "rewards_train/margins": 4.847762726247311, + "rewards_train/rejected": -4.727601051330566, + "step": 1436 + }, + { + "epoch": 1.91, + "logps_train/chosen": -52.30064392089844, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -98.35253143310547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6800918579101562, + "rewards_train/margins": 5.034095764160156, + "rewards_train/rejected": -4.35400390625, + "step": 1437 + }, + { + "epoch": 1.91, + "learning_rate": 2.88742023208304e-07, + "loss": 0.0195, + "step": 1438 + }, + { + "epoch": 1.91, + "logps_train/chosen": -65.48428344726562, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -108.00885772705078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8148536682128906, + "rewards_train/margins": 5.8329267501831055, + "rewards_train/rejected": -5.018073081970215, + "step": 1438 + }, + { + "epoch": 1.91, + "logps_train/chosen": -36.332275390625, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -86.35623931884766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2652098536491394, + "rewards_train/margins": 4.152787387371063, + "rewards_train/rejected": -3.887577533721924, + "step": 1439 + }, + { + "epoch": 1.91, + "learning_rate": 2.881987656991491e-07, + "loss": 0.0465, + "step": 1440 + }, + { + "epoch": 1.91, + "logps_train/chosen": -41.27156066894531, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -79.32283020019531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.14121852815151215, + "rewards_train/margins": 3.80200232565403, + "rewards_train/rejected": -3.943220853805542, + "step": 1440 + }, + { + "epoch": 1.91, + "logps_train/chosen": -42.398807525634766, + "logps_train/ref_chosen": -37.5, + "logps_train/ref_rejected": -36.5, + "logps_train/rejected": -76.52741241455078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.49066174030303955, + "rewards_train/margins": 3.5167664289474487, + "rewards_train/rejected": -4.007428169250488, + "step": 1441 + }, + { + "epoch": 1.92, + "learning_rate": 2.8765532343830815e-07, + "loss": 0.0917, + "step": 1442 + }, + { + "epoch": 1.92, + "logps_train/chosen": -84.64006805419922, + "logps_train/ref_chosen": -77.5, + "logps_train/ref_rejected": -108.0, + "logps_train/rejected": -167.8251953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7171317338943481, + "rewards_train/margins": 5.268512606620789, + "rewards_train/rejected": -5.985644340515137, + "step": 1442 + }, + { + "epoch": 1.92, + "logps_train/chosen": -79.37599182128906, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -125.7872085571289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3108382821083069, + "rewards_train/margins": 5.1208091378211975, + "rewards_train/rejected": -4.809970855712891, + "step": 1443 + }, + { + "epoch": 1.92, + "learning_rate": 2.8711169905418714e-07, + "loss": 0.0132, + "step": 1444 + }, + { + "epoch": 1.92, + "logps_train/chosen": -68.53081512451172, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -77.0, + "logps_train/rejected": -125.01752471923828, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.22816842794418335, + "rewards_train/margins": 5.012734353542328, + "rewards_train/rejected": -4.7845659255981445, + "step": 1444 + }, + { + "epoch": 1.92, + "logps_train/chosen": -35.465431213378906, + "logps_train/ref_chosen": -35.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -99.49406433105469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04056597501039505, + "rewards_train/margins": 4.3977847173810005, + "rewards_train/rejected": -4.3572187423706055, + "step": 1445 + }, + { + "epoch": 1.92, + "learning_rate": 2.8656789517607326e-07, + "loss": 0.0286, + "step": 1446 + }, + { + "epoch": 1.92, + "logps_train/chosen": -68.00939178466797, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -48.75, + "logps_train/rejected": -82.16260528564453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2365611493587494, + "rewards_train/margins": 3.5715712010860443, + "rewards_train/rejected": -3.335010051727295, + "step": 1446 + }, + { + "epoch": 1.92, + "logps_train/chosen": -50.444820404052734, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -90.63224792480469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3574712872505188, + "rewards_train/margins": 4.203117668628693, + "rewards_train/rejected": -3.845646381378174, + "step": 1447 + }, + { + "epoch": 1.92, + "learning_rate": 2.860239144341217e-07, + "loss": 0.041, + "step": 1448 + }, + { + "epoch": 1.92, + "logps_train/chosen": -86.54007720947266, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -143.39755249023438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3381800055503845, + "rewards_train/margins": 5.865435063838959, + "rewards_train/rejected": -5.527255058288574, + "step": 1448 + }, + { + "epoch": 1.92, + "logps_train/chosen": -50.87220764160156, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -61.75, + "logps_train/rejected": -99.9171142578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.42527899146080017, + "rewards_train/margins": 4.24980291724205, + "rewards_train/rejected": -3.82452392578125, + "step": 1449 + }, + { + "epoch": 1.93, + "learning_rate": 2.8547975945934317e-07, + "loss": 0.0531, + "step": 1450 + }, + { + "epoch": 1.93, + "logps_train/chosen": -58.34137725830078, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -139.39535522460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.30961209535598755, + "rewards_train/margins": 5.649149239063263, + "rewards_train/rejected": -5.339537143707275, + "step": 1450 + }, + { + "epoch": 1.93, + "logps_train/chosen": -44.89744186401367, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -130.18356323242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.00869344174861908, + "rewards_train/margins": 5.633299097418785, + "rewards_train/rejected": -5.624605655670166, + "step": 1451 + }, + { + "epoch": 1.93, + "learning_rate": 2.8493543288359095e-07, + "loss": 0.013, + "step": 1452 + }, + { + "epoch": 1.93, + "logps_train/chosen": -46.73979568481445, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -41.75, + "logps_train/rejected": -76.02203369140625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.34252452850341797, + "rewards_train/margins": 3.77207088470459, + "rewards_train/rejected": -3.429546356201172, + "step": 1452 + }, + { + "epoch": 1.93, + "logps_train/chosen": -69.0582275390625, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -80.5, + "logps_train/rejected": -136.95826721191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.13949009776115417, + "rewards_train/margins": 5.763442009687424, + "rewards_train/rejected": -5.6239519119262695, + "step": 1453 + }, + { + "epoch": 1.93, + "learning_rate": 2.843909373395484e-07, + "loss": 0.0283, + "step": 1454 + }, + { + "epoch": 1.93, + "logps_train/chosen": -45.812522888183594, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -45.0, + "logps_train/rejected": -89.51861572265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2468721866607666, + "rewards_train/margins": 4.687796354293823, + "rewards_train/rejected": -4.440924167633057, + "step": 1454 + }, + { + "epoch": 1.93, + "logps_train/chosen": -70.66246032714844, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -134.38975524902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5837537050247192, + "rewards_train/margins": 5.7336665391922, + "rewards_train/rejected": -5.1499128341674805, + "step": 1455 + }, + { + "epoch": 1.93, + "learning_rate": 2.838462754607159e-07, + "loss": 0.0389, + "step": 1456 + }, + { + "epoch": 1.93, + "logps_train/chosen": -91.08067321777344, + "logps_train/ref_chosen": -94.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -143.36907958984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2927139401435852, + "rewards_train/margins": 5.610871136188507, + "rewards_train/rejected": -5.318157196044922, + "step": 1456 + }, + { + "epoch": 1.93, + "logps_train/chosen": -61.23676681518555, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -96.6188735961914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.442338764667511, + "rewards_train/margins": 3.5593042969703674, + "rewards_train/rejected": -3.1169655323028564, + "step": 1457 + }, + { + "epoch": 1.94, + "learning_rate": 2.8330144988139884e-07, + "loss": 0.0644, + "step": 1458 + }, + { + "epoch": 1.94, + "logps_train/chosen": -71.24273681640625, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -125.11076354980469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.004241734743118286, + "rewards_train/margins": 5.030943185091019, + "rewards_train/rejected": -5.0267014503479, + "step": 1458 + }, + { + "epoch": 1.94, + "logps_train/chosen": -44.77756881713867, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -98.69107055664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4284931719303131, + "rewards_train/margins": 4.474163204431534, + "rewards_train/rejected": -4.045670032501221, + "step": 1459 + }, + { + "epoch": 1.94, + "learning_rate": 2.8275646323669357e-07, + "loss": 0.0413, + "step": 1460 + }, + { + "epoch": 1.94, + "logps_train/chosen": -54.52716064453125, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -56.25, + "logps_train/rejected": -99.99333190917969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1722475290298462, + "rewards_train/margins": 4.205211043357849, + "rewards_train/rejected": -4.377458572387695, + "step": 1460 + }, + { + "epoch": 1.94, + "logps_train/chosen": -113.48202514648438, + "logps_train/ref_chosen": -120.5, + "logps_train/ref_rejected": -112.5, + "logps_train/rejected": -172.98388671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7252349853515625, + "rewards_train/margins": 6.7533111572265625, + "rewards_train/rejected": -6.028076171875, + "step": 1461 + }, + { + "epoch": 1.94, + "learning_rate": 2.822113181624761e-07, + "loss": 0.053, + "step": 1462 + }, + { + "epoch": 1.94, + "logps_train/chosen": -55.837181091308594, + "logps_train/ref_chosen": -66.0, + "logps_train/ref_rejected": -74.0, + "logps_train/rejected": -113.34998321533203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.0076881647109985, + "rewards_train/margins": 4.925499081611633, + "rewards_train/rejected": -3.9178109169006348, + "step": 1462 + }, + { + "epoch": 1.94, + "logps_train/chosen": -63.88433074951172, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -130.2223663330078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07719208300113678, + "rewards_train/margins": 5.093569055199623, + "rewards_train/rejected": -5.016376972198486, + "step": 1463 + }, + { + "epoch": 1.94, + "learning_rate": 2.8166601729538846e-07, + "loss": 0.015, + "step": 1464 + }, + { + "epoch": 1.94, + "logps_train/chosen": -53.39549255371094, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -101.66741180419922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04795091599225998, + "rewards_train/margins": 3.571723870933056, + "rewards_train/rejected": -3.523772954940796, + "step": 1464 + }, + { + "epoch": 1.95, + "logps_train/chosen": -45.021942138671875, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -52.25, + "logps_train/rejected": -89.75994873046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6481965780258179, + "rewards_train/margins": 4.394503951072693, + "rewards_train/rejected": -3.746307373046875, + "step": 1465 + }, + { + "epoch": 1.95, + "learning_rate": 2.811205632728262e-07, + "loss": 0.064, + "step": 1466 + }, + { + "epoch": 1.95, + "logps_train/chosen": -38.482234954833984, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -44.25, + "logps_train/rejected": -99.0985107421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7193544507026672, + "rewards_train/margins": 6.205767452716827, + "rewards_train/rejected": -5.48641300201416, + "step": 1466 + }, + { + "epoch": 1.95, + "logps_train/chosen": -38.944766998291016, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -45.5, + "logps_train/rejected": -83.85580444335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38364824652671814, + "rewards_train/margins": 4.2301667630672455, + "rewards_train/rejected": -3.8465185165405273, + "step": 1467 + }, + { + "epoch": 1.95, + "learning_rate": 2.805749587329256e-07, + "loss": 0.0198, + "step": 1468 + }, + { + "epoch": 1.95, + "logps_train/chosen": -80.60104370117188, + "logps_train/ref_chosen": -86.5, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -133.9772491455078, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.610989511013031, + "rewards_train/margins": 6.119261682033539, + "rewards_train/rejected": -5.508272171020508, + "step": 1468 + }, + { + "epoch": 1.95, + "logps_train/chosen": -91.69876098632812, + "logps_train/ref_chosen": -105.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -134.55972290039062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.3144994974136353, + "rewards_train/margins": 6.207972168922424, + "rewards_train/rejected": -4.893472671508789, + "step": 1469 + }, + { + "epoch": 1.95, + "learning_rate": 2.800292063145509e-07, + "loss": 0.0241, + "step": 1470 + }, + { + "epoch": 1.95, + "logps_train/chosen": -52.74502182006836, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -104.6856689453125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20518550276756287, + "rewards_train/margins": 4.98312720656395, + "rewards_train/rejected": -4.777941703796387, + "step": 1470 + }, + { + "epoch": 1.95, + "logps_train/chosen": -71.55609130859375, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -116.09407043457031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.007171005010604858, + "rewards_train/margins": 4.297548919916153, + "rewards_train/rejected": -4.304719924926758, + "step": 1471 + }, + { + "epoch": 1.95, + "learning_rate": 2.7948330865728173e-07, + "loss": 0.0603, + "step": 1472 + }, + { + "epoch": 1.95, + "logps_train/chosen": -65.50558471679688, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -63.75, + "logps_train/rejected": -109.54410552978516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.11662900447845459, + "rewards_train/margins": 4.691351771354675, + "rewards_train/rejected": -4.574722766876221, + "step": 1472 + }, + { + "epoch": 1.96, + "logps_train/chosen": -40.497886657714844, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -89.035888671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3642734885215759, + "rewards_train/margins": 3.9506747126579285, + "rewards_train/rejected": -3.5864012241363525, + "step": 1473 + }, + { + "epoch": 1.96, + "learning_rate": 2.789372684014e-07, + "loss": 0.0409, + "step": 1474 + }, + { + "epoch": 1.96, + "logps_train/chosen": -82.15108489990234, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -147.2525177001953, + "rewards_train/accuracies": 0.875, + "rewards_train/chosen": -0.09323322772979736, + "rewards_train/margins": 5.689831614494324, + "rewards_train/rejected": -5.783064842224121, + "step": 1474 + }, + { + "epoch": 1.96, + "logps_train/chosen": -63.462379455566406, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -110.44020080566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4722146987915039, + "rewards_train/margins": 3.590555191040039, + "rewards_train/rejected": -4.062769889831543, + "step": 1475 + }, + { + "epoch": 1.96, + "learning_rate": 2.783910881878774e-07, + "loss": 0.1127, + "step": 1476 + }, + { + "epoch": 1.96, + "logps_train/chosen": -47.33250427246094, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -104.92821502685547, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.35346829891204834, + "rewards_train/margins": 4.177540421485901, + "rewards_train/rejected": -3.8240721225738525, + "step": 1476 + }, + { + "epoch": 1.96, + "logps_train/chosen": -70.73235321044922, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -69.5, + "logps_train/rejected": -113.67223358154297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.043547987937927246, + "rewards_train/margins": 4.37211287021637, + "rewards_train/rejected": -4.415660858154297, + "step": 1477 + }, + { + "epoch": 1.96, + "learning_rate": 2.778447706583625e-07, + "loss": 0.0605, + "step": 1478 + }, + { + "epoch": 1.96, + "logps_train/chosen": -81.16079711914062, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -144.43612670898438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23079533874988556, + "rewards_train/margins": 5.436908200383186, + "rewards_train/rejected": -5.206112861633301, + "step": 1478 + }, + { + "epoch": 1.96, + "logps_train/chosen": -47.40415954589844, + "logps_train/ref_chosen": -47.5, + "logps_train/ref_rejected": -51.25, + "logps_train/rejected": -96.43424224853516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.02130260318517685, + "rewards_train/margins": 4.548321150243282, + "rewards_train/rejected": -4.5270185470581055, + "step": 1479 + }, + { + "epoch": 1.97, + "learning_rate": 2.7729831845516804e-07, + "loss": 0.0251, + "step": 1480 + }, + { + "epoch": 1.97, + "logps_train/chosen": -51.33964538574219, + "logps_train/ref_chosen": -48.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -93.49244689941406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3348585367202759, + "rewards_train/margins": 3.467511296272278, + "rewards_train/rejected": -3.8023698329925537, + "step": 1480 + }, + { + "epoch": 1.97, + "logps_train/chosen": -82.95388793945312, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -97.5, + "logps_train/rejected": -149.5021209716797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.41711124777793884, + "rewards_train/margins": 5.609510749578476, + "rewards_train/rejected": -5.192399501800537, + "step": 1481 + }, + { + "epoch": 1.97, + "learning_rate": 2.7675173422125806e-07, + "loss": 0.0483, + "step": 1482 + }, + { + "epoch": 1.97, + "logps_train/chosen": -69.3114013671875, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -104.64030456542969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.43895331025123596, + "rewards_train/margins": 4.186014264822006, + "rewards_train/rejected": -4.624967575073242, + "step": 1482 + }, + { + "epoch": 1.97, + "logps_train/chosen": -67.50462341308594, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -161.12814331054688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38860011100769043, + "rewards_train/margins": 6.9529759883880615, + "rewards_train/rejected": -6.564375877380371, + "step": 1483 + }, + { + "epoch": 1.97, + "learning_rate": 2.7620502060023534e-07, + "loss": 0.0262, + "step": 1484 + }, + { + "epoch": 1.97, + "logps_train/chosen": -65.65058135986328, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -117.5969009399414, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5271295309066772, + "rewards_train/margins": 5.218069434165955, + "rewards_train/rejected": -4.690939903259277, + "step": 1484 + }, + { + "epoch": 1.97, + "logps_train/chosen": -68.64362335205078, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -127.66253662109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.571393609046936, + "rewards_train/margins": 4.962047457695007, + "rewards_train/rejected": -5.533441066741943, + "step": 1485 + }, + { + "epoch": 1.97, + "learning_rate": 2.756581802363282e-07, + "loss": 0.021, + "step": 1486 + }, + { + "epoch": 1.97, + "logps_train/chosen": -72.97940063476562, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -79.0, + "logps_train/rejected": -140.24441528320312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.11356444656848907, + "rewards_train/margins": 6.007751986384392, + "rewards_train/rejected": -6.121316432952881, + "step": 1486 + }, + { + "epoch": 1.97, + "logps_train/chosen": -68.36685180664062, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -117.40311431884766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.30777865648269653, + "rewards_train/margins": 4.398157775402069, + "rewards_train/rejected": -4.705936431884766, + "step": 1487 + }, + { + "epoch": 1.98, + "learning_rate": 2.751112157743782e-07, + "loss": 0.0245, + "step": 1488 + }, + { + "epoch": 1.98, + "logps_train/chosen": -64.34452819824219, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -101.63462829589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4999223053455353, + "rewards_train/margins": 4.169634610414505, + "rewards_train/rejected": -3.6697123050689697, + "step": 1488 + }, + { + "epoch": 1.98, + "logps_train/chosen": -67.4752426147461, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -113.42302703857422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4443991184234619, + "rewards_train/margins": 4.4260289669036865, + "rewards_train/rejected": -4.870428085327148, + "step": 1489 + }, + { + "epoch": 1.98, + "learning_rate": 2.74564129859827e-07, + "loss": 0.0544, + "step": 1490 + }, + { + "epoch": 1.98, + "logps_train/chosen": -36.124183654785156, + "logps_train/ref_chosen": -39.5, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -84.19427490234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.33387088775634766, + "rewards_train/margins": 4.070876598358154, + "rewards_train/rejected": -3.7370057106018066, + "step": 1490 + }, + { + "epoch": 1.98, + "logps_train/chosen": -54.76097869873047, + "logps_train/ref_chosen": -56.0, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -139.83721923828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1317143440246582, + "rewards_train/margins": 5.313873291015625, + "rewards_train/rejected": -5.182158946990967, + "step": 1491 + }, + { + "epoch": 1.98, + "learning_rate": 2.7401692513870374e-07, + "loss": 0.0814, + "step": 1492 + }, + { + "epoch": 1.98, + "logps_train/chosen": -51.678688049316406, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -103.94388580322266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8008812665939331, + "rewards_train/margins": 4.192144751548767, + "rewards_train/rejected": -3.391263484954834, + "step": 1492 + }, + { + "epoch": 1.98, + "logps_train/chosen": -52.904869079589844, + "logps_train/ref_chosen": -55.0, + "logps_train/ref_rejected": -70.5, + "logps_train/rejected": -110.48863220214844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20795035362243652, + "rewards_train/margins": 4.189625978469849, + "rewards_train/rejected": -3.981675624847412, + "step": 1493 + }, + { + "epoch": 1.98, + "learning_rate": 2.7346960425761196e-07, + "loss": 0.0263, + "step": 1494 + }, + { + "epoch": 1.98, + "logps_train/chosen": -65.22968292236328, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -141.08441162109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32390648126602173, + "rewards_train/margins": 5.43547397851944, + "rewards_train/rejected": -5.111567497253418, + "step": 1494 + }, + { + "epoch": 1.99, + "logps_train/chosen": -54.40276336669922, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -88.49136352539062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.20816144347190857, + "rewards_train/margins": 3.5096417367458344, + "rewards_train/rejected": -3.301480293273926, + "step": 1495 + }, + { + "epoch": 1.99, + "learning_rate": 2.7292216986371724e-07, + "loss": 0.0567, + "step": 1496 + }, + { + "epoch": 1.99, + "logps_train/chosen": -46.14875030517578, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -89.08484649658203, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12034371495246887, + "rewards_train/margins": 3.617828816175461, + "rewards_train/rejected": -3.7381725311279297, + "step": 1496 + }, + { + "epoch": 1.99, + "logps_train/chosen": -60.46788787841797, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -67.0, + "logps_train/rejected": -106.10638427734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1395392417907715, + "rewards_train/margins": 5.034552335739136, + "rewards_train/rejected": -3.8950130939483643, + "step": 1497 + }, + { + "epoch": 1.99, + "learning_rate": 2.723746246047343e-07, + "loss": 0.057, + "step": 1498 + }, + { + "epoch": 1.99, + "logps_train/chosen": -79.06304168701172, + "logps_train/ref_chosen": -80.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -129.55767822265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1530708372592926, + "rewards_train/margins": 5.021338850259781, + "rewards_train/rejected": -4.868268013000488, + "step": 1498 + }, + { + "epoch": 1.99, + "logps_train/chosen": -72.53982543945312, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -76.5, + "logps_train/rejected": -123.5213623046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9668728709220886, + "rewards_train/margins": 3.7411234974861145, + "rewards_train/rejected": -4.707996368408203, + "step": 1499 + }, + { + "epoch": 1.99, + "learning_rate": 2.718269711289137e-07, + "loss": 0.0348, + "step": 1500 + }, + { + "epoch": 1.99, + "logps_train/chosen": -66.54116821289062, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -95.76277160644531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1947413980960846, + "rewards_train/margins": 4.04716095328331, + "rewards_train/rejected": -4.2419023513793945, + "step": 1500 + }, + { + "epoch": 1.99, + "logps_train/chosen": -92.96195983886719, + "logps_train/ref_chosen": -91.0, + "logps_train/ref_rejected": -96.5, + "logps_train/rejected": -152.18441772460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.20557141304016113, + "rewards_train/margins": 5.351933240890503, + "rewards_train/rejected": -5.557504653930664, + "step": 1501 + }, + { + "epoch": 1.99, + "learning_rate": 2.712792120850297e-07, + "loss": 0.0568, + "step": 1502 + }, + { + "epoch": 1.99, + "logps_train/chosen": -41.53184509277344, + "logps_train/ref_chosen": -41.0, + "logps_train/ref_rejected": -43.5, + "logps_train/rejected": -86.88002014160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.060996681451797485, + "rewards_train/margins": 4.282082885503769, + "rewards_train/rejected": -4.343079566955566, + "step": 1502 + }, + { + "epoch": 2.0, + "logps_train/chosen": -64.42831420898438, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -117.857421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1881433129310608, + "rewards_train/margins": 4.031974732875824, + "rewards_train/rejected": -4.220118045806885, + "step": 1503 + }, + { + "epoch": 2.0, + "learning_rate": 2.7073135012236684e-07, + "loss": 0.061, + "step": 1504 + }, + { + "epoch": 2.0, + "logps_train/chosen": -54.92771530151367, + "logps_train/ref_chosen": -48.25, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -93.7679214477539, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6708967089653015, + "rewards_train/margins": 4.118395626544952, + "rewards_train/rejected": -4.789292335510254, + "step": 1504 + }, + { + "epoch": 2.0, + "logps_train/chosen": -73.65581512451172, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -137.98806762695312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.07261277735233307, + "rewards_train/margins": 5.2761950343847275, + "rewards_train/rejected": -5.3488078117370605, + "step": 1505 + }, + { + "epoch": 2.0, + "learning_rate": 2.7018338789070793e-07, + "loss": 0.036, + "step": 1506 + }, + { + "epoch": 2.0, + "logps_train/chosen": -80.58269500732422, + "logps_train/ref_chosen": -83.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -146.76889038085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2971992492675781, + "rewards_train/margins": 5.706118583679199, + "rewards_train/rejected": -5.408919334411621, + "step": 1506 + }, + { + "epoch": 2.0, + "logps_train/chosen": -98.68017578125, + "logps_train/ref_chosen": -92.0, + "logps_train/ref_rejected": -105.0, + "logps_train/rejected": -162.65444946289062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6664549112319946, + "rewards_train/margins": 5.113051772117615, + "rewards_train/rejected": -5.779506683349609, + "step": 1507 + }, + { + "epoch": 2.0, + "learning_rate": 2.6963532804032027e-07, + "loss": 0.0147, + "step": 1508 + }, + { + "epoch": 2.0, + "logps_train/chosen": -91.02877044677734, + "logps_train/ref_chosen": -89.5, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -167.73997497558594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.13412678241729736, + "rewards_train/margins": 7.06799590587616, + "rewards_train/rejected": -7.202122688293457, + "step": 1508 + }, + { + "epoch": 2.0, + "logps_train/chosen": -70.12113952636719, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -66.5, + "logps_train/rejected": -120.04388427734375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.17617684602737427, + "rewards_train/margins": 5.170399367809296, + "rewards_train/rejected": -5.34657621383667, + "step": 1509 + }, + { + "epoch": 2.01, + "learning_rate": 2.690871732219435e-07, + "loss": 0.01, + "step": 1510 + }, + { + "epoch": 2.01, + "logps_train/chosen": -57.403995513916016, + "logps_train/ref_chosen": -61.25, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -91.07113647460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.38928788900375366, + "rewards_train/margins": 3.885464608669281, + "rewards_train/rejected": -3.4961767196655273, + "step": 1510 + }, + { + "epoch": 2.01, + "logps_train/chosen": -65.37919616699219, + "logps_train/ref_chosen": -67.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -119.10092163085938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.14645498991012573, + "rewards_train/margins": 5.4409219622612, + "rewards_train/rejected": -5.294466972351074, + "step": 1511 + }, + { + "epoch": 2.01, + "learning_rate": 2.685389260867765e-07, + "loss": 0.0234, + "step": 1512 + }, + { + "epoch": 2.01, + "logps_train/chosen": -63.641632080078125, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -124.69804382324219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1782255619764328, + "rewards_train/margins": 4.980641320347786, + "rewards_train/rejected": -5.158866882324219, + "step": 1512 + }, + { + "epoch": 2.01, + "logps_train/chosen": -92.23426818847656, + "logps_train/ref_chosen": -100.0, + "logps_train/ref_rejected": -90.5, + "logps_train/rejected": -135.4663543701172, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7953237891197205, + "rewards_train/margins": 5.276334226131439, + "rewards_train/rejected": -4.481010437011719, + "step": 1513 + }, + { + "epoch": 2.01, + "learning_rate": 2.6799058928646477e-07, + "loss": 0.0138, + "step": 1514 + }, + { + "epoch": 2.01, + "logps_train/chosen": -77.5556640625, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -122.76179504394531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5708003640174866, + "rewards_train/margins": 4.214753568172455, + "rewards_train/rejected": -4.785553932189941, + "step": 1514 + }, + { + "epoch": 2.01, + "logps_train/chosen": -59.10913848876953, + "logps_train/ref_chosen": -55.75, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -105.50821685791016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3296639919281006, + "rewards_train/margins": 4.58209490776062, + "rewards_train/rejected": -4.911758899688721, + "step": 1515 + }, + { + "epoch": 2.01, + "learning_rate": 2.6744216547308747e-07, + "loss": 0.0169, + "step": 1516 + }, + { + "epoch": 2.01, + "logps_train/chosen": -48.72401428222656, + "logps_train/ref_chosen": -50.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -95.6735610961914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.11744228005409241, + "rewards_train/margins": 3.8668293058872223, + "rewards_train/rejected": -3.74938702583313, + "step": 1516 + }, + { + "epoch": 2.01, + "logps_train/chosen": -53.20874786376953, + "logps_train/ref_chosen": -49.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -106.03361511230469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3497812747955322, + "rewards_train/margins": 4.195768117904663, + "rewards_train/rejected": -4.545549392700195, + "step": 1517 + }, + { + "epoch": 2.02, + "learning_rate": 2.668936572991444e-07, + "loss": 0.0483, + "step": 1518 + }, + { + "epoch": 2.02, + "logps_train/chosen": -59.403358459472656, + "logps_train/ref_chosen": -58.25, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -110.39976501464844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12471073865890503, + "rewards_train/margins": 4.123078644275665, + "rewards_train/rejected": -4.24778938293457, + "step": 1518 + }, + { + "epoch": 2.02, + "logps_train/chosen": -49.50419616699219, + "logps_train/ref_chosen": -50.5, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -103.47373962402344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.08708065748214722, + "rewards_train/margins": 4.662580192089081, + "rewards_train/rejected": -4.575499534606934, + "step": 1519 + }, + { + "epoch": 2.02, + "learning_rate": 2.663450674175437e-07, + "loss": 0.0348, + "step": 1520 + }, + { + "epoch": 2.02, + "logps_train/chosen": -79.5920181274414, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -85.5, + "logps_train/rejected": -134.07373046875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.16857658326625824, + "rewards_train/margins": 4.68879522383213, + "rewards_train/rejected": -4.857371807098389, + "step": 1520 + }, + { + "epoch": 2.02, + "logps_train/chosen": -49.621028900146484, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -50.25, + "logps_train/rejected": -107.77963256835938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5261653661727905, + "rewards_train/margins": 5.2244545221328735, + "rewards_train/rejected": -5.750619888305664, + "step": 1521 + }, + { + "epoch": 2.02, + "learning_rate": 2.657963984815885e-07, + "loss": 0.0372, + "step": 1522 + }, + { + "epoch": 2.02, + "logps_train/chosen": -44.043548583984375, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -81.99150848388672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.026895053684711456, + "rewards_train/margins": 3.4518268182873726, + "rewards_train/rejected": -3.424931764602661, + "step": 1522 + }, + { + "epoch": 2.02, + "logps_train/chosen": -34.81300735473633, + "logps_train/ref_chosen": -40.25, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -84.9473876953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5499492883682251, + "rewards_train/margins": 4.390390753746033, + "rewards_train/rejected": -3.8404414653778076, + "step": 1523 + }, + { + "epoch": 2.02, + "learning_rate": 2.6524765314496416e-07, + "loss": 0.0793, + "step": 1524 + }, + { + "epoch": 2.02, + "logps_train/chosen": -49.01211929321289, + "logps_train/ref_chosen": -44.0, + "logps_train/ref_rejected": -41.5, + "logps_train/rejected": -76.61344909667969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5012120008468628, + "rewards_train/margins": 3.0093518495559692, + "rewards_train/rejected": -3.510563850402832, + "step": 1524 + }, + { + "epoch": 2.03, + "logps_train/chosen": -51.36775207519531, + "logps_train/ref_chosen": -52.75, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -104.84258270263672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.13822507858276367, + "rewards_train/margins": 4.406858444213867, + "rewards_train/rejected": -4.2686333656311035, + "step": 1525 + }, + { + "epoch": 2.03, + "learning_rate": 2.646988340617258e-07, + "loss": 0.0973, + "step": 1526 + }, + { + "epoch": 2.03, + "logps_train/chosen": -65.84739685058594, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -58.25, + "logps_train/rejected": -97.88469696044922, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4402603805065155, + "rewards_train/margins": 4.411542862653732, + "rewards_train/rejected": -3.971282482147217, + "step": 1526 + }, + { + "epoch": 2.03, + "logps_train/chosen": -84.15038299560547, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -89.0, + "logps_train/rejected": -128.93423461914062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.414649099111557, + "rewards_train/margins": 4.419790834188461, + "rewards_train/rejected": -4.005141735076904, + "step": 1527 + }, + { + "epoch": 2.03, + "learning_rate": 2.6414994388628525e-07, + "loss": 0.0265, + "step": 1528 + }, + { + "epoch": 2.03, + "logps_train/chosen": -64.61827087402344, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -133.75851440429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3416889011859894, + "rewards_train/margins": 4.995665341615677, + "rewards_train/rejected": -4.6539764404296875, + "step": 1528 + }, + { + "epoch": 2.03, + "logps_train/chosen": -74.78620910644531, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -91.0, + "logps_train/rejected": -152.0070343017578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5184650421142578, + "rewards_train/margins": 5.588488578796387, + "rewards_train/rejected": -6.1069536209106445, + "step": 1529 + }, + { + "epoch": 2.03, + "learning_rate": 2.636009852733979e-07, + "loss": 0.0469, + "step": 1530 + }, + { + "epoch": 2.03, + "logps_train/chosen": -37.29755783081055, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -82.67414855957031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.020380735397338867, + "rewards_train/margins": 4.0360963344573975, + "rewards_train/rejected": -4.056477069854736, + "step": 1530 + }, + { + "epoch": 2.03, + "logps_train/chosen": -38.68263244628906, + "logps_train/ref_chosen": -37.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -79.23599243164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.16435717046260834, + "rewards_train/margins": 3.1619768291711807, + "rewards_train/rejected": -3.326333999633789, + "step": 1531 + }, + { + "epoch": 2.03, + "learning_rate": 2.630519608781505e-07, + "loss": 0.061, + "step": 1532 + }, + { + "epoch": 2.03, + "logps_train/chosen": -64.52886199951172, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -70.0, + "logps_train/rejected": -114.98080444335938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2807074785232544, + "rewards_train/margins": 4.763163685798645, + "rewards_train/rejected": -4.482456207275391, + "step": 1532 + }, + { + "epoch": 2.04, + "logps_train/chosen": -61.22148895263672, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.5, + "logps_train/rejected": -114.10621643066406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3091011941432953, + "rewards_train/margins": 4.479097455739975, + "rewards_train/rejected": -4.16999626159668, + "step": 1533 + }, + { + "epoch": 2.04, + "learning_rate": 2.6250287335594746e-07, + "loss": 0.052, + "step": 1534 + }, + { + "epoch": 2.04, + "logps_train/chosen": -49.084747314453125, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -114.23832702636719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.40613073110580444, + "rewards_train/margins": 4.219265043735504, + "rewards_train/rejected": -4.625395774841309, + "step": 1534 + }, + { + "epoch": 2.04, + "logps_train/chosen": -52.24105453491211, + "logps_train/ref_chosen": -57.75, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -84.32142639160156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5555819869041443, + "rewards_train/margins": 4.219756543636322, + "rewards_train/rejected": -3.6641745567321777, + "step": 1535 + }, + { + "epoch": 2.04, + "learning_rate": 2.6195372536249913e-07, + "loss": 0.0416, + "step": 1536 + }, + { + "epoch": 2.04, + "logps_train/chosen": -77.5198974609375, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -87.0, + "logps_train/rejected": -134.8009796142578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.25582292675971985, + "rewards_train/margins": 5.023420602083206, + "rewards_train/rejected": -4.767597675323486, + "step": 1536 + }, + { + "epoch": 2.04, + "logps_train/chosen": -43.38502502441406, + "logps_train/ref_chosen": -45.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -93.40028381347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18229836225509644, + "rewards_train/margins": 4.181310832500458, + "rewards_train/rejected": -3.9990124702453613, + "step": 1537 + }, + { + "epoch": 2.04, + "learning_rate": 2.614045195538078e-07, + "loss": 0.0602, + "step": 1538 + }, + { + "epoch": 2.04, + "logps_train/chosen": -69.01332092285156, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -80.0, + "logps_train/rejected": -118.67607116699219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.23648770153522491, + "rewards_train/margins": 3.6068999022245407, + "rewards_train/rejected": -3.8433876037597656, + "step": 1538 + }, + { + "epoch": 2.04, + "logps_train/chosen": -61.692901611328125, + "logps_train/ref_chosen": -70.5, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -98.7035903930664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8666471242904663, + "rewards_train/margins": 4.11669385433197, + "rewards_train/rejected": -3.250046730041504, + "step": 1539 + }, + { + "epoch": 2.05, + "learning_rate": 2.608552585861559e-07, + "loss": 0.0606, + "step": 1540 + }, + { + "epoch": 2.05, + "logps_train/chosen": -42.5904541015625, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -81.87644958496094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.006774693727493286, + "rewards_train/margins": 3.491294652223587, + "rewards_train/rejected": -3.4845199584960938, + "step": 1540 + }, + { + "epoch": 2.05, + "logps_train/chosen": -55.27981185913086, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -93.18289184570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.033737726509571075, + "rewards_train/margins": 4.031714983284473, + "rewards_train/rejected": -3.9979772567749023, + "step": 1541 + }, + { + "epoch": 2.05, + "learning_rate": 2.6030594511609194e-07, + "loss": 0.0557, + "step": 1542 + }, + { + "epoch": 2.05, + "logps_train/chosen": -40.53589630126953, + "logps_train/ref_chosen": -34.0, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -100.51864624023438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.650855302810669, + "rewards_train/margins": 3.946321725845337, + "rewards_train/rejected": -4.597177028656006, + "step": 1542 + }, + { + "epoch": 2.05, + "logps_train/chosen": -89.83085632324219, + "logps_train/ref_chosen": -84.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -113.07759094238281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5619921088218689, + "rewards_train/margins": 3.563540995121002, + "rewards_train/rejected": -4.125533103942871, + "step": 1543 + }, + { + "epoch": 2.05, + "learning_rate": 2.5975658180041917e-07, + "loss": 0.0726, + "step": 1544 + }, + { + "epoch": 2.05, + "logps_train/chosen": -52.169281005859375, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -99.7624282836914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15364661812782288, + "rewards_train/margins": 4.677283614873886, + "rewards_train/rejected": -4.830930233001709, + "step": 1544 + }, + { + "epoch": 2.05, + "logps_train/chosen": -56.277809143066406, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -44.0, + "logps_train/rejected": -89.40313720703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4746555984020233, + "rewards_train/margins": 4.076595813035965, + "rewards_train/rejected": -4.551251411437988, + "step": 1545 + }, + { + "epoch": 2.05, + "learning_rate": 2.592071712961813e-07, + "loss": 0.0489, + "step": 1546 + }, + { + "epoch": 2.05, + "logps_train/chosen": -61.126136779785156, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -103.38435363769531, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.16863611340522766, + "rewards_train/margins": 4.711758404970169, + "rewards_train/rejected": -4.543122291564941, + "step": 1546 + }, + { + "epoch": 2.05, + "logps_train/chosen": -96.93534851074219, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -84.5, + "logps_train/rejected": -133.3983154296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.390409380197525, + "rewards_train/margins": 4.488484472036362, + "rewards_train/rejected": -4.878893852233887, + "step": 1547 + }, + { + "epoch": 2.06, + "learning_rate": 2.586577162606506e-07, + "loss": 0.0174, + "step": 1548 + }, + { + "epoch": 2.06, + "logps_train/chosen": -66.61924743652344, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -118.21217346191406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18230153620243073, + "rewards_train/margins": 4.461330905556679, + "rewards_train/rejected": -4.279029369354248, + "step": 1548 + }, + { + "epoch": 2.06, + "logps_train/chosen": -42.583744049072266, + "logps_train/ref_chosen": -38.75, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -89.92491149902344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.39314013719558716, + "rewards_train/margins": 3.598178803920746, + "rewards_train/rejected": -3.991318941116333, + "step": 1549 + }, + { + "epoch": 2.06, + "learning_rate": 2.5810821935131456e-07, + "loss": 0.0465, + "step": 1550 + }, + { + "epoch": 2.06, + "logps_train/chosen": -93.02580261230469, + "logps_train/ref_chosen": -85.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -159.73475646972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.768204927444458, + "rewards_train/margins": 5.930270433425903, + "rewards_train/rejected": -6.698475360870361, + "step": 1550 + }, + { + "epoch": 2.06, + "logps_train/chosen": -46.9946403503418, + "logps_train/ref_chosen": -46.75, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -84.75387573242188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.022901445627212524, + "rewards_train/margins": 3.3661580979824066, + "rewards_train/rejected": -3.389059543609619, + "step": 1551 + }, + { + "epoch": 2.06, + "learning_rate": 2.5755868322586327e-07, + "loss": 0.0323, + "step": 1552 + }, + { + "epoch": 2.06, + "logps_train/chosen": -42.54875946044922, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -40.25, + "logps_train/rejected": -67.339111328125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6794989109039307, + "rewards_train/margins": 3.3829407691955566, + "rewards_train/rejected": -2.703441858291626, + "step": 1552 + }, + { + "epoch": 2.06, + "logps_train/chosen": -83.93661499023438, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -108.704833984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2842859923839569, + "rewards_train/margins": 4.060025244951248, + "rewards_train/rejected": -4.344311237335205, + "step": 1553 + }, + { + "epoch": 2.06, + "learning_rate": 2.570091105421765e-07, + "loss": 0.0599, + "step": 1554 + }, + { + "epoch": 2.06, + "logps_train/chosen": -66.35104370117188, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -87.71920776367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.13833293318748474, + "rewards_train/margins": 3.4032226502895355, + "rewards_train/rejected": -3.264889717102051, + "step": 1554 + }, + { + "epoch": 2.07, + "logps_train/chosen": -71.66230773925781, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -126.78219604492188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.024824365973472595, + "rewards_train/margins": 4.540894761681557, + "rewards_train/rejected": -4.565719127655029, + "step": 1555 + }, + { + "epoch": 2.07, + "learning_rate": 2.564595039583109e-07, + "loss": 0.0441, + "step": 1556 + }, + { + "epoch": 2.07, + "logps_train/chosen": -73.87794494628906, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -92.5, + "logps_train/rejected": -147.35125732421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2747054100036621, + "rewards_train/margins": 5.7410807609558105, + "rewards_train/rejected": -5.466375350952148, + "step": 1556 + }, + { + "epoch": 2.07, + "logps_train/chosen": -22.887451171875, + "logps_train/ref_chosen": -25.0, + "logps_train/ref_rejected": -31.5, + "logps_train/rejected": -70.28204345703125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.21164549887180328, + "rewards_train/margins": 4.090435400605202, + "rewards_train/rejected": -3.8787899017333984, + "step": 1557 + }, + { + "epoch": 2.07, + "learning_rate": 2.559098661324868e-07, + "loss": 0.0318, + "step": 1558 + }, + { + "epoch": 2.07, + "logps_train/chosen": -85.10887145996094, + "logps_train/ref_chosen": -81.5, + "logps_train/ref_rejected": -88.0, + "logps_train/rejected": -139.03176879882812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3476055860519409, + "rewards_train/margins": 4.742290139198303, + "rewards_train/rejected": -5.089895725250244, + "step": 1558 + }, + { + "epoch": 2.07, + "logps_train/chosen": -73.78115844726562, + "logps_train/ref_chosen": -69.5, + "logps_train/ref_rejected": -85.0, + "logps_train/rejected": -144.37875366210938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4521392285823822, + "rewards_train/margins": 5.4622980654239655, + "rewards_train/rejected": -5.914437294006348, + "step": 1559 + }, + { + "epoch": 2.07, + "learning_rate": 2.55360199723076e-07, + "loss": 0.0374, + "step": 1560 + }, + { + "epoch": 2.07, + "logps_train/chosen": -36.88570785522461, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -84.48214721679688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.31533533334732056, + "rewards_train/margins": 4.468383848667145, + "rewards_train/rejected": -4.153048515319824, + "step": 1560 + }, + { + "epoch": 2.07, + "logps_train/chosen": -54.943965911865234, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -71.0, + "logps_train/rejected": -118.80177307128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.01080290973186493, + "rewards_train/margins": 4.7553113251924515, + "rewards_train/rejected": -4.766114234924316, + "step": 1561 + }, + { + "epoch": 2.07, + "learning_rate": 2.5481050738858836e-07, + "loss": 0.0241, + "step": 1562 + }, + { + "epoch": 2.07, + "logps_train/chosen": -106.90838623046875, + "logps_train/ref_chosen": -111.0, + "logps_train/ref_rejected": -81.0, + "logps_train/rejected": -130.95278930664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3919735848903656, + "rewards_train/margins": 5.3810034692287445, + "rewards_train/rejected": -4.989029884338379, + "step": 1562 + }, + { + "epoch": 2.08, + "logps_train/chosen": -69.67191314697266, + "logps_train/ref_chosen": -68.0, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -122.857421875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15078496932983398, + "rewards_train/margins": 4.575387001037598, + "rewards_train/rejected": -4.726171970367432, + "step": 1563 + }, + { + "epoch": 2.08, + "learning_rate": 2.54260791787659e-07, + "loss": 0.0571, + "step": 1564 + }, + { + "epoch": 2.08, + "logps_train/chosen": -85.62047576904297, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -128.25270080566406, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.44517719745635986, + "rewards_train/margins": 4.138296008110046, + "rewards_train/rejected": -4.583473205566406, + "step": 1564 + }, + { + "epoch": 2.08, + "logps_train/chosen": -60.68268966674805, + "logps_train/ref_chosen": -61.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -104.92230224609375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07235600054264069, + "rewards_train/margins": 4.592711225152016, + "rewards_train/rejected": -4.520355224609375, + "step": 1565 + }, + { + "epoch": 2.08, + "learning_rate": 2.5371105557903593e-07, + "loss": 0.0471, + "step": 1566 + }, + { + "epoch": 2.08, + "logps_train/chosen": -55.982749938964844, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -94.38992309570312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.13811860978603363, + "rewards_train/margins": 3.8003846555948257, + "rewards_train/rejected": -3.9385032653808594, + "step": 1566 + }, + { + "epoch": 2.08, + "logps_train/chosen": -52.22447204589844, + "logps_train/ref_chosen": -57.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -105.70120239257812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4730607867240906, + "rewards_train/margins": 4.500310599803925, + "rewards_train/rejected": -4.027249813079834, + "step": 1567 + }, + { + "epoch": 2.08, + "learning_rate": 2.531613014215665e-07, + "loss": 0.0341, + "step": 1568 + }, + { + "epoch": 2.08, + "logps_train/chosen": -56.40673828125, + "logps_train/ref_chosen": -58.75, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -103.34124755859375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.24213893711566925, + "rewards_train/margins": 4.310638502240181, + "rewards_train/rejected": -4.068499565124512, + "step": 1568 + }, + { + "epoch": 2.08, + "logps_train/chosen": -44.017494201660156, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -100.64871215820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.10370241105556488, + "rewards_train/margins": 4.248668804764748, + "rewards_train/rejected": -4.3523712158203125, + "step": 1569 + }, + { + "epoch": 2.08, + "learning_rate": 2.52611531974185e-07, + "loss": 0.0451, + "step": 1570 + }, + { + "epoch": 2.08, + "logps_train/chosen": -76.9195785522461, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -76.0, + "logps_train/rejected": -117.16419219970703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.583042323589325, + "rewards_train/margins": 4.674461543560028, + "rewards_train/rejected": -4.091419219970703, + "step": 1570 + }, + { + "epoch": 2.09, + "logps_train/chosen": -47.77491760253906, + "logps_train/ref_chosen": -44.25, + "logps_train/ref_rejected": -57.75, + "logps_train/rejected": -103.7301025390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3614758849143982, + "rewards_train/margins": 4.233408987522125, + "rewards_train/rejected": -4.594884872436523, + "step": 1571 + }, + { + "epoch": 2.09, + "learning_rate": 2.520617498958997e-07, + "loss": 0.0374, + "step": 1572 + }, + { + "epoch": 2.09, + "logps_train/chosen": -75.93413543701172, + "logps_train/ref_chosen": -71.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -109.85200500488281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.46216312050819397, + "rewards_train/margins": 3.5996006429195404, + "rewards_train/rejected": -4.061763763427734, + "step": 1572 + }, + { + "epoch": 2.09, + "logps_train/chosen": -45.1778450012207, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -50.0, + "logps_train/rejected": -89.40332794189453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.025965616106987, + "rewards_train/margins": 3.9709860533475876, + "rewards_train/rejected": -3.9450204372406006, + "step": 1573 + }, + { + "epoch": 2.09, + "learning_rate": 2.515119578457799e-07, + "loss": 0.0676, + "step": 1574 + }, + { + "epoch": 2.09, + "logps_train/chosen": -54.127647399902344, + "logps_train/ref_chosen": -51.0, + "logps_train/ref_rejected": -51.5, + "logps_train/rejected": -98.14857482910156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3127646744251251, + "rewards_train/margins": 4.339593380689621, + "rewards_train/rejected": -4.652358055114746, + "step": 1574 + }, + { + "epoch": 2.09, + "logps_train/chosen": -76.50881958007812, + "logps_train/ref_chosen": -78.5, + "logps_train/ref_rejected": -73.5, + "logps_train/rejected": -113.28398132324219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.202243372797966, + "rewards_train/margins": 4.175954148173332, + "rewards_train/rejected": -3.973710775375366, + "step": 1575 + }, + { + "epoch": 2.09, + "learning_rate": 2.5096215848294305e-07, + "loss": 0.0214, + "step": 1576 + }, + { + "epoch": 2.09, + "logps_train/chosen": -96.40092468261719, + "logps_train/ref_chosen": -90.0, + "logps_train/ref_rejected": -89.5, + "logps_train/rejected": -141.45005798339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6158736944198608, + "rewards_train/margins": 4.559601426124573, + "rewards_train/rejected": -5.175475120544434, + "step": 1576 + }, + { + "epoch": 2.09, + "logps_train/chosen": -55.775482177734375, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -69.0, + "logps_train/rejected": -109.15546417236328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6755768060684204, + "rewards_train/margins": 4.687997937202454, + "rewards_train/rejected": -4.012421131134033, + "step": 1577 + }, + { + "epoch": 2.1, + "learning_rate": 2.504123544665423e-07, + "loss": 0.0684, + "step": 1578 + }, + { + "epoch": 2.1, + "logps_train/chosen": -68.91365051269531, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -116.5, + "logps_train/rejected": -171.9080047607422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3164474368095398, + "rewards_train/margins": 5.86662358045578, + "rewards_train/rejected": -5.55017614364624, + "step": 1578 + }, + { + "epoch": 2.1, + "logps_train/chosen": -43.86370849609375, + "logps_train/ref_chosen": -45.0, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -86.76457214355469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.11284802854061127, + "rewards_train/margins": 4.461180433630943, + "rewards_train/rejected": -4.348332405090332, + "step": 1579 + }, + { + "epoch": 2.1, + "learning_rate": 2.498625484557529e-07, + "loss": 0.0259, + "step": 1580 + }, + { + "epoch": 2.1, + "logps_train/chosen": -46.797454833984375, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -47.5, + "logps_train/rejected": -82.36903381347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.545254111289978, + "rewards_train/margins": 4.027469515800476, + "rewards_train/rejected": -3.482215404510498, + "step": 1580 + }, + { + "epoch": 2.1, + "logps_train/chosen": -53.979408264160156, + "logps_train/ref_chosen": -53.25, + "logps_train/ref_rejected": -40.0, + "logps_train/rejected": -74.20802307128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.06512857228517532, + "rewards_train/margins": 3.358017675578594, + "rewards_train/rejected": -3.4231462478637695, + "step": 1581 + }, + { + "epoch": 2.1, + "learning_rate": 2.4931274310975996e-07, + "loss": 0.0775, + "step": 1582 + }, + { + "epoch": 2.1, + "logps_train/chosen": -64.98932647705078, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -111.77056884765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.02919236570596695, + "rewards_train/margins": 4.954686559736729, + "rewards_train/rejected": -4.925494194030762, + "step": 1582 + }, + { + "epoch": 2.1, + "logps_train/chosen": -94.63760375976562, + "logps_train/ref_chosen": -87.0, + "logps_train/ref_rejected": -114.0, + "logps_train/rejected": -172.94630432128906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.769033670425415, + "rewards_train/margins": 5.139659643173218, + "rewards_train/rejected": -5.908693313598633, + "step": 1583 + }, + { + "epoch": 2.1, + "learning_rate": 2.487629410877453e-07, + "loss": 0.027, + "step": 1584 + }, + { + "epoch": 2.1, + "logps_train/chosen": -35.84782791137695, + "logps_train/ref_chosen": -38.25, + "logps_train/ref_rejected": -45.25, + "logps_train/rejected": -82.56978607177734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.24802957475185394, + "rewards_train/margins": 3.9909456819295883, + "rewards_train/rejected": -3.7429161071777344, + "step": 1584 + }, + { + "epoch": 2.1, + "logps_train/chosen": -41.21533966064453, + "logps_train/ref_chosen": -47.0, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -84.68006896972656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5897940993309021, + "rewards_train/margins": 4.235926568508148, + "rewards_train/rejected": -3.646132469177246, + "step": 1585 + }, + { + "epoch": 2.11, + "learning_rate": 2.482131450488748e-07, + "loss": 0.0533, + "step": 1586 + }, + { + "epoch": 2.11, + "logps_train/chosen": -59.50300598144531, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -113.67447662353516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15889409184455872, + "rewards_train/margins": 4.72417876124382, + "rewards_train/rejected": -4.883072853088379, + "step": 1586 + }, + { + "epoch": 2.11, + "logps_train/chosen": -57.84709548950195, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -75.0, + "logps_train/rejected": -119.77457427978516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.32622769474983215, + "rewards_train/margins": 4.802123099565506, + "rewards_train/rejected": -4.475895404815674, + "step": 1587 + }, + { + "epoch": 2.11, + "learning_rate": 2.4766335765228523e-07, + "loss": 0.0269, + "step": 1588 + }, + { + "epoch": 2.11, + "logps_train/chosen": -71.62646484375, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -53.25, + "logps_train/rejected": -91.20127868652344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.17670877277851105, + "rewards_train/margins": 3.625450536608696, + "rewards_train/rejected": -3.802159309387207, + "step": 1588 + }, + { + "epoch": 2.11, + "logps_train/chosen": -101.00092315673828, + "logps_train/ref_chosen": -93.5, + "logps_train/ref_rejected": -121.0, + "logps_train/rejected": -188.58261108398438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.728217601776123, + "rewards_train/margins": 6.017544269561768, + "rewards_train/rejected": -6.745761871337891, + "step": 1589 + }, + { + "epoch": 2.11, + "learning_rate": 2.4711358155707167e-07, + "loss": 0.0267, + "step": 1590 + }, + { + "epoch": 2.11, + "logps_train/chosen": -64.10111236572266, + "logps_train/ref_chosen": -67.5, + "logps_train/ref_rejected": -67.5, + "logps_train/rejected": -107.20722961425781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.35707587003707886, + "rewards_train/margins": 4.341861665248871, + "rewards_train/rejected": -3.984785795211792, + "step": 1590 + }, + { + "epoch": 2.11, + "logps_train/chosen": -39.69798278808594, + "logps_train/ref_chosen": -40.75, + "logps_train/ref_rejected": -36.0, + "logps_train/rejected": -82.97311401367188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.10988955199718475, + "rewards_train/margins": 4.799779459834099, + "rewards_train/rejected": -4.689889907836914, + "step": 1591 + }, + { + "epoch": 2.11, + "learning_rate": 2.465638194222745e-07, + "loss": 0.0221, + "step": 1592 + }, + { + "epoch": 2.11, + "logps_train/chosen": -56.59600830078125, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -63.0, + "logps_train/rejected": -110.76094818115234, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23221127688884735, + "rewards_train/margins": 5.017681285738945, + "rewards_train/rejected": -4.785470008850098, + "step": 1592 + }, + { + "epoch": 2.12, + "logps_train/chosen": -49.70170211791992, + "logps_train/ref_chosen": -57.25, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -107.25511932373047, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7634236812591553, + "rewards_train/margins": 4.651435375213623, + "rewards_train/rejected": -3.8880116939544678, + "step": 1593 + }, + { + "epoch": 2.12, + "learning_rate": 2.4601407390686653e-07, + "loss": 0.0192, + "step": 1594 + }, + { + "epoch": 2.12, + "logps_train/chosen": -76.11460876464844, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -60.0, + "logps_train/rejected": -103.22503662109375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.00885099172592163, + "rewards_train/margins": 4.325104653835297, + "rewards_train/rejected": -4.316253662109375, + "step": 1594 + }, + { + "epoch": 2.12, + "logps_train/chosen": -47.336090087890625, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -90.42121887207031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.21912571787834167, + "rewards_train/margins": 4.599919766187668, + "rewards_train/rejected": -4.380794048309326, + "step": 1595 + }, + { + "epoch": 2.12, + "learning_rate": 2.454643476697404e-07, + "loss": 0.0165, + "step": 1596 + }, + { + "epoch": 2.12, + "logps_train/chosen": -54.9783821105957, + "logps_train/ref_chosen": -61.0, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -90.58058166503906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5912240743637085, + "rewards_train/margins": 4.099281668663025, + "rewards_train/rejected": -3.5080575942993164, + "step": 1596 + }, + { + "epoch": 2.12, + "logps_train/chosen": -39.15232467651367, + "logps_train/ref_chosen": -45.75, + "logps_train/ref_rejected": -35.75, + "logps_train/rejected": -76.30804443359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6644551753997803, + "rewards_train/margins": 4.7108848094940186, + "rewards_train/rejected": -4.046429634094238, + "step": 1597 + }, + { + "epoch": 2.12, + "learning_rate": 2.4491464336969515e-07, + "loss": 0.0364, + "step": 1598 + }, + { + "epoch": 2.12, + "logps_train/chosen": -45.36869430541992, + "logps_train/ref_chosen": -43.25, + "logps_train/ref_rejected": -50.75, + "logps_train/rejected": -97.99908447265625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2056192010641098, + "rewards_train/margins": 4.518507614731789, + "rewards_train/rejected": -4.724126815795898, + "step": 1598 + }, + { + "epoch": 2.12, + "logps_train/chosen": -91.71726989746094, + "logps_train/ref_chosen": -95.5, + "logps_train/ref_rejected": -108.5, + "logps_train/rejected": -158.6529998779297, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3590348958969116, + "rewards_train/margins": 5.393865942955017, + "rewards_train/rejected": -5.0348310470581055, + "step": 1599 + }, + { + "epoch": 2.12, + "learning_rate": 2.44364963665424e-07, + "loss": 0.0442, + "step": 1600 + }, + { + "epoch": 2.12, + "logps_train/chosen": -64.96986389160156, + "logps_train/ref_chosen": -62.75, + "logps_train/ref_rejected": -73.0, + "logps_train/rejected": -128.32882690429688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.23292358219623566, + "rewards_train/margins": 5.2858976572752, + "rewards_train/rejected": -5.5188212394714355, + "step": 1600 + }, + { + "epoch": 2.13, + "logps_train/chosen": -46.02593994140625, + "logps_train/ref_chosen": -48.75, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -122.32823181152344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.26615601778030396, + "rewards_train/margins": 5.692728817462921, + "rewards_train/rejected": -5.426572799682617, + "step": 1601 + }, + { + "epoch": 2.13, + "learning_rate": 2.438153112155012e-07, + "loss": 0.0122, + "step": 1602 + }, + { + "epoch": 2.13, + "logps_train/chosen": -78.45476531982422, + "logps_train/ref_chosen": -90.5, + "logps_train/ref_rejected": -83.5, + "logps_train/rejected": -129.31253051757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.1928045749664307, + "rewards_train/margins": 5.77249550819397, + "rewards_train/rejected": -4.579690933227539, + "step": 1602 + }, + { + "epoch": 2.13, + "logps_train/chosen": -75.72451782226562, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -74.5, + "logps_train/rejected": -119.27732849121094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.04629838466644287, + "rewards_train/margins": 4.506062865257263, + "rewards_train/rejected": -4.45976448059082, + "step": 1603 + }, + { + "epoch": 2.13, + "learning_rate": 2.4326568867836906e-07, + "loss": 0.0353, + "step": 1604 + }, + { + "epoch": 2.13, + "logps_train/chosen": -56.995269775390625, + "logps_train/ref_chosen": -50.75, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -126.45884704589844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6315581202507019, + "rewards_train/margins": 4.197920620441437, + "rewards_train/rejected": -4.829478740692139, + "step": 1604 + }, + { + "epoch": 2.13, + "logps_train/chosen": -58.013954162597656, + "logps_train/ref_chosen": -62.25, + "logps_train/ref_rejected": -56.5, + "logps_train/rejected": -89.17253112792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4259481132030487, + "rewards_train/margins": 3.7057017982006073, + "rewards_train/rejected": -3.2797536849975586, + "step": 1605 + }, + { + "epoch": 2.13, + "learning_rate": 2.427160987123252e-07, + "loss": 0.082, + "step": 1606 + }, + { + "epoch": 2.13, + "logps_train/chosen": -63.029266357421875, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -49.5, + "logps_train/rejected": -95.93510437011719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.36347946524620056, + "rewards_train/margins": 5.014021724462509, + "rewards_train/rejected": -4.650542259216309, + "step": 1606 + }, + { + "epoch": 2.13, + "logps_train/chosen": -115.98505401611328, + "logps_train/ref_chosen": -111.5, + "logps_train/ref_rejected": -104.5, + "logps_train/rejected": -157.87326049804688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4703811705112457, + "rewards_train/margins": 4.87632092833519, + "rewards_train/rejected": -5.3467020988464355, + "step": 1607 + }, + { + "epoch": 2.14, + "learning_rate": 2.421665439755099e-07, + "loss": 0.0143, + "step": 1608 + }, + { + "epoch": 2.14, + "logps_train/chosen": -59.8670539855957, + "logps_train/ref_chosen": -60.25, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -125.93307495117188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.044544339179992676, + "rewards_train/margins": 4.8128520250320435, + "rewards_train/rejected": -4.768307685852051, + "step": 1608 + }, + { + "epoch": 2.14, + "logps_train/chosen": -44.045658111572266, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -57.0, + "logps_train/rejected": -95.89444732666016, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.743383526802063, + "rewards_train/margins": 4.635953068733215, + "rewards_train/rejected": -3.8925695419311523, + "step": 1609 + }, + { + "epoch": 2.14, + "learning_rate": 2.4161702712589284e-07, + "loss": 0.0215, + "step": 1610 + }, + { + "epoch": 2.14, + "logps_train/chosen": -78.1297836303711, + "logps_train/ref_chosen": -77.0, + "logps_train/ref_rejected": -101.5, + "logps_train/rejected": -159.01239013671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.11141559481620789, + "rewards_train/margins": 5.617948800325394, + "rewards_train/rejected": -5.729364395141602, + "step": 1610 + }, + { + "epoch": 2.14, + "logps_train/chosen": -97.22563934326172, + "logps_train/ref_chosen": -105.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -116.85176086425781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8454047441482544, + "rewards_train/margins": 5.379408955574036, + "rewards_train/rejected": -4.534004211425781, + "step": 1611 + }, + { + "epoch": 2.14, + "learning_rate": 2.410675508212606e-07, + "loss": 0.0293, + "step": 1612 + }, + { + "epoch": 2.14, + "logps_train/chosen": -78.36993408203125, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -65.0, + "logps_train/rejected": -114.35699462890625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3950382173061371, + "rewards_train/margins": 5.335425943136215, + "rewards_train/rejected": -4.940387725830078, + "step": 1612 + }, + { + "epoch": 2.14, + "logps_train/chosen": -31.0833797454834, + "logps_train/ref_chosen": -37.75, + "logps_train/ref_rejected": -36.75, + "logps_train/rejected": -70.13264465332031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6588495969772339, + "rewards_train/margins": 3.9947701692581177, + "rewards_train/rejected": -3.335920572280884, + "step": 1613 + }, + { + "epoch": 2.14, + "learning_rate": 2.405181177192035e-07, + "loss": 0.0228, + "step": 1614 + }, + { + "epoch": 2.14, + "logps_train/chosen": -41.733909606933594, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -49.25, + "logps_train/rejected": -90.10077667236328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.1820778101682663, + "rewards_train/margins": 4.259342595934868, + "rewards_train/rejected": -4.077264785766602, + "step": 1614 + }, + { + "epoch": 2.14, + "logps_train/chosen": -56.06945037841797, + "logps_train/ref_chosen": -52.25, + "logps_train/ref_rejected": -53.5, + "logps_train/rejected": -105.80155944824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.38145679235458374, + "rewards_train/margins": 4.846355974674225, + "rewards_train/rejected": -5.227812767028809, + "step": 1615 + }, + { + "epoch": 2.15, + "learning_rate": 2.399687304771031e-07, + "loss": 0.0767, + "step": 1616 + }, + { + "epoch": 2.15, + "logps_train/chosen": -50.83064270019531, + "logps_train/ref_chosen": -48.5, + "logps_train/ref_rejected": -57.25, + "logps_train/rejected": -106.90767669677734, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.22603324055671692, + "rewards_train/margins": 4.7287969291210175, + "rewards_train/rejected": -4.954830169677734, + "step": 1616 + }, + { + "epoch": 2.15, + "logps_train/chosen": -66.17247009277344, + "logps_train/ref_chosen": -71.0, + "logps_train/ref_rejected": -65.5, + "logps_train/rejected": -110.68878936767578, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4702523946762085, + "rewards_train/margins": 5.001631140708923, + "rewards_train/rejected": -4.531378746032715, + "step": 1617 + }, + { + "epoch": 2.15, + "learning_rate": 2.39419391752119e-07, + "loss": 0.0188, + "step": 1618 + }, + { + "epoch": 2.15, + "logps_train/chosen": -68.6185302734375, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -127.30271911621094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3564090132713318, + "rewards_train/margins": 4.831993043422699, + "rewards_train/rejected": -4.475584030151367, + "step": 1618 + }, + { + "epoch": 2.15, + "logps_train/chosen": -73.25057220458984, + "logps_train/ref_chosen": -73.0, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -102.1708755493164, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.02974472939968109, + "rewards_train/margins": 4.598280802369118, + "rewards_train/rejected": -4.628025531768799, + "step": 1619 + }, + { + "epoch": 2.15, + "learning_rate": 2.388701042011763e-07, + "loss": 0.0376, + "step": 1620 + }, + { + "epoch": 2.15, + "logps_train/chosen": -48.103763580322266, + "logps_train/ref_chosen": -49.0, + "logps_train/ref_rejected": -43.0, + "logps_train/rejected": -79.97691345214844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07712352275848389, + "rewards_train/margins": 3.763876795768738, + "rewards_train/rejected": -3.686753273010254, + "step": 1620 + }, + { + "epoch": 2.15, + "logps_train/chosen": -113.75464630126953, + "logps_train/ref_chosen": -118.0, + "logps_train/ref_rejected": -111.0, + "logps_train/rejected": -177.00018310546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.430785208940506, + "rewards_train/margins": 7.046427756547928, + "rewards_train/rejected": -6.615642547607422, + "step": 1621 + }, + { + "epoch": 2.15, + "learning_rate": 2.3832087048095239e-07, + "loss": 0.0413, + "step": 1622 + }, + { + "epoch": 2.15, + "logps_train/chosen": -55.05327606201172, + "logps_train/ref_chosen": -59.0, + "logps_train/ref_rejected": -64.5, + "logps_train/rejected": -106.33682250976562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.40199631452560425, + "rewards_train/margins": 4.5950533747673035, + "rewards_train/rejected": -4.193057060241699, + "step": 1622 + }, + { + "epoch": 2.16, + "logps_train/chosen": -81.50466918945312, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -156.93157958984375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.9364046454429626, + "rewards_train/margins": 5.744254648685455, + "rewards_train/rejected": -6.680659294128418, + "step": 1623 + }, + { + "epoch": 2.16, + "learning_rate": 2.3777169324786444e-07, + "loss": 0.0237, + "step": 1624 + }, + { + "epoch": 2.16, + "logps_train/chosen": -56.124969482421875, + "logps_train/ref_chosen": -51.5, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -116.07621765136719, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4695281982421875, + "rewards_train/margins": 5.025592803955078, + "rewards_train/rejected": -5.495121002197266, + "step": 1624 + }, + { + "epoch": 2.16, + "logps_train/chosen": -54.168025970458984, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -52.0, + "logps_train/rejected": -96.92337799072266, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.03319740295410156, + "rewards_train/margins": 4.513035297393799, + "rewards_train/rejected": -4.479837894439697, + "step": 1625 + }, + { + "epoch": 2.16, + "learning_rate": 2.3722257515805648e-07, + "loss": 0.0134, + "step": 1626 + }, + { + "epoch": 2.16, + "logps_train/chosen": -96.09199523925781, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -95.5, + "logps_train/rejected": -152.91561889648438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.07283129543066025, + "rewards_train/margins": 5.794081829488277, + "rewards_train/rejected": -5.721250534057617, + "step": 1626 + }, + { + "epoch": 2.16, + "logps_train/chosen": -67.76876068115234, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -71.5, + "logps_train/rejected": -124.48509216308594, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.246407151222229, + "rewards_train/margins": 5.070851683616638, + "rewards_train/rejected": -5.317258834838867, + "step": 1627 + }, + { + "epoch": 2.16, + "learning_rate": 2.3667351886738627e-07, + "loss": 0.0112, + "step": 1628 + }, + { + "epoch": 2.16, + "logps_train/chosen": -64.56925964355469, + "logps_train/ref_chosen": -66.5, + "logps_train/ref_rejected": -51.0, + "logps_train/rejected": -90.9698257446289, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2055739164352417, + "rewards_train/margins": 4.198650240898132, + "rewards_train/rejected": -3.9930763244628906, + "step": 1628 + }, + { + "epoch": 2.16, + "logps_train/chosen": -50.277252197265625, + "logps_train/ref_chosen": -54.25, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -122.89326477050781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.40039992332458496, + "rewards_train/margins": 6.066288709640503, + "rewards_train/rejected": -5.665888786315918, + "step": 1629 + }, + { + "epoch": 2.16, + "learning_rate": 2.3612452703141286e-07, + "loss": 0.0255, + "step": 1630 + }, + { + "epoch": 2.16, + "logps_train/chosen": -53.14012908935547, + "logps_train/ref_chosen": -60.75, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -101.79084777832031, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7531749606132507, + "rewards_train/margins": 5.366635024547577, + "rewards_train/rejected": -4.613460063934326, + "step": 1630 + }, + { + "epoch": 2.17, + "logps_train/chosen": -47.96973419189453, + "logps_train/ref_chosen": -40.5, + "logps_train/ref_rejected": -48.25, + "logps_train/rejected": -91.8565902709961, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7399420738220215, + "rewards_train/margins": 3.619154453277588, + "rewards_train/rejected": -4.359096527099609, + "step": 1631 + }, + { + "epoch": 2.17, + "learning_rate": 2.3557560230538347e-07, + "loss": 0.0236, + "step": 1632 + }, + { + "epoch": 2.17, + "logps_train/chosen": -63.61260986328125, + "logps_train/ref_chosen": -58.0, + "logps_train/ref_rejected": -62.25, + "logps_train/rejected": -110.71380615234375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5596988201141357, + "rewards_train/margins": 4.278869867324829, + "rewards_train/rejected": -4.838568687438965, + "step": 1632 + }, + { + "epoch": 2.17, + "logps_train/chosen": -66.30958557128906, + "logps_train/ref_chosen": -73.5, + "logps_train/ref_rejected": -68.5, + "logps_train/rejected": -113.25575256347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7081039547920227, + "rewards_train/margins": 5.205554783344269, + "rewards_train/rejected": -4.497450828552246, + "step": 1633 + }, + { + "epoch": 2.17, + "learning_rate": 2.3502674734422078e-07, + "loss": 0.0218, + "step": 1634 + }, + { + "epoch": 2.17, + "logps_train/chosen": -84.64749908447266, + "logps_train/ref_chosen": -79.0, + "logps_train/ref_rejected": -81.5, + "logps_train/rejected": -128.42918395996094, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.5647495985031128, + "rewards_train/margins": 4.123871922492981, + "rewards_train/rejected": -4.688621520996094, + "step": 1634 + }, + { + "epoch": 2.17, + "logps_train/chosen": -56.634986877441406, + "logps_train/ref_chosen": -58.5, + "logps_train/ref_rejected": -58.5, + "logps_train/rejected": -99.15913391113281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.19431385397911072, + "rewards_train/margins": 4.257102340459824, + "rewards_train/rejected": -4.062788486480713, + "step": 1635 + }, + { + "epoch": 2.17, + "learning_rate": 2.3447796480250998e-07, + "loss": 0.0368, + "step": 1636 + }, + { + "epoch": 2.17, + "logps_train/chosen": -66.58016967773438, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.5, + "logps_train/rejected": -110.55088806152344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.08614209294319153, + "rewards_train/margins": 4.722071617841721, + "rewards_train/rejected": -4.808213710784912, + "step": 1636 + }, + { + "epoch": 2.17, + "logps_train/chosen": -73.31005859375, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -75.5, + "logps_train/rejected": -137.06768798828125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.47631821036338806, + "rewards_train/margins": 5.66170135140419, + "rewards_train/rejected": -6.138019561767578, + "step": 1637 + }, + { + "epoch": 2.18, + "learning_rate": 2.3392925733448615e-07, + "loss": 0.0128, + "step": 1638 + }, + { + "epoch": 2.18, + "logps_train/chosen": -89.06816864013672, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -93.0, + "logps_train/rejected": -151.74034118652344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.049004822969436646, + "rewards_train/margins": 5.818779677152634, + "rewards_train/rejected": -5.86778450012207, + "step": 1638 + }, + { + "epoch": 2.18, + "logps_train/chosen": -77.03298950195312, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -105.98013305664062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5736537575721741, + "rewards_train/margins": 4.368150770664215, + "rewards_train/rejected": -3.794497013092041, + "step": 1639 + }, + { + "epoch": 2.18, + "learning_rate": 2.3338062759402104e-07, + "loss": 0.0227, + "step": 1640 + }, + { + "epoch": 2.18, + "logps_train/chosen": -79.04985046386719, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -91.5, + "logps_train/rejected": -149.3660888671875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.32060930132865906, + "rewards_train/margins": 5.456624776124954, + "rewards_train/rejected": -5.777234077453613, + "step": 1640 + }, + { + "epoch": 2.18, + "logps_train/chosen": -53.42491912841797, + "logps_train/ref_chosen": -55.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -101.69596862792969, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.18328961730003357, + "rewards_train/margins": 4.889605611562729, + "rewards_train/rejected": -4.706315994262695, + "step": 1641 + }, + { + "epoch": 2.18, + "learning_rate": 2.328320782346107e-07, + "loss": 0.0165, + "step": 1642 + }, + { + "epoch": 2.18, + "logps_train/chosen": -66.21868133544922, + "logps_train/ref_chosen": -72.0, + "logps_train/ref_rejected": -66.0, + "logps_train/rejected": -107.97332000732422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.5953192114830017, + "rewards_train/margins": 4.776244580745697, + "rewards_train/rejected": -4.180925369262695, + "step": 1642 + }, + { + "epoch": 2.18, + "logps_train/chosen": -93.39530181884766, + "logps_train/ref_chosen": -101.0, + "logps_train/ref_rejected": -79.5, + "logps_train/rejected": -137.36459350585938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7690638303756714, + "rewards_train/margins": 6.5738829374313354, + "rewards_train/rejected": -5.804819107055664, + "step": 1643 + }, + { + "epoch": 2.18, + "learning_rate": 2.3228361190936222e-07, + "loss": 0.0331, + "step": 1644 + }, + { + "epoch": 2.18, + "logps_train/chosen": -86.44429016113281, + "logps_train/ref_chosen": -89.0, + "logps_train/ref_rejected": -90.0, + "logps_train/rejected": -161.20623779296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2493205964565277, + "rewards_train/margins": 7.376194566488266, + "rewards_train/rejected": -7.126873970031738, + "step": 1644 + }, + { + "epoch": 2.18, + "logps_train/chosen": -57.833553314208984, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -109.65545654296875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15523049235343933, + "rewards_train/margins": 5.338439971208572, + "rewards_train/rejected": -5.493670463562012, + "step": 1645 + }, + { + "epoch": 2.19, + "learning_rate": 2.3173523127098124e-07, + "loss": 0.0098, + "step": 1646 + }, + { + "epoch": 2.19, + "logps_train/chosen": -97.1297836303711, + "logps_train/ref_chosen": -97.0, + "logps_train/ref_rejected": -82.5, + "logps_train/rejected": -145.30487060546875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.0053609758615493774, + "rewards_train/margins": 6.2641888707876205, + "rewards_train/rejected": -6.26954984664917, + "step": 1646 + }, + { + "epoch": 2.19, + "logps_train/chosen": -71.72000885009766, + "logps_train/ref_chosen": -83.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -108.19837951660156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.143624186515808, + "rewards_train/margins": 5.172838091850281, + "rewards_train/rejected": -4.029213905334473, + "step": 1647 + }, + { + "epoch": 2.19, + "learning_rate": 2.311869389717588e-07, + "loss": 0.0119, + "step": 1648 + }, + { + "epoch": 2.19, + "logps_train/chosen": -52.44001770019531, + "logps_train/ref_chosen": -59.75, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -111.6469497680664, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7356858253479004, + "rewards_train/margins": 5.510537624359131, + "rewards_train/rejected": -4.7748517990112305, + "step": 1648 + }, + { + "epoch": 2.19, + "logps_train/chosen": -43.7813720703125, + "logps_train/ref_chosen": -43.5, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -103.22969055175781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.027746468782424927, + "rewards_train/margins": 4.8170983493328094, + "rewards_train/rejected": -4.844844818115234, + "step": 1649 + }, + { + "epoch": 2.19, + "learning_rate": 2.306387376635588e-07, + "loss": 0.0245, + "step": 1650 + }, + { + "epoch": 2.19, + "logps_train/chosen": -54.53162384033203, + "logps_train/ref_chosen": -63.25, + "logps_train/ref_rejected": -62.75, + "logps_train/rejected": -111.63616943359375, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8798456788063049, + "rewards_train/margins": 5.77471262216568, + "rewards_train/rejected": -4.894866943359375, + "step": 1650 + }, + { + "epoch": 2.19, + "logps_train/chosen": -45.57465362548828, + "logps_train/ref_chosen": -50.25, + "logps_train/ref_rejected": -43.25, + "logps_train/rejected": -83.05009460449219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.45972204208374023, + "rewards_train/margins": 4.446763038635254, + "rewards_train/rejected": -3.9870409965515137, + "step": 1651 + }, + { + "epoch": 2.19, + "learning_rate": 2.3009062999780515e-07, + "loss": 0.0449, + "step": 1652 + }, + { + "epoch": 2.19, + "logps_train/chosen": -71.48902893066406, + "logps_train/ref_chosen": -76.0, + "logps_train/ref_rejected": -77.5, + "logps_train/rejected": -134.68780517578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4339093565940857, + "rewards_train/margins": 6.150344789028168, + "rewards_train/rejected": -5.716435432434082, + "step": 1652 + }, + { + "epoch": 2.2, + "logps_train/chosen": -79.80711364746094, + "logps_train/ref_chosen": -82.5, + "logps_train/ref_rejected": -82.0, + "logps_train/rejected": -140.88546752929688, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2911634147167206, + "rewards_train/margins": 6.189084976911545, + "rewards_train/rejected": -5.897921562194824, + "step": 1653 + }, + { + "epoch": 2.2, + "learning_rate": 2.2954261862546853e-07, + "loss": 0.0137, + "step": 1654 + }, + { + "epoch": 2.2, + "logps_train/chosen": -74.31769561767578, + "logps_train/ref_chosen": -78.0, + "logps_train/ref_rejected": -68.0, + "logps_train/rejected": -119.30254364013672, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.36119920015335083, + "rewards_train/margins": 5.507860004901886, + "rewards_train/rejected": -5.146660804748535, + "step": 1654 + }, + { + "epoch": 2.2, + "logps_train/chosen": -73.23081970214844, + "logps_train/ref_chosen": -88.5, + "logps_train/ref_rejected": -83.0, + "logps_train/rejected": -146.81724548339844, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.5425432920455933, + "rewards_train/margins": 7.918408274650574, + "rewards_train/rejected": -6.3758649826049805, + "step": 1655 + }, + { + "epoch": 2.2, + "learning_rate": 2.2899470619705429e-07, + "loss": 0.0193, + "step": 1656 + }, + { + "epoch": 2.2, + "logps_train/chosen": -51.79977035522461, + "logps_train/ref_chosen": -63.0, + "logps_train/ref_rejected": -40.5, + "logps_train/rejected": -74.28446197509766, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 1.123147964477539, + "rewards_train/margins": 4.505500316619873, + "rewards_train/rejected": -3.382352352142334, + "step": 1656 + }, + { + "epoch": 2.2, + "logps_train/chosen": -45.744300842285156, + "logps_train/ref_chosen": -45.5, + "logps_train/ref_rejected": -47.75, + "logps_train/rejected": -88.53366088867188, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.017398618161678314, + "rewards_train/margins": 4.051592089235783, + "rewards_train/rejected": -4.068990707397461, + "step": 1657 + }, + { + "epoch": 2.2, + "learning_rate": 2.2844689536258886e-07, + "loss": 0.0409, + "step": 1658 + }, + { + "epoch": 2.2, + "logps_train/chosen": -61.734397888183594, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -88.5, + "logps_train/rejected": -144.71124267578125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6796855330467224, + "rewards_train/margins": 6.288310945034027, + "rewards_train/rejected": -5.608625411987305, + "step": 1658 + }, + { + "epoch": 2.2, + "logps_train/chosen": -33.049800872802734, + "logps_train/ref_chosen": -34.5, + "logps_train/ref_rejected": -42.75, + "logps_train/rejected": -80.02346801757812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.15693405270576477, + "rewards_train/margins": 3.894535332918167, + "rewards_train/rejected": -3.7376012802124023, + "step": 1659 + }, + { + "epoch": 2.2, + "learning_rate": 2.2789918877160756e-07, + "loss": 0.0296, + "step": 1660 + }, + { + "epoch": 2.2, + "logps_train/chosen": -53.490760803222656, + "logps_train/ref_chosen": -51.25, + "logps_train/ref_rejected": -61.0, + "logps_train/rejected": -119.21926879882812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.21782618761062622, + "rewards_train/margins": 5.613475978374481, + "rewards_train/rejected": -5.831302165985107, + "step": 1660 + }, + { + "epoch": 2.21, + "logps_train/chosen": -59.0009880065918, + "logps_train/ref_chosen": -56.5, + "logps_train/ref_rejected": -59.75, + "logps_train/rejected": -115.6325912475586, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.25791144371032715, + "rewards_train/margins": 5.322535276412964, + "rewards_train/rejected": -5.580446720123291, + "step": 1661 + }, + { + "epoch": 2.21, + "learning_rate": 2.2735158907314143e-07, + "loss": 0.0132, + "step": 1662 + }, + { + "epoch": 2.21, + "logps_train/chosen": -40.65831756591797, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -51.75, + "logps_train/rejected": -96.56291198730469, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2068241834640503, + "rewards_train/margins": 4.699052691459656, + "rewards_train/rejected": -4.4922285079956055, + "step": 1662 + }, + { + "epoch": 2.21, + "logps_train/chosen": -79.8701171875, + "logps_train/ref_chosen": -76.5, + "logps_train/ref_rejected": -86.5, + "logps_train/rejected": -154.0372772216797, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3135742247104645, + "rewards_train/margins": 6.437028497457504, + "rewards_train/rejected": -6.750602722167969, + "step": 1663 + }, + { + "epoch": 2.21, + "learning_rate": 2.2680409891570448e-07, + "loss": 0.0119, + "step": 1664 + }, + { + "epoch": 2.21, + "logps_train/chosen": -41.70933532714844, + "logps_train/ref_chosen": -46.0, + "logps_train/ref_rejected": -53.75, + "logps_train/rejected": -101.9284439086914, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4243786931037903, + "rewards_train/margins": 5.246129810810089, + "rewards_train/rejected": -4.821751117706299, + "step": 1664 + }, + { + "epoch": 2.21, + "logps_train/chosen": -87.85283660888672, + "logps_train/ref_chosen": -93.0, + "logps_train/ref_rejected": -109.0, + "logps_train/rejected": -156.11868286132812, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.513593316078186, + "rewards_train/margins": 5.227804780006409, + "rewards_train/rejected": -4.714211463928223, + "step": 1665 + }, + { + "epoch": 2.21, + "learning_rate": 2.2625672094728097e-07, + "loss": 0.0401, + "step": 1666 + }, + { + "epoch": 2.21, + "logps_train/chosen": -59.63440704345703, + "logps_train/ref_chosen": -68.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -114.00321960449219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.8709344863891602, + "rewards_train/margins": 5.858757019042969, + "rewards_train/rejected": -4.987822532653809, + "step": 1666 + }, + { + "epoch": 2.21, + "logps_train/chosen": -50.95458984375, + "logps_train/ref_chosen": -49.5, + "logps_train/ref_rejected": -55.5, + "logps_train/rejected": -104.30546569824219, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.15014678239822388, + "rewards_train/margins": 4.7319623827934265, + "rewards_train/rejected": -4.88210916519165, + "step": 1667 + }, + { + "epoch": 2.22, + "learning_rate": 2.2570945781531259e-07, + "loss": 0.0139, + "step": 1668 + }, + { + "epoch": 2.22, + "logps_train/chosen": -40.91083526611328, + "logps_train/ref_chosen": -39.75, + "logps_train/ref_rejected": -54.5, + "logps_train/rejected": -98.2369384765625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.11608340591192245, + "rewards_train/margins": 4.256439425051212, + "rewards_train/rejected": -4.372522830963135, + "step": 1668 + }, + { + "epoch": 2.22, + "logps_train/chosen": -55.73370361328125, + "logps_train/ref_chosen": -54.5, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -102.60723876953125, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.12766724824905396, + "rewards_train/margins": 4.325244128704071, + "rewards_train/rejected": -4.452911376953125, + "step": 1669 + }, + { + "epoch": 2.22, + "learning_rate": 2.2516231216668542e-07, + "loss": 0.0407, + "step": 1670 + }, + { + "epoch": 2.22, + "logps_train/chosen": -61.32091522216797, + "logps_train/ref_chosen": -54.0, + "logps_train/ref_rejected": -55.75, + "logps_train/rejected": -111.96902465820312, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.7242790460586548, + "rewards_train/margins": 4.899185538291931, + "rewards_train/rejected": -5.623464584350586, + "step": 1670 + }, + { + "epoch": 2.22, + "logps_train/chosen": -65.0093994140625, + "logps_train/ref_chosen": -62.0, + "logps_train/ref_rejected": -54.75, + "logps_train/rejected": -87.73915100097656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.3032352328300476, + "rewards_train/margins": 2.9973891377449036, + "rewards_train/rejected": -3.300624370574951, + "step": 1671 + }, + { + "epoch": 2.22, + "learning_rate": 2.2461528664771763e-07, + "loss": 0.0675, + "step": 1672 + }, + { + "epoch": 2.22, + "logps_train/chosen": -55.681724548339844, + "logps_train/ref_chosen": -56.25, + "logps_train/ref_rejected": -55.0, + "logps_train/rejected": -106.15396118164062, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.06698378920555115, + "rewards_train/margins": 5.1714416444301605, + "rewards_train/rejected": -5.104457855224609, + "step": 1672 + }, + { + "epoch": 2.22, + "logps_train/chosen": -60.871726989746094, + "logps_train/ref_chosen": -64.5, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -125.38886260986328, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3456396460533142, + "rewards_train/margins": 5.705228745937347, + "rewards_train/rejected": -5.359589099884033, + "step": 1673 + }, + { + "epoch": 2.22, + "learning_rate": 2.2406838390414605e-07, + "loss": 0.0094, + "step": 1674 + }, + { + "epoch": 2.22, + "logps_train/chosen": -45.904052734375, + "logps_train/ref_chosen": -49.25, + "logps_train/ref_rejected": -47.0, + "logps_train/rejected": -90.9768295288086, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.33928221464157104, + "rewards_train/margins": 4.726809203624725, + "rewards_train/rejected": -4.387526988983154, + "step": 1674 + }, + { + "epoch": 2.22, + "logps_train/chosen": -71.08259582519531, + "logps_train/ref_chosen": -62.5, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -104.04598236083984, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.8645099997520447, + "rewards_train/margins": 3.940479338169098, + "rewards_train/rejected": -4.804989337921143, + "step": 1675 + }, + { + "epoch": 2.23, + "learning_rate": 2.23521606581114e-07, + "loss": 0.0349, + "step": 1676 + }, + { + "epoch": 2.23, + "logps_train/chosen": -75.70677185058594, + "logps_train/ref_chosen": -80.0, + "logps_train/ref_rejected": -103.0, + "logps_train/rejected": -157.05160522460938, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.4386974573135376, + "rewards_train/margins": 5.848544716835022, + "rewards_train/rejected": -5.409847259521484, + "step": 1676 + }, + { + "epoch": 2.23, + "logps_train/chosen": -68.19268035888672, + "logps_train/ref_chosen": -65.5, + "logps_train/ref_rejected": -62.0, + "logps_train/rejected": -112.60706329345703, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2786432206630707, + "rewards_train/margins": 4.794368177652359, + "rewards_train/rejected": -5.07301139831543, + "step": 1677 + }, + { + "epoch": 2.23, + "learning_rate": 2.229749573231579e-07, + "loss": 0.0238, + "step": 1678 + }, + { + "epoch": 2.23, + "logps_train/chosen": -40.02933120727539, + "logps_train/ref_chosen": -43.0, + "logps_train/ref_rejected": -58.0, + "logps_train/rejected": -108.81055450439453, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.2919215261936188, + "rewards_train/margins": 5.369461923837662, + "rewards_train/rejected": -5.077540397644043, + "step": 1678 + }, + { + "epoch": 2.23, + "logps_train/chosen": -50.072044372558594, + "logps_train/ref_chosen": -52.5, + "logps_train/ref_rejected": -53.0, + "logps_train/rejected": -106.66670227050781, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.23576444387435913, + "rewards_train/margins": 5.614153802394867, + "rewards_train/rejected": -5.378389358520508, + "step": 1679 + }, + { + "epoch": 2.23, + "learning_rate": 2.2242843877419493e-07, + "loss": 0.0151, + "step": 1680 + }, + { + "epoch": 2.23, + "logps_train/chosen": -50.86200714111328, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -56.0, + "logps_train/rejected": -96.57337951660156, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.3942679762840271, + "rewards_train/margins": 4.443792760372162, + "rewards_train/rejected": -4.049524784088135, + "step": 1680 + }, + { + "epoch": 2.23, + "logps_train/chosen": -68.32810974121094, + "logps_train/ref_chosen": -72.5, + "logps_train/ref_rejected": -64.0, + "logps_train/rejected": -109.55654907226562, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.40781405568122864, + "rewards_train/margins": 4.947844535112381, + "rewards_train/rejected": -4.540030479431152, + "step": 1681 + }, + { + "epoch": 2.23, + "learning_rate": 2.2188205357751017e-07, + "loss": 0.0222, + "step": 1682 + }, + { + "epoch": 2.23, + "logps_train/chosen": -59.291446685791016, + "logps_train/ref_chosen": -54.75, + "logps_train/ref_rejected": -60.5, + "logps_train/rejected": -113.6025390625, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.4572695195674896, + "rewards_train/margins": 4.846734195947647, + "rewards_train/rejected": -5.304003715515137, + "step": 1682 + }, + { + "epoch": 2.24, + "logps_train/chosen": -41.99505615234375, + "logps_train/ref_chosen": -39.0, + "logps_train/ref_rejected": -46.0, + "logps_train/rejected": -95.7668685913086, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.2979428470134735, + "rewards_train/margins": 4.681087285280228, + "rewards_train/rejected": -4.979030132293701, + "step": 1683 + }, + { + "epoch": 2.24, + "learning_rate": 2.2133580437574352e-07, + "loss": 0.0204, + "step": 1684 + }, + { + "epoch": 2.24, + "logps_train/chosen": -74.65121459960938, + "logps_train/ref_chosen": -82.0, + "logps_train/ref_rejected": -86.0, + "logps_train/rejected": -137.10507202148438, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7301912903785706, + "rewards_train/margins": 5.850073873996735, + "rewards_train/rejected": -5.119882583618164, + "step": 1684 + }, + { + "epoch": 2.24, + "logps_train/chosen": -38.73481369018555, + "logps_train/ref_chosen": -40.0, + "logps_train/ref_rejected": -46.75, + "logps_train/rejected": -85.76426696777344, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.13159671425819397, + "rewards_train/margins": 4.025992304086685, + "rewards_train/rejected": -3.894395589828491, + "step": 1685 + }, + { + "epoch": 2.24, + "learning_rate": 2.207896938108773e-07, + "loss": 0.0397, + "step": 1686 + }, + { + "epoch": 2.24, + "logps_train/chosen": -47.91719055175781, + "logps_train/ref_chosen": -55.5, + "logps_train/ref_rejected": -42.0, + "logps_train/rejected": -79.89845275878906, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.7535935044288635, + "rewards_train/margins": 4.555939495563507, + "rewards_train/rejected": -3.8023459911346436, + "step": 1686 + }, + { + "epoch": 2.24, + "logps_train/chosen": -44.0987434387207, + "logps_train/ref_chosen": -42.75, + "logps_train/ref_rejected": -46.5, + "logps_train/rejected": -87.04654693603516, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.13741333782672882, + "rewards_train/margins": 3.910014793276787, + "rewards_train/rejected": -4.047428131103516, + "step": 1687 + }, + { + "epoch": 2.24, + "learning_rate": 2.2024372452422316e-07, + "loss": 0.0416, + "step": 1688 + }, + { + "epoch": 2.24, + "logps_train/chosen": -75.06198120117188, + "logps_train/ref_chosen": -81.0, + "logps_train/ref_rejected": -116.0, + "logps_train/rejected": -175.1716766357422, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6117703914642334, + "rewards_train/margins": 6.510187864303589, + "rewards_train/rejected": -5.8984174728393555, + "step": 1688 + }, + { + "epoch": 2.24, + "logps_train/chosen": -68.56094360351562, + "logps_train/ref_chosen": -61.75, + "logps_train/ref_rejected": -42.25, + "logps_train/rejected": -94.68940734863281, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.6732823848724365, + "rewards_train/margins": 4.561283349990845, + "rewards_train/rejected": -5.234565734863281, + "step": 1689 + }, + { + "epoch": 2.24, + "learning_rate": 2.1969789915640963e-07, + "loss": 0.0179, + "step": 1690 + }, + { + "epoch": 2.24, + "logps_train/chosen": -63.77099609375, + "logps_train/ref_chosen": -70.0, + "logps_train/ref_rejected": -78.0, + "logps_train/rejected": -132.34539794921875, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": 0.6213377118110657, + "rewards_train/margins": 6.080877602100372, + "rewards_train/rejected": -5.459539890289307, + "step": 1690 + }, + { + "epoch": 2.25, + "logps_train/chosen": -66.35791015625, + "logps_train/ref_chosen": -65.0, + "logps_train/ref_rejected": -72.0, + "logps_train/rejected": -123.77528381347656, + "rewards_train/accuracies": 1.0, + "rewards_train/chosen": -0.1279783844947815, + "rewards_train/margins": 5.065175950527191, + "rewards_train/rejected": -5.193154335021973, + "step": 1691 + }, + { + "epoch": 2.25, + "learning_rate": 2.1915222034736893e-07, + "loss": 0.0154, + "step": 1692 + } + ], + "logging_steps": 2, + "max_steps": 3008, + "num_train_epochs": 4, + "save_steps": 188, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}