{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "abs_diff": 0.043448589742183685, "all_logps_1": -124.6441650390625, "all_logps_1_values": -124.64417266845703, "all_logps_2": 459.15625, "all_logps_2_values": 459.15625, "epoch": 0.0021373230029388193, "grad_norm": 16.66867807446414, "learning_rate": 2.127659574468085e-08, "logits/chosen": -1.1381689310073853, "logits/rejected": -0.9913416504859924, "logps/chosen": -0.2839311361312866, "logps/rejected": -0.29555341601371765, "loss": 1.5077, "original_losses": 1.5989841222763062, "rewards/accuracies": 0.625, "rewards/chosen": -0.7098277807235718, "rewards/margins": 0.029055725783109665, "rewards/rejected": -0.7388835549354553, "step": 1, "weight": 0.9598712921142578 }, { "abs_diff": 0.050563473254442215, "all_logps_1": -113.89578247070312, "all_logps_1_values": -113.89578247070312, "all_logps_2": 426.234375, "all_logps_2_values": 426.234375, "epoch": 0.010686615014694095, "grad_norm": 12.434660441186981, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.9904537796974182, "logits/rejected": -0.9189692735671997, "logps/chosen": -0.2694719731807709, "logps/rejected": -0.2684631943702698, "loss": 1.5251, "original_losses": 1.6255850791931152, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6736798286437988, "rewards/margins": -0.0025218012742698193, "rewards/rejected": -0.6711580753326416, "step": 5, "weight": 0.9548923373222351 }, { "abs_diff": 0.06418919563293457, "all_logps_1": -118.16609191894531, "all_logps_1_values": -118.16609191894531, "all_logps_2": 443.21875, "all_logps_2_values": 443.21875, "epoch": 0.02137323002938819, "grad_norm": 11.724962863400911, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.9794756174087524, "logits/rejected": -0.9353710412979126, "logps/chosen": -0.2719997763633728, "logps/rejected": -0.2735568881034851, "loss": 1.5172, "original_losses": 1.620931625366211, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.6799993515014648, "rewards/margins": 0.0038928240537643433, "rewards/rejected": -0.6838923096656799, "step": 10, "weight": 0.9420804977416992 }, { "abs_diff": 0.06552017480134964, "all_logps_1": -101.9596939086914, "all_logps_1_values": -101.95967864990234, "all_logps_2": 370.20001220703125, "all_logps_2_values": 370.20001220703125, "epoch": 0.03205984504408229, "grad_norm": 9.773542967175878, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9607246518135071, "logits/rejected": -0.9163097143173218, "logps/chosen": -0.29539960622787476, "logps/rejected": -0.2832711338996887, "loss": 1.5128, "original_losses": 1.6492595672607422, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7384990453720093, "rewards/margins": -0.030321191996335983, "rewards/rejected": -0.708177924156189, "step": 15, "weight": 0.9420396089553833 }, { "abs_diff": 0.082237109541893, "all_logps_1": -95.52127075195312, "all_logps_1_values": -95.52125549316406, "all_logps_2": 368.6625061035156, "all_logps_2_values": 368.6625061035156, "epoch": 0.04274646005877638, "grad_norm": 14.386337719633973, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9820459485054016, "logits/rejected": -0.9820452928543091, "logps/chosen": -0.26204216480255127, "logps/rejected": -0.26956799626350403, "loss": 1.5149, "original_losses": 1.6124236583709717, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6551053524017334, "rewards/margins": 0.018814602866768837, "rewards/rejected": -0.6739200353622437, "step": 20, "weight": 0.9291993379592896 }, { "abs_diff": 0.07468467205762863, "all_logps_1": -101.43566131591797, "all_logps_1_values": -101.43565368652344, "all_logps_2": 359.6499938964844, "all_logps_2_values": 359.6499938964844, "epoch": 0.053433075073470476, "grad_norm": 12.506683302853757, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0295155048370361, "logits/rejected": -1.0065571069717407, "logps/chosen": -0.28278106451034546, "logps/rejected": -0.2869016230106354, "loss": 1.5005, "original_losses": 1.6180095672607422, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.706952691078186, "rewards/margins": 0.010301386937499046, "rewards/rejected": -0.7172540426254272, "step": 25, "weight": 0.9346221089363098 }, { "abs_diff": 0.07145524024963379, "all_logps_1": -96.14094543457031, "all_logps_1_values": -96.14093780517578, "all_logps_2": 358.6937561035156, "all_logps_2_values": 358.6937561035156, "epoch": 0.06411969008816458, "grad_norm": 17.486598946846197, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0747442245483398, "logits/rejected": -0.9867307543754578, "logps/chosen": -0.27444857358932495, "logps/rejected": -0.27685946226119995, "loss": 1.5207, "original_losses": 1.6215848922729492, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6861215233802795, "rewards/margins": 0.006027159281075001, "rewards/rejected": -0.6921486258506775, "step": 30, "weight": 0.9376131296157837 }, { "abs_diff": 0.08128118515014648, "all_logps_1": -110.31912994384766, "all_logps_1_values": -110.3191146850586, "all_logps_2": 396.7250061035156, "all_logps_2_values": 396.7250061035156, "epoch": 0.07480630510285867, "grad_norm": 10.190092324128308, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0031483173370361, "logits/rejected": -0.9225772023200989, "logps/chosen": -0.2776695191860199, "logps/rejected": -0.3029964566230774, "loss": 1.5058, "original_losses": 1.5780258178710938, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6941738128662109, "rewards/margins": 0.06331733614206314, "rewards/rejected": -0.7574911713600159, "step": 35, "weight": 0.9304083585739136 }, { "abs_diff": 0.06388907134532928, "all_logps_1": -94.03665924072266, "all_logps_1_values": -94.03666687011719, "all_logps_2": 347.20001220703125, "all_logps_2_values": 347.20001220703125, "epoch": 0.08549292011755276, "grad_norm": 12.383837039803712, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9180997014045715, "logits/rejected": -0.9071486592292786, "logps/chosen": -0.28308817744255066, "logps/rejected": -0.29446059465408325, "loss": 1.5141, "original_losses": 1.6014320850372314, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7077205181121826, "rewards/margins": 0.028431018814444542, "rewards/rejected": -0.7361515760421753, "step": 40, "weight": 0.9425530433654785 }, { "abs_diff": 0.09521429240703583, "all_logps_1": -106.0528793334961, "all_logps_1_values": -106.0528793334961, "all_logps_2": 362.95623779296875, "all_logps_2_values": 362.95623779296875, "epoch": 0.09617953513224686, "grad_norm": 9.970613374779385, "learning_rate": 9.574468085106384e-07, "logits/chosen": -0.9140686988830566, "logits/rejected": -0.8324721455574036, "logps/chosen": -0.33634239435195923, "logps/rejected": -0.34527257084846497, "loss": 1.4915, "original_losses": 1.614324927330017, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8408559560775757, "rewards/margins": 0.02232544682919979, "rewards/rejected": -0.8631814122200012, "step": 45, "weight": 0.9211470484733582 }, { "abs_diff": 0.12202360481023788, "all_logps_1": -105.84830474853516, "all_logps_1_values": -105.84830474853516, "all_logps_2": 377.7437438964844, "all_logps_2_values": 377.7437438964844, "epoch": 0.10686615014694095, "grad_norm": 10.765426712830973, "learning_rate": 9.998741174712533e-07, "logits/chosen": -0.8902776837348938, "logits/rejected": -0.8994420766830444, "logps/chosen": -0.31167787313461304, "logps/rejected": -0.3589983582496643, "loss": 1.466, "original_losses": 1.5521076917648315, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7791945934295654, "rewards/margins": 0.11830125004053116, "rewards/rejected": -0.8974958658218384, "step": 50, "weight": 0.9070577621459961 }, { "abs_diff": 0.11367271095514297, "all_logps_1": -112.1168441772461, "all_logps_1_values": -112.1168441772461, "all_logps_2": 420.46875, "all_logps_2_values": 420.46875, "epoch": 0.11755276516163506, "grad_norm": 10.584693183679102, "learning_rate": 9.991050648838675e-07, "logits/chosen": -0.8847481608390808, "logits/rejected": -0.8255330920219421, "logps/chosen": -0.28891468048095703, "logps/rejected": -0.3513794541358948, "loss": 1.465, "original_losses": 1.557521939277649, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7222867012023926, "rewards/margins": 0.15616199374198914, "rewards/rejected": -0.8784486651420593, "step": 55, "weight": 0.9259511828422546 }, { "abs_diff": 0.08213352411985397, "all_logps_1": -120.3653564453125, "all_logps_1_values": -120.36537170410156, "all_logps_2": 451.7250061035156, "all_logps_2_values": 451.7250061035156, "epoch": 0.12823938017632916, "grad_norm": 20.487281254270606, "learning_rate": 9.97637968732563e-07, "logits/chosen": -0.9171462059020996, "logits/rejected": -0.8949100375175476, "logps/chosen": -0.2980085015296936, "logps/rejected": -0.32817280292510986, "loss": 1.4606, "original_losses": 1.5710750818252563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7450211644172668, "rewards/margins": 0.07541082799434662, "rewards/rejected": -0.8204320073127747, "step": 60, "weight": 0.9325092434883118 }, { "abs_diff": 0.08584319800138474, "all_logps_1": -115.28419494628906, "all_logps_1_values": -115.28419494628906, "all_logps_2": 410.28125, "all_logps_2_values": 410.28125, "epoch": 0.13892599519102325, "grad_norm": 13.268818877197086, "learning_rate": 9.954748808839674e-07, "logits/chosen": -0.9003847241401672, "logits/rejected": -0.9516555666923523, "logps/chosen": -0.31763237714767456, "logps/rejected": -0.3270418345928192, "loss": 1.4586, "original_losses": 1.614269495010376, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.794080913066864, "rewards/margins": 0.023523610085248947, "rewards/rejected": -0.817604660987854, "step": 65, "weight": 0.9301543235778809 }, { "abs_diff": 0.23710966110229492, "all_logps_1": -129.6254119873047, "all_logps_1_values": -129.6254425048828, "all_logps_2": 391.6187438964844, "all_logps_2_values": 391.6187438964844, "epoch": 0.14961261020571734, "grad_norm": 19.008527618804656, "learning_rate": 9.926188266120295e-07, "logits/chosen": -0.9297588467597961, "logits/rejected": -0.8964225053787231, "logps/chosen": -0.4621095657348633, "logps/rejected": -0.565943717956543, "loss": 1.4309, "original_losses": 1.5991542339324951, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.1552739143371582, "rewards/margins": 0.25958532094955444, "rewards/rejected": -1.414859414100647, "step": 70, "weight": 0.8780097961425781 }, { "abs_diff": 0.22396209836006165, "all_logps_1": -126.3341064453125, "all_logps_1_values": -126.33412170410156, "all_logps_2": 375.15625, "all_logps_2_values": 375.15625, "epoch": 0.16029922522041143, "grad_norm": 14.741661325228266, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.88294517993927, "logits/rejected": -0.8696261644363403, "logps/chosen": -0.6373583078384399, "logps/rejected": -0.7649468779563904, "loss": 1.371, "original_losses": 1.5059027671813965, "rewards/accuracies": 0.5625, "rewards/chosen": -1.593395709991455, "rewards/margins": 0.3189714848995209, "rewards/rejected": -1.9123672246932983, "step": 75, "weight": 0.874294102191925 }, { "abs_diff": 0.4753897786140442, "all_logps_1": -154.002197265625, "all_logps_1_values": -154.002197265625, "all_logps_2": 385.40625, "all_logps_2_values": 385.40625, "epoch": 0.17098584023510552, "grad_norm": 10.653088582817368, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9209216833114624, "logits/rejected": -0.905800461769104, "logps/chosen": -0.9318068623542786, "logps/rejected": -1.1782509088516235, "loss": 1.3728, "original_losses": 1.6557430028915405, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.329517126083374, "rewards/margins": 0.61611008644104, "rewards/rejected": -2.945627212524414, "step": 80, "weight": 0.8384539484977722 }, { "abs_diff": 0.4482264518737793, "all_logps_1": -181.6018829345703, "all_logps_1_values": -181.6018829345703, "all_logps_2": 381.91876220703125, "all_logps_2_values": 381.91876220703125, "epoch": 0.18167245524979964, "grad_norm": 8.388730168314039, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.8116687536239624, "logits/rejected": -0.7630541324615479, "logps/chosen": -1.007387638092041, "logps/rejected": -1.0764662027359009, "loss": 1.3965, "original_losses": 1.8705193996429443, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.5184690952301025, "rewards/margins": 0.17269621789455414, "rewards/rejected": -2.6911654472351074, "step": 85, "weight": 0.8248960375785828 }, { "abs_diff": 0.638414204120636, "all_logps_1": -197.71530151367188, "all_logps_1_values": -197.7152862548828, "all_logps_2": 368.6000061035156, "all_logps_2_values": 368.6000061035156, "epoch": 0.19235907026449373, "grad_norm": 12.62143276771947, "learning_rate": 9.743592451943998e-07, "logits/chosen": -0.7098425626754761, "logits/rejected": -0.6454850435256958, "logps/chosen": -1.299263596534729, "logps/rejected": -1.3454030752182007, "loss": 1.3792, "original_losses": 2.042982578277588, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.2481586933135986, "rewards/margins": 0.11534923315048218, "rewards/rejected": -3.3635077476501465, "step": 90, "weight": 0.788603663444519 }, { "abs_diff": 0.3771124482154846, "all_logps_1": -198.22885131835938, "all_logps_1_values": -198.22885131835938, "all_logps_2": 307.64373779296875, "all_logps_2_values": 307.64373779296875, "epoch": 0.20304568527918782, "grad_norm": 9.223783777700444, "learning_rate": 9.681174353198686e-07, "logits/chosen": -0.7450689077377319, "logits/rejected": -0.7714122533798218, "logps/chosen": -1.5162893533706665, "logps/rejected": -1.538206696510315, "loss": 1.3537, "original_losses": 1.7573131322860718, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.7907233238220215, "rewards/margins": 0.054793525487184525, "rewards/rejected": -3.8455166816711426, "step": 95, "weight": 0.7987316846847534 }, { "abs_diff": 0.531648576259613, "all_logps_1": -257.82080078125, "all_logps_1_values": -257.82080078125, "all_logps_2": 405.08123779296875, "all_logps_2_values": 405.08123779296875, "epoch": 0.2137323002938819, "grad_norm": 13.130824511623645, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.7543559074401855, "logits/rejected": -0.6947053074836731, "logps/chosen": -1.3733211755752563, "logps/rejected": -1.4744349718093872, "loss": 1.3472, "original_losses": 1.8884124755859375, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.433303117752075, "rewards/margins": 0.2527844309806824, "rewards/rejected": -3.6860873699188232, "step": 100, "weight": 0.8195359110832214 }, { "abs_diff": 0.4814772605895996, "all_logps_1": -285.88824462890625, "all_logps_1_values": -285.88824462890625, "all_logps_2": 447.76251220703125, "all_logps_2_values": 447.76251220703125, "epoch": 0.224418915308576, "grad_norm": 15.741233324493118, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.5685318112373352, "logits/rejected": -0.5175650119781494, "logps/chosen": -1.1041462421417236, "logps/rejected": -1.3609198331832886, "loss": 1.347, "original_losses": 1.60434091091156, "rewards/accuracies": 0.5625, "rewards/chosen": -2.7603654861450195, "rewards/margins": 0.6419342756271362, "rewards/rejected": -3.4022998809814453, "step": 105, "weight": 0.8199658393859863 }, { "abs_diff": 0.5063992738723755, "all_logps_1": -312.87860107421875, "all_logps_1_values": -312.8785705566406, "all_logps_2": 410.79998779296875, "all_logps_2_values": 410.79998779296875, "epoch": 0.2351055303232701, "grad_norm": 14.779833008390499, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.3194349706172943, "logits/rejected": -0.27131232619285583, "logps/chosen": -1.436680793762207, "logps/rejected": -1.3837544918060303, "loss": 1.3485, "original_losses": 2.0654890537261963, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.5917022228240967, "rewards/margins": -0.13231578469276428, "rewards/rejected": -3.459386110305786, "step": 110, "weight": 0.8112524151802063 }, { "abs_diff": 0.79926997423172, "all_logps_1": -352.8046875, "all_logps_1_values": -352.8046875, "all_logps_2": 401.26873779296875, "all_logps_2_values": 401.26873779296875, "epoch": 0.2457921453379642, "grad_norm": 17.098670325278757, "learning_rate": 9.367041003085648e-07, "logits/chosen": -0.27068907022476196, "logits/rejected": -0.25977402925491333, "logps/chosen": -1.8351905345916748, "logps/rejected": -2.079685688018799, "loss": 1.2568, "original_losses": 1.9370386600494385, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.587975978851318, "rewards/margins": 0.6112388968467712, "rewards/rejected": -5.199214458465576, "step": 115, "weight": 0.7355886101722717 }, { "abs_diff": 0.4315846860408783, "all_logps_1": -371.93505859375, "all_logps_1_values": -371.93505859375, "all_logps_2": 397.9624938964844, "all_logps_2_values": 397.9624938964844, "epoch": 0.2564787603526583, "grad_norm": 17.135021647585766, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.18766793608665466, "logits/rejected": -0.1377825289964676, "logps/chosen": -1.6060386896133423, "logps/rejected": -1.7283703088760376, "loss": 1.2524, "original_losses": 1.669327974319458, "rewards/accuracies": 0.4375, "rewards/chosen": -4.015096187591553, "rewards/margins": 0.30582934617996216, "rewards/rejected": -4.320925712585449, "step": 120, "weight": 0.7726086378097534 }, { "abs_diff": 0.8556106686592102, "all_logps_1": -424.2312927246094, "all_logps_1_values": -424.2313537597656, "all_logps_2": 358.1312561035156, "all_logps_2_values": 358.1312561035156, "epoch": 0.2671653753673524, "grad_norm": 18.949047249790798, "learning_rate": 9.172866268606513e-07, "logits/chosen": -0.0937797874212265, "logits/rejected": -0.08780622482299805, "logps/chosen": -2.3565449714660645, "logps/rejected": -2.821481227874756, "loss": 1.2455, "original_losses": 1.5799314975738525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.891362190246582, "rewards/margins": 1.1623404026031494, "rewards/rejected": -7.053703308105469, "step": 125, "weight": 0.6997275352478027 }, { "abs_diff": 1.122897982597351, "all_logps_1": -483.11285400390625, "all_logps_1_values": -483.11279296875, "all_logps_2": 356.2250061035156, "all_logps_2_values": 356.2250061035156, "epoch": 0.2778519903820465, "grad_norm": 16.067627857167523, "learning_rate": 9.066954722907638e-07, "logits/chosen": 0.18425658345222473, "logits/rejected": 0.12208795547485352, "logps/chosen": -2.2584593296051025, "logps/rejected": -2.747421979904175, "loss": 1.2378, "original_losses": 1.9530925750732422, "rewards/accuracies": 0.5, "rewards/chosen": -5.646147727966309, "rewards/margins": 1.2224081754684448, "rewards/rejected": -6.868556022644043, "step": 130, "weight": 0.6967185139656067 }, { "abs_diff": 0.5274697542190552, "all_logps_1": -584.13671875, "all_logps_1_values": -584.13671875, "all_logps_2": 443.01873779296875, "all_logps_2_values": 443.01873779296875, "epoch": 0.2885386053967406, "grad_norm": 29.366033343143968, "learning_rate": 8.955355173281707e-07, "logits/chosen": 0.3088318705558777, "logits/rejected": 0.3932690918445587, "logps/chosen": -2.3267366886138916, "logps/rejected": -2.385960102081299, "loss": 1.1916, "original_losses": 1.8465898036956787, "rewards/accuracies": 0.5, "rewards/chosen": -5.816841125488281, "rewards/margins": 0.1480589658021927, "rewards/rejected": -5.964900016784668, "step": 135, "weight": 0.7594529390335083 }, { "abs_diff": 0.9901386499404907, "all_logps_1": -715.9130859375, "all_logps_1_values": -715.9131469726562, "all_logps_2": 402.9312438964844, "all_logps_2_values": 402.9312438964844, "epoch": 0.2992252204114347, "grad_norm": 27.69156284264097, "learning_rate": 8.838223701790055e-07, "logits/chosen": 0.5694825649261475, "logits/rejected": 0.5738533139228821, "logps/chosen": -3.3967947959899902, "logps/rejected": -3.4784629344940186, "loss": 1.1521, "original_losses": 2.2902231216430664, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.491987228393555, "rewards/margins": 0.20417042076587677, "rewards/rejected": -8.696157455444336, "step": 140, "weight": 0.6874681115150452 }, { "abs_diff": 0.9199058413505554, "all_logps_1": -995.3132934570312, "all_logps_1_values": -995.3132934570312, "all_logps_2": 409.5249938964844, "all_logps_2_values": 409.5249938964844, "epoch": 0.30991183542612877, "grad_norm": 28.11539806786062, "learning_rate": 8.71572412738697e-07, "logits/chosen": 0.8747909665107727, "logits/rejected": 0.9098325967788696, "logps/chosen": -3.898921251296997, "logps/rejected": -3.9907355308532715, "loss": 1.1592, "original_losses": 2.074253797531128, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -9.74730110168457, "rewards/margins": 0.22953681647777557, "rewards/rejected": -9.976838111877441, "step": 145, "weight": 0.6336122751235962 }, { "abs_diff": 1.7418813705444336, "all_logps_1": -1663.3861083984375, "all_logps_1_values": -1663.3861083984375, "all_logps_2": 383.75, "all_logps_2_values": 383.75, "epoch": 0.32059845044082286, "grad_norm": 43.30888911111554, "learning_rate": 8.588027776804058e-07, "logits/chosen": 1.2933635711669922, "logits/rejected": 1.2684452533721924, "logps/chosen": -6.538305759429932, "logps/rejected": -7.486212253570557, "loss": 1.0994, "original_losses": 1.926180124282837, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -16.34576416015625, "rewards/margins": 2.3697667121887207, "rewards/rejected": -18.715530395507812, "step": 150, "weight": 0.5583394765853882 }, { "abs_diff": 1.5373389720916748, "all_logps_1": -2462.133056640625, "all_logps_1_values": -2462.13330078125, "all_logps_2": 434.73748779296875, "all_logps_2_values": 434.73748779296875, "epoch": 0.33128506545551695, "grad_norm": 47.421884036345716, "learning_rate": 8.455313244934324e-07, "logits/chosen": 1.8083369731903076, "logits/rejected": 1.890794038772583, "logps/chosen": -8.33267879486084, "logps/rejected": -9.018165588378906, "loss": 1.0741, "original_losses": 2.0032851696014404, "rewards/accuracies": 0.5625, "rewards/chosen": -20.83169937133789, "rewards/margins": 1.713716745376587, "rewards/rejected": -22.5454158782959, "step": 155, "weight": 0.5593416094779968 }, { "abs_diff": 1.8985588550567627, "all_logps_1": -2538.660400390625, "all_logps_1_values": -2538.66064453125, "all_logps_2": 403.66876220703125, "all_logps_2_values": 403.66876220703125, "epoch": 0.34197168047021104, "grad_norm": 58.88642904599502, "learning_rate": 8.317766145051057e-07, "logits/chosen": 2.1515212059020996, "logits/rejected": 2.141986846923828, "logps/chosen": -8.633856773376465, "logps/rejected": -9.374483108520508, "loss": 1.0769, "original_losses": 2.3099827766418457, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -21.584644317626953, "rewards/margins": 1.8515657186508179, "rewards/rejected": -23.436208724975586, "step": 160, "weight": 0.5209288001060486 }, { "abs_diff": 2.2082934379577637, "all_logps_1": -3570.93603515625, "all_logps_1_values": -3570.936279296875, "all_logps_2": 442.4437561035156, "all_logps_2_values": 442.4437561035156, "epoch": 0.3526582954849052, "grad_norm": 32.977138977170775, "learning_rate": 8.175578849210894e-07, "logits/chosen": 2.5748469829559326, "logits/rejected": 2.677804470062256, "logps/chosen": -9.694478988647461, "logps/rejected": -10.093037605285645, "loss": 1.0398, "original_losses": 3.134640693664551, "rewards/accuracies": 0.5625, "rewards/chosen": -24.236202239990234, "rewards/margins": 0.9963935017585754, "rewards/rejected": -25.232593536376953, "step": 165, "weight": 0.49304407835006714 }, { "abs_diff": 2.007434129714966, "all_logps_1": -3220.789794921875, "all_logps_1_values": -3220.789794921875, "all_logps_2": 357.3062438964844, "all_logps_2_values": 357.3062438964844, "epoch": 0.36334491049959927, "grad_norm": 44.745926058943496, "learning_rate": 8.028950219204099e-07, "logits/chosen": 2.934321641921997, "logits/rejected": 2.8931219577789307, "logps/chosen": -11.122208595275879, "logps/rejected": -11.998506546020508, "loss": 0.9596, "original_losses": 2.2297332286834717, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -27.80552101135254, "rewards/margins": 2.1907458305358887, "rewards/rejected": -29.996265411376953, "step": 170, "weight": 0.49520620703697205 }, { "abs_diff": 2.5391037464141846, "all_logps_1": -3010.77099609375, "all_logps_1_values": -3010.77099609375, "all_logps_2": 336.26251220703125, "all_logps_2_values": 336.26251220703125, "epoch": 0.37403152551429336, "grad_norm": 50.44282847929724, "learning_rate": 7.878085328428368e-07, "logits/chosen": 2.6517717838287354, "logits/rejected": 2.698502779006958, "logps/chosen": -11.271635055541992, "logps/rejected": -12.422686576843262, "loss": 0.953, "original_losses": 2.4483256340026855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -28.1790828704834, "rewards/margins": 2.8776297569274902, "rewards/rejected": -31.056713104248047, "step": 175, "weight": 0.45478373765945435 }, { "abs_diff": 2.311084270477295, "all_logps_1": -3630.26123046875, "all_logps_1_values": -3630.26123046875, "all_logps_2": 367.6937561035156, "all_logps_2_values": 367.6937561035156, "epoch": 0.38471814052898745, "grad_norm": 54.556403188950036, "learning_rate": 7.723195175075135e-07, "logits/chosen": 2.640475273132324, "logits/rejected": 2.6134068965911865, "logps/chosen": -12.537522315979004, "logps/rejected": -13.568713188171387, "loss": 0.9044, "original_losses": 2.333768844604492, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -31.343807220458984, "rewards/margins": 2.577979564666748, "rewards/rejected": -33.921791076660156, "step": 180, "weight": 0.46685990691185 }, { "abs_diff": 2.934654951095581, "all_logps_1": -5179.39404296875, "all_logps_1_values": -5179.39404296875, "all_logps_2": 370.9624938964844, "all_logps_2_values": 370.9624938964844, "epoch": 0.39540475554368154, "grad_norm": 57.260252425269734, "learning_rate": 7.564496387029531e-07, "logits/chosen": 2.326862096786499, "logits/rejected": 2.4421494007110596, "logps/chosen": -15.849513053894043, "logps/rejected": -17.323734283447266, "loss": 0.9407, "original_losses": 2.5988547801971436, "rewards/accuracies": 0.5625, "rewards/chosen": -39.623779296875, "rewards/margins": 3.6855552196502686, "rewards/rejected": -43.30933380126953, "step": 185, "weight": 0.40877920389175415 }, { "abs_diff": 2.9652016162872314, "all_logps_1": -5177.00244140625, "all_logps_1_values": -5177.00244140625, "all_logps_2": 374.4312438964844, "all_logps_2_values": 374.4312438964844, "epoch": 0.40609137055837563, "grad_norm": 83.2255328888069, "learning_rate": 7.402210918896689e-07, "logits/chosen": 2.44303297996521, "logits/rejected": 2.4873244762420654, "logps/chosen": -15.580667495727539, "logps/rejected": -17.045442581176758, "loss": 0.9238, "original_losses": 2.6292238235473633, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -38.95166778564453, "rewards/margins": 3.661935329437256, "rewards/rejected": -42.61360168457031, "step": 190, "weight": 0.40674179792404175 }, { "abs_diff": 2.7273154258728027, "all_logps_1": -4500.06005859375, "all_logps_1_values": -4500.06005859375, "all_logps_2": 380.1312561035156, "all_logps_2_values": 380.1312561035156, "epoch": 0.4167779855730697, "grad_norm": 84.18074257984793, "learning_rate": 7.236565741578162e-07, "logits/chosen": 2.6910769939422607, "logits/rejected": 2.7326107025146484, "logps/chosen": -13.98046875, "logps/rejected": -15.500396728515625, "loss": 0.9048, "original_losses": 2.1395676136016846, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -34.95117950439453, "rewards/margins": 3.7998204231262207, "rewards/rejected": -38.75099182128906, "step": 195, "weight": 0.3992369771003723 }, { "abs_diff": 2.3771374225616455, "all_logps_1": -4996.7001953125, "all_logps_1_values": -4996.7001953125, "all_logps_2": 438.8500061035156, "all_logps_2_values": 438.8500061035156, "epoch": 0.4274646005877638, "grad_norm": 51.852682835194706, "learning_rate": 7.067792524832603e-07, "logits/chosen": 2.5128085613250732, "logits/rejected": 2.454047679901123, "logps/chosen": -13.007303237915039, "logps/rejected": -13.782841682434082, "loss": 0.9745, "original_losses": 2.816681385040283, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -32.51825714111328, "rewards/margins": 1.9388458728790283, "rewards/rejected": -34.45710372924805, "step": 200, "weight": 0.4336828589439392 }, { "abs_diff": 2.789199113845825, "all_logps_1": -5606.87744140625, "all_logps_1_values": -5606.87744140625, "all_logps_2": 413.7875061035156, "all_logps_2_values": 413.7875061035156, "epoch": 0.4381512156024579, "grad_norm": 82.65919240097834, "learning_rate": 6.896127313264642e-07, "logits/chosen": 2.4827866554260254, "logits/rejected": 2.610020399093628, "logps/chosen": -15.495327949523926, "logps/rejected": -16.71689224243164, "loss": 0.8079, "original_losses": 2.6531503200531006, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -38.738319396972656, "rewards/margins": 3.053907871246338, "rewards/rejected": -41.7922248840332, "step": 205, "weight": 0.40062981843948364 }, { "abs_diff": 3.1174449920654297, "all_logps_1": -6078.1650390625, "all_logps_1_values": -6078.1650390625, "all_logps_2": 408.83123779296875, "all_logps_2_values": 408.83123779296875, "epoch": 0.448837830617152, "grad_norm": 66.91462129577006, "learning_rate": 6.721810196195174e-07, "logits/chosen": 2.3251194953918457, "logits/rejected": 2.481720209121704, "logps/chosen": -15.918850898742676, "logps/rejected": -17.23949432373047, "loss": 0.8447, "original_losses": 2.9006853103637695, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -39.79712677001953, "rewards/margins": 3.3016059398651123, "rewards/rejected": -43.09873580932617, "step": 210, "weight": 0.37287402153015137 }, { "abs_diff": 3.3388848304748535, "all_logps_1": -6523.8935546875, "all_logps_1_values": -6523.8935546875, "all_logps_2": 405.98748779296875, "all_logps_2_values": 405.98748779296875, "epoch": 0.45952444563184613, "grad_norm": 95.8421548369589, "learning_rate": 6.545084971874736e-07, "logits/chosen": 2.866258382797241, "logits/rejected": 2.9341139793395996, "logps/chosen": -16.77628517150879, "logps/rejected": -18.90264320373535, "loss": 0.8426, "original_losses": 2.032466411590576, "rewards/accuracies": 0.75, "rewards/chosen": -41.940711975097656, "rewards/margins": 5.31589412689209, "rewards/rejected": -47.25660705566406, "step": 215, "weight": 0.35115545988082886 }, { "abs_diff": 2.8094236850738525, "all_logps_1": -4738.73046875, "all_logps_1_values": -4738.73046875, "all_logps_2": 363.98126220703125, "all_logps_2_values": 363.98126220703125, "epoch": 0.4702110606465402, "grad_norm": 112.65545034373879, "learning_rate": 6.3661988065096e-07, "logits/chosen": 2.7162396907806396, "logits/rejected": 2.835710048675537, "logps/chosen": -15.200531005859375, "logps/rejected": -15.732034683227539, "loss": 0.7804, "original_losses": 3.5092949867248535, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -38.0013313293457, "rewards/margins": 1.32875394821167, "rewards/rejected": -39.33008575439453, "step": 220, "weight": 0.4126754403114319 }, { "abs_diff": 3.209429979324341, "all_logps_1": -5642.91943359375, "all_logps_1_values": -5642.92041015625, "all_logps_2": 383.92498779296875, "all_logps_2_values": 383.92498779296875, "epoch": 0.4808976756612343, "grad_norm": 37.46832492030243, "learning_rate": 6.185401888577487e-07, "logits/chosen": 2.5830130577087402, "logits/rejected": 2.689384937286377, "logps/chosen": -15.603918075561523, "logps/rejected": -16.610340118408203, "loss": 0.9122, "original_losses": 3.4112372398376465, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -39.00979995727539, "rewards/margins": 2.5160529613494873, "rewards/rejected": -41.525856018066406, "step": 225, "weight": 0.3604838252067566 }, { "abs_diff": 3.7125911712646484, "all_logps_1": -5569.94384765625, "all_logps_1_values": -5569.94384765625, "all_logps_2": 361.3125, "all_logps_2_values": 361.3125, "epoch": 0.4915842906759284, "grad_norm": 66.55702343700871, "learning_rate": 6.002947078916364e-07, "logits/chosen": 2.3465304374694824, "logits/rejected": 2.66461181640625, "logps/chosen": -17.106571197509766, "logps/rejected": -19.080835342407227, "loss": 0.8076, "original_losses": 2.705897808074951, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -42.76642608642578, "rewards/margins": 4.935657501220703, "rewards/rejected": -47.70208740234375, "step": 230, "weight": 0.3060615658760071 }, { "abs_diff": 2.281184434890747, "all_logps_1": -3926.673828125, "all_logps_1_values": -3926.67333984375, "all_logps_2": 311.42498779296875, "all_logps_2_values": 311.42498779296875, "epoch": 0.5022709056906225, "grad_norm": 52.12193473626352, "learning_rate": 5.819089557075688e-07, "logits/chosen": 2.5927655696868896, "logits/rejected": 2.721041679382324, "logps/chosen": -14.924860000610352, "logps/rejected": -15.577176094055176, "loss": 0.8527, "original_losses": 2.7761876583099365, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -37.3121452331543, "rewards/margins": 1.6307960748672485, "rewards/rejected": -38.94294357299805, "step": 235, "weight": 0.42170318961143494 }, { "abs_diff": 2.7382442951202393, "all_logps_1": -5511.8671875, "all_logps_1_values": -5511.8671875, "all_logps_2": 424.04376220703125, "all_logps_2_values": 424.04376220703125, "epoch": 0.5129575207053166, "grad_norm": 59.31175783914156, "learning_rate": 5.634086464424742e-07, "logits/chosen": 2.750415086746216, "logits/rejected": 2.8377902507781982, "logps/chosen": -15.228363037109375, "logps/rejected": -16.618165969848633, "loss": 0.8222, "original_losses": 2.4813647270202637, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -38.07091522216797, "rewards/margins": 3.474503993988037, "rewards/rejected": -41.545413970947266, "step": 240, "weight": 0.43811964988708496 }, { "abs_diff": 3.254149913787842, "all_logps_1": -5742.85595703125, "all_logps_1_values": -5742.85546875, "all_logps_2": 412.4624938964844, "all_logps_2_values": 412.4624938964844, "epoch": 0.5236441357200107, "grad_norm": 54.9226284927014, "learning_rate": 5.448196544517167e-07, "logits/chosen": 2.565314531326294, "logits/rejected": 2.691755533218384, "logps/chosen": -15.381324768066406, "logps/rejected": -17.23483657836914, "loss": 0.7997, "original_losses": 2.286261558532715, "rewards/accuracies": 0.75, "rewards/chosen": -38.45330810546875, "rewards/margins": 4.633780479431152, "rewards/rejected": -43.08708953857422, "step": 245, "weight": 0.3790872097015381 }, { "abs_diff": 3.364607334136963, "all_logps_1": -5477.4482421875, "all_logps_1_values": -5477.4482421875, "all_logps_2": 341.70623779296875, "all_logps_2_values": 341.70623779296875, "epoch": 0.5343307507347048, "grad_norm": 75.70050581279018, "learning_rate": 5.26167978121472e-07, "logits/chosen": 2.6822657585144043, "logits/rejected": 2.7521121501922607, "logps/chosen": -16.76608657836914, "logps/rejected": -19.173168182373047, "loss": 0.8369, "original_losses": 1.7328109741210938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -41.91521453857422, "rewards/margins": 6.017703056335449, "rewards/rejected": -47.932918548583984, "step": 250, "weight": 0.3458004593849182 }, { "abs_diff": 3.0323586463928223, "all_logps_1": -6443.626953125, "all_logps_1_values": -6443.62646484375, "all_logps_2": 363.6000061035156, "all_logps_2_values": 363.6000061035156, "epoch": 0.5450173657493989, "grad_norm": 39.687795704366174, "learning_rate": 5.074797035076318e-07, "logits/chosen": 2.954530954360962, "logits/rejected": 2.9405295848846436, "logps/chosen": -18.485279083251953, "logps/rejected": -19.909687042236328, "loss": 0.7436, "original_losses": 2.6259872913360596, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -46.21319580078125, "rewards/margins": 3.561020612716675, "rewards/rejected": -49.77421569824219, "step": 255, "weight": 0.37864193320274353 }, { "abs_diff": 3.276740312576294, "all_logps_1": -8267.275390625, "all_logps_1_values": -8267.275390625, "all_logps_2": 393.79376220703125, "all_logps_2_values": 393.79376220703125, "epoch": 0.555703980764093, "grad_norm": 82.01295751328553, "learning_rate": 4.887809678520975e-07, "logits/chosen": 3.072216749191284, "logits/rejected": 3.1636574268341064, "logps/chosen": -20.29796600341797, "logps/rejected": -22.58323860168457, "loss": 0.767, "original_losses": 1.7455909252166748, "rewards/accuracies": 0.6875, "rewards/chosen": -50.74491882324219, "rewards/margins": 5.713181972503662, "rewards/rejected": -56.458106994628906, "step": 260, "weight": 0.35711461305618286 }, { "abs_diff": 3.182936429977417, "all_logps_1": -9216.587890625, "all_logps_1_values": -9216.5869140625, "all_logps_2": 407.7562561035156, "all_logps_2_values": 407.7562561035156, "epoch": 0.566390595778787, "grad_norm": 52.240919124363245, "learning_rate": 4.700979230274829e-07, "logits/chosen": 3.0337119102478027, "logits/rejected": 3.0206868648529053, "logps/chosen": -23.038707733154297, "logps/rejected": -23.99751091003418, "loss": 0.807, "original_losses": 3.4282360076904297, "rewards/accuracies": 0.625, "rewards/chosen": -57.596778869628906, "rewards/margins": 2.3969998359680176, "rewards/rejected": -59.9937744140625, "step": 265, "weight": 0.34545254707336426 }, { "abs_diff": 3.006873607635498, "all_logps_1": -10153.31640625, "all_logps_1_values": -10153.3154296875, "all_logps_2": 477.38751220703125, "all_logps_2_values": 477.38751220703125, "epoch": 0.5770772107934812, "grad_norm": 59.52695646189972, "learning_rate": 4.514566989613559e-07, "logits/chosen": 2.972503185272217, "logits/rejected": 2.9690961837768555, "logps/chosen": -22.136503219604492, "logps/rejected": -23.38858413696289, "loss": 0.8091, "original_losses": 2.8519082069396973, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -55.34125900268555, "rewards/margins": 3.1301934719085693, "rewards/rejected": -58.47145462036133, "step": 270, "weight": 0.36154988408088684 }, { "abs_diff": 2.563995599746704, "all_logps_1": -7391.75, "all_logps_1_values": -7391.75, "all_logps_2": 375.40625, "all_logps_2_values": 375.40625, "epoch": 0.5877638258081752, "grad_norm": 59.32000543668621, "learning_rate": 4.328833670911724e-07, "logits/chosen": 3.481792449951172, "logits/rejected": 3.5533995628356934, "logps/chosen": -21.077594757080078, "logps/rejected": -22.37049674987793, "loss": 0.7438, "original_losses": 2.241507053375244, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -52.6939811706543, "rewards/margins": 3.2322616577148438, "rewards/rejected": -55.926246643066406, "step": 275, "weight": 0.41661015152931213 }, { "abs_diff": 3.315411329269409, "all_logps_1": -7719.34619140625, "all_logps_1_values": -7719.34521484375, "all_logps_2": 439.35626220703125, "all_logps_2_values": 439.35626220703125, "epoch": 0.5984504408228694, "grad_norm": 53.61685628912313, "learning_rate": 4.144039039010124e-07, "logits/chosen": 2.6844732761383057, "logits/rejected": 2.87386417388916, "logps/chosen": -17.859844207763672, "logps/rejected": -19.173076629638672, "loss": 0.7914, "original_losses": 3.27254056930542, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -44.64960861206055, "rewards/margins": 3.283079147338867, "rewards/rejected": -47.93268966674805, "step": 280, "weight": 0.38690507411956787 }, { "abs_diff": 2.917543649673462, "all_logps_1": -6426.8310546875, "all_logps_1_values": -6426.8310546875, "all_logps_2": 355.16876220703125, "all_logps_2_values": 355.16876220703125, "epoch": 0.6091370558375635, "grad_norm": 55.70128923603701, "learning_rate": 3.960441545911204e-07, "logits/chosen": 3.0214133262634277, "logits/rejected": 3.1276047229766846, "logps/chosen": -20.0152530670166, "logps/rejected": -20.51242446899414, "loss": 0.8001, "original_losses": 3.778569459915161, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -50.03813552856445, "rewards/margins": 1.2429269552230835, "rewards/rejected": -51.28105926513672, "step": 285, "weight": 0.4056159555912018 }, { "abs_diff": 3.5806915760040283, "all_logps_1": -6845.4326171875, "all_logps_1_values": -6845.4326171875, "all_logps_2": 341.95001220703125, "all_logps_2_values": 341.95001220703125, "epoch": 0.6198236708522575, "grad_norm": 67.43658438729601, "learning_rate": 3.778297969310529e-07, "logits/chosen": 2.87160325050354, "logits/rejected": 2.953885555267334, "logps/chosen": -19.99938201904297, "logps/rejected": -22.214576721191406, "loss": 0.8043, "original_losses": 2.356289863586426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -49.99845504760742, "rewards/margins": 5.537986755371094, "rewards/rejected": -55.53644561767578, "step": 290, "weight": 0.3736080527305603 }, { "abs_diff": 3.2340712547302246, "all_logps_1": -7549.24755859375, "all_logps_1_values": -7549.24755859375, "all_logps_2": 351.07501220703125, "all_logps_2_values": 351.07501220703125, "epoch": 0.6305102858669517, "grad_norm": 48.7199759637811, "learning_rate": 3.5978630534699865e-07, "logits/chosen": 2.5181379318237305, "logits/rejected": 2.6238226890563965, "logps/chosen": -21.65777587890625, "logps/rejected": -23.368385314941406, "loss": 0.8187, "original_losses": 2.4767355918884277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.144432067871094, "rewards/margins": 4.276528835296631, "rewards/rejected": -58.42096710205078, "step": 295, "weight": 0.35286107659339905 }, { "abs_diff": 3.32385516166687, "all_logps_1": -8850.7470703125, "all_logps_1_values": -8850.748046875, "all_logps_2": 415.8999938964844, "all_logps_2_values": 415.8999938964844, "epoch": 0.6411969008816457, "grad_norm": 40.177069639353974, "learning_rate": 3.4193891529348795e-07, "logits/chosen": 2.7022032737731934, "logits/rejected": 2.7918949127197266, "logps/chosen": -22.0867862701416, "logps/rejected": -24.649303436279297, "loss": 0.7237, "original_losses": 1.4938082695007324, "rewards/accuracies": 0.75, "rewards/chosen": -55.21696090698242, "rewards/margins": 6.4062957763671875, "rewards/rejected": -61.623252868652344, "step": 300, "weight": 0.38917768001556396 }, { "abs_diff": 2.9255619049072266, "all_logps_1": -7401.28662109375, "all_logps_1_values": -7401.2861328125, "all_logps_2": 407.0562438964844, "all_logps_2_values": 407.0562438964844, "epoch": 0.6518835158963399, "grad_norm": 48.44087105424344, "learning_rate": 3.243125879593286e-07, "logits/chosen": 2.6635046005249023, "logits/rejected": 2.777791976928711, "logps/chosen": -18.8937931060791, "logps/rejected": -20.48404312133789, "loss": 0.8144, "original_losses": 2.4159512519836426, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -47.23448181152344, "rewards/margins": 3.975621461868286, "rewards/rejected": -51.21010208129883, "step": 305, "weight": 0.4257276952266693 }, { "abs_diff": 3.013671875, "all_logps_1": -7221.7607421875, "all_logps_1_values": -7221.76171875, "all_logps_2": 377.16876220703125, "all_logps_2_values": 377.16876220703125, "epoch": 0.6625701309110339, "grad_norm": 59.965292421288716, "learning_rate": 3.069319753571269e-07, "logits/chosen": 2.7733490467071533, "logits/rejected": 2.600106954574585, "logps/chosen": -19.796558380126953, "logps/rejected": -20.72552490234375, "loss": 0.8117, "original_losses": 3.373765230178833, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -49.491390228271484, "rewards/margins": 2.3224196434020996, "rewards/rejected": -51.813812255859375, "step": 310, "weight": 0.3705739974975586 }, { "abs_diff": 3.0523111820220947, "all_logps_1": -7815.6552734375, "all_logps_1_values": -7815.65478515625, "all_logps_2": 449.16876220703125, "all_logps_2_values": 449.16876220703125, "epoch": 0.673256745925728, "grad_norm": 52.38266043751792, "learning_rate": 2.898213858452173e-07, "logits/chosen": 2.1578516960144043, "logits/rejected": 2.247980833053589, "logps/chosen": -17.26466941833496, "logps/rejected": -18.508235931396484, "loss": 0.7937, "original_losses": 2.871872901916504, "rewards/accuracies": 0.6875, "rewards/chosen": -43.16167449951172, "rewards/margins": 3.1089208126068115, "rewards/rejected": -46.270591735839844, "step": 315, "weight": 0.3665739893913269 }, { "abs_diff": 3.318554639816284, "all_logps_1": -6473.89013671875, "all_logps_1_values": -6473.890625, "all_logps_2": 359.54376220703125, "all_logps_2_values": 359.54376220703125, "epoch": 0.6839433609404221, "grad_norm": 87.91813204561389, "learning_rate": 2.730047501302266e-07, "logits/chosen": 2.3339014053344727, "logits/rejected": 2.4213125705718994, "logps/chosen": -17.509052276611328, "logps/rejected": -19.367351531982422, "loss": 0.7705, "original_losses": 2.49141263961792, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -43.77263259887695, "rewards/margins": 4.645747184753418, "rewards/rejected": -48.41838455200195, "step": 320, "weight": 0.37379634380340576 }, { "abs_diff": 4.029627799987793, "all_logps_1": -8260.576171875, "all_logps_1_values": -8260.576171875, "all_logps_2": 420.7562561035156, "all_logps_2_values": 420.7562561035156, "epoch": 0.6946299759551162, "grad_norm": 56.877689091030156, "learning_rate": 2.5650558779781635e-07, "logits/chosen": 2.8901479244232178, "logits/rejected": 2.8577167987823486, "logps/chosen": -18.898571014404297, "logps/rejected": -21.253376007080078, "loss": 0.7293, "original_losses": 2.648833751678467, "rewards/accuracies": 0.625, "rewards/chosen": -47.24642562866211, "rewards/margins": 5.887020111083984, "rewards/rejected": -53.133445739746094, "step": 325, "weight": 0.299586683511734 }, { "abs_diff": 3.5890209674835205, "all_logps_1": -8075.91650390625, "all_logps_1_values": -8075.91552734375, "all_logps_2": 370.53125, "all_logps_2_values": 370.53125, "epoch": 0.7053165909698104, "grad_norm": 51.745088875170836, "learning_rate": 2.403469744184154e-07, "logits/chosen": 2.560868978500366, "logits/rejected": 2.73579740524292, "logps/chosen": -20.837478637695312, "logps/rejected": -23.191274642944336, "loss": 0.8048, "original_losses": 2.0950331687927246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -52.09369659423828, "rewards/margins": 5.884491443634033, "rewards/rejected": -57.978187561035156, "step": 330, "weight": 0.35120078921318054 }, { "abs_diff": 3.3162055015563965, "all_logps_1": -8510.986328125, "all_logps_1_values": -8510.9873046875, "all_logps_2": 404.1875, "all_logps_2_values": 404.1875, "epoch": 0.7160032059845044, "grad_norm": 69.38405615517823, "learning_rate": 2.2455150927394878e-07, "logits/chosen": 2.6478374004364014, "logits/rejected": 2.565058946609497, "logps/chosen": -20.254060745239258, "logps/rejected": -22.03819465637207, "loss": 0.7845, "original_losses": 2.5941619873046875, "rewards/accuracies": 0.5625, "rewards/chosen": -50.63515090942383, "rewards/margins": 4.460334300994873, "rewards/rejected": -55.095489501953125, "step": 335, "weight": 0.3666679263114929 }, { "abs_diff": 3.677370548248291, "all_logps_1": -8691.5263671875, "all_logps_1_values": -8691.5263671875, "all_logps_2": 381.01873779296875, "all_logps_2_values": 381.01873779296875, "epoch": 0.7266898209991985, "grad_norm": 84.25483121877998, "learning_rate": 2.0914128375069722e-07, "logits/chosen": 2.709319829940796, "logits/rejected": 2.7781405448913574, "logps/chosen": -21.70474624633789, "logps/rejected": -23.93856430053711, "loss": 0.7836, "original_losses": 2.420710802078247, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -54.261871337890625, "rewards/margins": 5.5845465660095215, "rewards/rejected": -59.84641647338867, "step": 340, "weight": 0.35216349363327026 }, { "abs_diff": 2.6053452491760254, "all_logps_1": -8825.68359375, "all_logps_1_values": -8825.68359375, "all_logps_2": 365.8812561035156, "all_logps_2_values": 365.8812561035156, "epoch": 0.7373764360138926, "grad_norm": 70.75060453919657, "learning_rate": 1.9413785044249676e-07, "logits/chosen": 2.845489501953125, "logits/rejected": 2.95839262008667, "logps/chosen": -24.01942253112793, "logps/rejected": -25.074626922607422, "loss": 0.7906, "original_losses": 2.7006657123565674, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -60.048553466796875, "rewards/margins": 2.63801908493042, "rewards/rejected": -62.68656539916992, "step": 345, "weight": 0.4165709912776947 }, { "abs_diff": 3.481846570968628, "all_logps_1": -9110.2353515625, "all_logps_1_values": -9110.2353515625, "all_logps_2": 392.26251220703125, "all_logps_2_values": 392.26251220703125, "epoch": 0.7480630510285867, "grad_norm": 50.64757547547787, "learning_rate": 1.7956219300748792e-07, "logits/chosen": 2.579031467437744, "logits/rejected": 2.5901365280151367, "logps/chosen": -21.98320198059082, "logps/rejected": -24.78140640258789, "loss": 0.7388, "original_losses": 1.231533408164978, "rewards/accuracies": 0.8125, "rewards/chosen": -54.9580078125, "rewards/margins": 6.995513916015625, "rewards/rejected": -61.953514099121094, "step": 350, "weight": 0.3367912769317627 }, { "abs_diff": 3.284003496170044, "all_logps_1": -9058.169921875, "all_logps_1_values": -9058.169921875, "all_logps_2": 396.1812438964844, "all_logps_2_values": 396.1812438964844, "epoch": 0.7587496660432808, "grad_norm": 74.67147548055407, "learning_rate": 1.6543469682057104e-07, "logits/chosen": 2.3805794715881348, "logits/rejected": 2.5762991905212402, "logps/chosen": -21.627700805664062, "logps/rejected": -23.67769432067871, "loss": 0.7775, "original_losses": 2.081150531768799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -54.06926345825195, "rewards/margins": 5.124981880187988, "rewards/rejected": -59.194244384765625, "step": 355, "weight": 0.333683043718338 }, { "abs_diff": 3.9802608489990234, "all_logps_1": -8140.62646484375, "all_logps_1_values": -8140.625, "all_logps_2": 368.1812438964844, "all_logps_2_values": 368.1812438964844, "epoch": 0.7694362810579749, "grad_norm": 58.567962370545146, "learning_rate": 1.5177512046261666e-07, "logits/chosen": 2.5346484184265137, "logits/rejected": 2.3816428184509277, "logps/chosen": -22.101619720458984, "logps/rejected": -24.49993896484375, "loss": 0.6993, "original_losses": 2.540489435195923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -55.254051208496094, "rewards/margins": 5.995795249938965, "rewards/rejected": -61.249847412109375, "step": 360, "weight": 0.34169501066207886 }, { "abs_diff": 3.0081470012664795, "all_logps_1": -7452.68115234375, "all_logps_1_values": -7452.68115234375, "all_logps_2": 344.38751220703125, "all_logps_2_values": 344.38751220703125, "epoch": 0.7801228960726689, "grad_norm": 83.23124267198439, "learning_rate": 1.3860256808630427e-07, "logits/chosen": 2.4369776248931885, "logits/rejected": 2.584667682647705, "logps/chosen": -21.06991195678711, "logps/rejected": -22.77521324157715, "loss": 0.7695, "original_losses": 2.1701793670654297, "rewards/accuracies": 0.6875, "rewards/chosen": -52.674774169921875, "rewards/margins": 4.263253211975098, "rewards/rejected": -56.93803024291992, "step": 365, "weight": 0.345781534910202 }, { "abs_diff": 3.4343185424804688, "all_logps_1": -9116.8271484375, "all_logps_1_values": -9116.826171875, "all_logps_2": 410.375, "all_logps_2_values": 410.375, "epoch": 0.7908095110873631, "grad_norm": 70.00940117238335, "learning_rate": 1.2593546269723647e-07, "logits/chosen": 2.4547030925750732, "logits/rejected": 2.5984954833984375, "logps/chosen": -21.283931732177734, "logps/rejected": -23.039413452148438, "loss": 0.7116, "original_losses": 2.7036542892456055, "rewards/accuracies": 0.625, "rewards/chosen": -53.20983123779297, "rewards/margins": 4.388695240020752, "rewards/rejected": -57.59852981567383, "step": 370, "weight": 0.3490845561027527 }, { "abs_diff": 3.5221400260925293, "all_logps_1": -8307.474609375, "all_logps_1_values": -8307.4755859375, "all_logps_2": 382.3999938964844, "all_logps_2_values": 382.3999938964844, "epoch": 0.8014961261020572, "grad_norm": 46.47990793449235, "learning_rate": 1.1379152038770029e-07, "logits/chosen": 2.5157063007354736, "logits/rejected": 2.4793992042541504, "logps/chosen": -20.3429012298584, "logps/rejected": -21.467952728271484, "loss": 0.836, "original_losses": 3.6247520446777344, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -50.85725784301758, "rewards/margins": 2.8126296997070312, "rewards/rejected": -53.669883728027344, "step": 375, "weight": 0.3106473684310913 }, { "abs_diff": 3.1884102821350098, "all_logps_1": -7604.51953125, "all_logps_1_values": -7604.5185546875, "all_logps_2": 386.5625, "all_logps_2_values": 386.5625, "epoch": 0.8121827411167513, "grad_norm": 53.33016210631404, "learning_rate": 1.0218772555910954e-07, "logits/chosen": 2.299121141433716, "logits/rejected": 2.4894156455993652, "logps/chosen": -18.67618179321289, "logps/rejected": -20.802087783813477, "loss": 0.7253, "original_losses": 1.8635917901992798, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -46.690452575683594, "rewards/margins": 5.314764499664307, "rewards/rejected": -52.005226135253906, "step": 380, "weight": 0.36687955260276794 }, { "abs_diff": 3.728355884552002, "all_logps_1": -6403.2841796875, "all_logps_1_values": -6403.2841796875, "all_logps_2": 352.4937438964844, "all_logps_2_values": 352.4937438964844, "epoch": 0.8228693561314454, "grad_norm": 51.015747481657996, "learning_rate": 9.114030716778432e-08, "logits/chosen": 2.5289080142974854, "logits/rejected": 2.568324565887451, "logps/chosen": -17.892498016357422, "logps/rejected": -20.8332462310791, "loss": 0.6978, "original_losses": 1.4315834045410156, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -44.73124694824219, "rewards/margins": 7.351869106292725, "rewards/rejected": -52.0831184387207, "step": 385, "weight": 0.3369835317134857 }, { "abs_diff": 3.4457297325134277, "all_logps_1": -7086.70947265625, "all_logps_1_values": -7086.70849609375, "all_logps_2": 400.7562561035156, "all_logps_2_values": 400.7562561035156, "epoch": 0.8335559711461394, "grad_norm": 68.15400742768429, "learning_rate": 8.066471602728803e-08, "logits/chosen": 2.300518751144409, "logits/rejected": 2.432492256164551, "logps/chosen": -17.285266876220703, "logps/rejected": -19.280744552612305, "loss": 0.6869, "original_losses": 2.3130502700805664, "rewards/accuracies": 0.75, "rewards/chosen": -43.213172912597656, "rewards/margins": 4.9886932373046875, "rewards/rejected": -48.201866149902344, "step": 390, "weight": 0.337992399930954 }, { "abs_diff": 2.9501354694366455, "all_logps_1": -7602.40478515625, "all_logps_1_values": -7602.40380859375, "all_logps_2": 396.3125, "all_logps_2_values": 396.3125, "epoch": 0.8442425861608336, "grad_norm": 72.89829906287879, "learning_rate": 7.077560319906694e-08, "logits/chosen": 2.815917491912842, "logits/rejected": 3.0646049976348877, "logps/chosen": -17.960046768188477, "logps/rejected": -19.63981056213379, "loss": 0.7686, "original_losses": 2.1916909217834473, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -44.900108337402344, "rewards/margins": 4.1994123458862305, "rewards/rejected": -49.099525451660156, "step": 395, "weight": 0.36895015835762024 }, { "abs_diff": 3.306037187576294, "all_logps_1": -6128.6064453125, "all_logps_1_values": -6128.6064453125, "all_logps_2": 348.07501220703125, "all_logps_2_values": 348.07501220703125, "epoch": 0.8549292011755276, "grad_norm": 49.93516351014214, "learning_rate": 6.148679950161672e-08, "logits/chosen": 2.5622057914733887, "logits/rejected": 2.715359926223755, "logps/chosen": -18.067874908447266, "logps/rejected": -20.04085922241211, "loss": 0.7506, "original_losses": 2.2437596321105957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -45.169681549072266, "rewards/margins": 4.932468891143799, "rewards/rejected": -50.10215377807617, "step": 400, "weight": 0.34681177139282227 }, { "epoch": 0.8549292011755276, "eval_abs_diff": 3.175931930541992, "eval_all_logps_1": -7614.6904296875, "eval_all_logps_1_values": -7614.69091796875, "eval_all_logps_2": 414.86090087890625, "eval_all_logps_2_values": 414.86090087890625, "eval_logits/chosen": 1.7177369594573975, "eval_logits/rejected": 1.830857753753662, "eval_logps/chosen": -18.158353805541992, "eval_logps/rejected": -20.146547317504883, "eval_loss": 0.752778172492981, "eval_original_losses": 2.049124002456665, "eval_rewards/accuracies": 0.6975806355476379, "eval_rewards/chosen": -45.3958854675293, "eval_rewards/margins": 4.970486640930176, "eval_rewards/rejected": -50.36636734008789, "eval_runtime": 70.2236, "eval_samples_per_second": 27.925, "eval_steps_per_second": 0.883, "eval_weight": 0.37132638692855835, "step": 400 }, { "abs_diff": 3.7374179363250732, "all_logps_1": -6704.875, "all_logps_1_values": -6704.875, "all_logps_2": 385.4375, "all_logps_2_values": 385.4375, "epoch": 0.8656158161902218, "grad_norm": 69.66297582257639, "learning_rate": 5.2811296166831666e-08, "logits/chosen": 2.536898612976074, "logits/rejected": 2.8442349433898926, "logps/chosen": -17.0179443359375, "logps/rejected": -19.512527465820312, "loss": 0.6907, "original_losses": 1.9623138904571533, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -42.544864654541016, "rewards/margins": 6.236458778381348, "rewards/rejected": -48.78131866455078, "step": 405, "weight": 0.320218563079834 }, { "abs_diff": 3.427241802215576, "all_logps_1": -6360.1455078125, "all_logps_1_values": -6360.1455078125, "all_logps_2": 339.8062438964844, "all_logps_2_values": 339.8062438964844, "epoch": 0.8763024312049158, "grad_norm": 61.75715741585555, "learning_rate": 4.4761226670592066e-08, "logits/chosen": 2.682762861251831, "logits/rejected": 2.7268879413604736, "logps/chosen": -18.33367347717285, "logps/rejected": -20.431079864501953, "loss": 0.7588, "original_losses": 2.1594674587249756, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -45.83418273925781, "rewards/margins": 5.243517875671387, "rewards/rejected": -51.07769775390625, "step": 410, "weight": 0.32417041063308716 }, { "abs_diff": 3.5729141235351562, "all_logps_1": -8468.05078125, "all_logps_1_values": -8468.05078125, "all_logps_2": 414.2124938964844, "all_logps_2_values": 414.2124938964844, "epoch": 0.88698904621961, "grad_norm": 40.72517812799497, "learning_rate": 3.734784976300165e-08, "logits/chosen": 2.8361315727233887, "logits/rejected": 2.8616833686828613, "logps/chosen": -19.978229522705078, "logps/rejected": -22.11844825744629, "loss": 0.7242, "original_losses": 2.3812079429626465, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -49.94557189941406, "rewards/margins": 5.3505539894104, "rewards/rejected": -55.29612350463867, "step": 415, "weight": 0.3600180447101593 }, { "abs_diff": 2.8872458934783936, "all_logps_1": -8678.85546875, "all_logps_1_values": -8678.85546875, "all_logps_2": 427.64373779296875, "all_logps_2_values": 427.64373779296875, "epoch": 0.897675661234304, "grad_norm": 40.225954100303696, "learning_rate": 3.058153372200695e-08, "logits/chosen": 2.452263355255127, "logits/rejected": 2.515206813812256, "logps/chosen": -20.152559280395508, "logps/rejected": -21.298845291137695, "loss": 0.7959, "original_losses": 2.8674798011779785, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -50.38140106201172, "rewards/margins": 2.865709066390991, "rewards/rejected": -53.247108459472656, "step": 420, "weight": 0.39061683416366577 }, { "abs_diff": 4.009498119354248, "all_logps_1": -7007.01708984375, "all_logps_1_values": -7007.01708984375, "all_logps_2": 359.6187438964844, "all_logps_2_values": 359.6187438964844, "epoch": 0.9083622762489981, "grad_norm": 54.351457754994804, "learning_rate": 2.4471741852423233e-08, "logits/chosen": 2.617743968963623, "logits/rejected": 2.7704989910125732, "logps/chosen": -19.43728256225586, "logps/rejected": -21.93575668334961, "loss": 0.7422, "original_losses": 2.389147996902466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -48.59320831298828, "rewards/margins": 6.246188163757324, "rewards/rejected": -54.83939743041992, "step": 425, "weight": 0.31762319803237915 }, { "abs_diff": 2.755589723587036, "all_logps_1": -8104.44921875, "all_logps_1_values": -8104.44921875, "all_logps_2": 428.79376220703125, "all_logps_2_values": 428.79376220703125, "epoch": 0.9190488912636923, "grad_norm": 45.34333002111428, "learning_rate": 1.9027019250647036e-08, "logits/chosen": 2.6327333450317383, "logits/rejected": 2.7319021224975586, "logps/chosen": -18.6940975189209, "logps/rejected": -20.32192039489746, "loss": 0.6933, "original_losses": 2.010368824005127, "rewards/accuracies": 0.6875, "rewards/chosen": -46.735252380371094, "rewards/margins": 4.069557189941406, "rewards/rejected": -50.80480194091797, "step": 430, "weight": 0.38695794343948364 }, { "abs_diff": 3.834909439086914, "all_logps_1": -7406.4482421875, "all_logps_1_values": -7406.44775390625, "all_logps_2": 382.15625, "all_logps_2_values": 382.15625, "epoch": 0.9297355062783863, "grad_norm": 103.89987589364694, "learning_rate": 1.4254980853566246e-08, "logits/chosen": 2.688000440597534, "logits/rejected": 2.763110399246216, "logps/chosen": -19.011985778808594, "logps/rejected": -21.563823699951172, "loss": 0.7401, "original_losses": 2.1428942680358887, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -47.52996063232422, "rewards/margins": 6.379598617553711, "rewards/rejected": -53.90956497192383, "step": 435, "weight": 0.35418570041656494 }, { "abs_diff": 3.49601411819458, "all_logps_1": -7640.515625, "all_logps_1_values": -7640.515625, "all_logps_2": 394.25, "all_logps_2_values": 394.25, "epoch": 0.9404221212930804, "grad_norm": 66.9604311531267, "learning_rate": 1.016230078838226e-08, "logits/chosen": 2.6405506134033203, "logits/rejected": 2.7150299549102783, "logps/chosen": -18.938282012939453, "logps/rejected": -21.01675796508789, "loss": 0.7279, "original_losses": 2.3662502765655518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -47.345703125, "rewards/margins": 5.196188449859619, "rewards/rejected": -52.541893005371094, "step": 440, "weight": 0.35034170746803284 }, { "abs_diff": 3.1276192665100098, "all_logps_1": -9211.677734375, "all_logps_1_values": -9211.6787109375, "all_logps_2": 462.4624938964844, "all_logps_2_values": 462.4624938964844, "epoch": 0.9511087363077745, "grad_norm": 62.83164635980714, "learning_rate": 6.754703038239329e-09, "logits/chosen": 2.502159357070923, "logits/rejected": 2.6519925594329834, "logps/chosen": -18.46548080444336, "logps/rejected": -20.194454193115234, "loss": 0.6978, "original_losses": 2.2807629108428955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -46.16370391845703, "rewards/margins": 4.322434902191162, "rewards/rejected": -50.48613739013672, "step": 445, "weight": 0.35711297392845154 }, { "abs_diff": 3.5259463787078857, "all_logps_1": -7040.4140625, "all_logps_1_values": -7040.4140625, "all_logps_2": 358.57501220703125, "all_logps_2_values": 358.57501220703125, "epoch": 0.9617953513224686, "grad_norm": 58.22216553617623, "learning_rate": 4.036953436716895e-09, "logits/chosen": 2.841308832168579, "logits/rejected": 2.788696050643921, "logps/chosen": -19.402172088623047, "logps/rejected": -21.435121536254883, "loss": 0.6648, "original_losses": 2.3494513034820557, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -48.505435943603516, "rewards/margins": 5.082365989685059, "rewards/rejected": -53.587799072265625, "step": 450, "weight": 0.32446950674057007 }, { "abs_diff": 2.527660369873047, "all_logps_1": -7083.1259765625, "all_logps_1_values": -7083.1259765625, "all_logps_2": 354.76873779296875, "all_logps_2_values": 354.76873779296875, "epoch": 0.9724819663371627, "grad_norm": 63.64964025419041, "learning_rate": 2.0128530023804656e-09, "logits/chosen": 2.5407052040100098, "logits/rejected": 2.6334285736083984, "logps/chosen": -19.97518539428711, "logps/rejected": -21.144289016723633, "loss": 0.773, "original_losses": 2.366673469543457, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -49.937965393066406, "rewards/margins": 2.922760486602783, "rewards/rejected": -52.86072540283203, "step": 455, "weight": 0.4015112519264221 }, { "abs_diff": 3.792357921600342, "all_logps_1": -6872.0908203125, "all_logps_1_values": -6872.0908203125, "all_logps_2": 352.35626220703125, "all_logps_2_values": 352.35626220703125, "epoch": 0.9831685813518568, "grad_norm": 70.18502240580426, "learning_rate": 6.852326227130833e-10, "logits/chosen": 2.659250020980835, "logits/rejected": 2.507812976837158, "logps/chosen": -20.08974266052246, "logps/rejected": -22.00864028930664, "loss": 0.7596, "original_losses": 3.0322279930114746, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -50.22435760498047, "rewards/margins": 4.797248840332031, "rewards/rejected": -55.0216064453125, "step": 460, "weight": 0.3797384202480316 }, { "abs_diff": 3.1223583221435547, "all_logps_1": -7477.0185546875, "all_logps_1_values": -7477.0185546875, "all_logps_2": 386.9937438964844, "all_logps_2_values": 386.9937438964844, "epoch": 0.9938551963665508, "grad_norm": 70.11026953873642, "learning_rate": 5.594909486328348e-11, "logits/chosen": 2.367159366607666, "logits/rejected": 2.6166296005249023, "logps/chosen": -18.468345642089844, "logps/rejected": -20.6806697845459, "loss": 0.6765, "original_losses": 1.560880422592163, "rewards/accuracies": 0.8125, "rewards/chosen": -46.17086410522461, "rewards/margins": 5.5308074951171875, "rewards/rejected": -51.7016716003418, "step": 465, "weight": 0.36222249269485474 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.9884350126254227, "train_runtime": 7236.0008, "train_samples_per_second": 8.275, "train_steps_per_second": 0.065 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }